示例#1
0
    def test_match_multiple(self):

        ms = MotifSet(preload_motifs="default")
        ms = ms.filter({'database': ["jaspar_vertebrates"], 'name': ["MA0139.1.CTCF"]}, search="inexact")

        self.assertEqual(len(ms), 1)

        motif = ms.get_motif_list(1, 0.0001)[0]

        scanner = scan.Scanner(7)

        pssm_list, thresholds = [], []

        thresholds.append(motif.threshold)
        thresholds.append(motif.threshold)
        pssm_list.append(motif.pssm)
        pssm_list.append(motif.pssm_rc)

        bg = tools.flat_bg(4)
        scanner.set_motifs(pssm_list, bg, thresholds)

        genomic_region = GenomicRegion("chr1", 0, 5022)

        # Reading sequence associated to genomic_region
        sequence = str(self.genome_file.fetch(genomic_region.chrom, genomic_region.initial, genomic_region.final))

        grs = match_multiple(scanner, [motif], sequence, genomic_region)

        self.assertSequenceEqual(grs.sequences,
                                 [GenomicRegion("chr1", 4270, 4289, name="MA0139.1.CTCF", orientation="+"),
                                  GenomicRegion("chr1", 4180, 4199, name="MA0139.1.CTCF", orientation="-")])
    def test_match_multiple(self):
        dirname = os.path.dirname(__file__)
        jasp_dir = "../../data/motifs/jaspar_vertebrates/"

        scanner = scan.Scanner(7)

        pssm_list = []
        thresholds = []

        motif = Motif(os.path.join(dirname, jasp_dir, "MA0139.1.CTCF.pwm"), 1, 0.0001, None)

        thresholds.append(motif.threshold)
        thresholds.append(motif.threshold_rc)
        pssm_list.append(motif.pssm)
        pssm_list.append(motif.pssm_rc)

        bg = tools.flat_bg(4)
        scanner.set_motifs(pssm_list, bg, thresholds)

        genomic_region = GenomicRegion("chr1", 0, 5022)

        # Reading sequence associated to genomic_region
        sequence = str(self.genome_file.fetch(genomic_region.chrom, genomic_region.initial, genomic_region.final))

        grs = match_multiple(scanner, [motif], sequence, genomic_region)

        self.assertSequenceEqual(grs.sequences,
                                 [GenomicRegion("chr1", 4270, 4289, name="MA0139.1.CTCF", orientation="+"),
                                  GenomicRegion("chr1", 4180, 4199, name="MA0139.1.CTCF", orientation="-")])
示例#3
0
def main(args):
    """
    Performs motif matching.
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing Error Handler
    err = ErrorHandler()

    # Additional Parameters
    matching_folder_name = "match"
    random_region_name = "random_regions"

    filter_values = parse_filter(args.filter)

    ###################################################################################################
    # Initializations
    ###################################################################################################

    # Output folder
    if args.output_location:
        output_location = args.output_location
    else:
        output_location = npath(matching_folder_name)
    print(">> output location:", output_location)

    # Default genomic data
    genome_data = GenomeData(args.organism)

    print(">> genome:", genome_data.organism)
    print(">> pseudocounts:", args.pseudocounts)
    print(">> fpr threshold:", args.fpr)

    ###################################################################################################
    # Reading Input Regions
    ###################################################################################################

    genomic_regions_dict = {}

    # get experimental matrix, if available
    if args.input_matrix:
        try:
            exp_matrix = ExperimentalMatrix()
            exp_matrix.read(args.input_matrix)

            # if the matrix is present, the (empty) dictionary is overwritten
            genomic_regions_dict = exp_matrix.objectsDict

            print(">>> experimental matrix loaded")

        except Exception:
            err.throw_error("MM_WRONG_EXPMAT")
    elif args.input_files:
        # get input files, if available
        for input_filename in args.input_files:
            name, _ = os.path.splitext(os.path.basename(input_filename))

            regions = GenomicRegionSet(name)
            regions.read(npath(input_filename))

            genomic_regions_dict[name] = regions

            print(">>> input file", name, "loaded:", len(regions), "regions")

    # we put this here because we don't want to create the output directory unless we
    # are sure the initialisation (including loading input files) worked
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    annotation = None
    target_genes = None
    # get promoter regions from list of genes (both target and background)
    # TODO: should be more clever, allow precomputed regions etc
    if args.target_genes_filename:
        annotation = AnnotationSet(args.organism,
                                   alias_source=args.organism,
                                   protein_coding=True,
                                   known_only=True)

        target_genes = GeneSet("target_genes")
        target_genes.read(args.target_genes_filename)

        # TODO: what do we do with unmapped genes? maybe just print them out
        target_regions = annotation.get_promoters(
            gene_set=target_genes, promoter_length=args.promoter_length)
        target_regions.name = "target_regions"
        target_regions.sort()
        output_file_name = npath(
            os.path.join(output_location, target_regions.name + ".bed"))
        target_regions.write(output_file_name)

        genomic_regions_dict[target_regions.name] = target_regions

        print(">>> target promoter file created:", len(target_regions),
              "regions")

    # we make a background in case it's requested, but also in case a list of target genes has not been
    # provided
    if args.promoter_make_background or (args.promoters_only
                                         and not args.target_genes_filename):
        if not annotation:
            annotation = AnnotationSet(args.organism,
                                       alias_source=args.organism,
                                       protein_coding=True,
                                       known_only=True)

        # background is made of all known genes minus the target genes (if any)
        background_genes = GeneSet("background_genes")
        background_genes.get_all_genes(organism=args.organism)

        if target_genes:
            background_genes.subtract(target_genes)

        background_regions = annotation.get_promoters(
            gene_set=background_genes, promoter_length=args.promoter_length)
        background_regions.name = "background_regions"
        background_regions.sort()
        output_file_name = npath(
            os.path.join(output_location, background_regions.name + ".bed"))
        background_regions.write(output_file_name)

        genomic_regions_dict[background_regions.name] = background_regions

        print(">>> background promoter file created:", len(background_regions),
              "regions")

    if not genomic_regions_dict:
        err.throw_error(
            "DEFAULT_ERROR",
            add_msg=
            "You must either specify an experimental matrix, or at least a "
            "valid input file, or one of the 'promoter test' options.")

    max_region_len = 0
    max_region = None
    regions_to_match = []

    # Iterating on experimental matrix objects
    for k in genomic_regions_dict.keys():

        curr_genomic_region = genomic_regions_dict[k]

        # If the object is a GenomicRegionSet
        if isinstance(curr_genomic_region, GenomicRegionSet):

            if args.rmdup:
                # remove duplicates and sort regions
                curr_genomic_region.remove_duplicates(sort=True)
            else:
                # sort regions
                curr_genomic_region.sort()

            # Append label and GenomicRegionSet
            regions_to_match.append(curr_genomic_region)

            # Verifying max_region_len for random region generation
            curr_len = len(curr_genomic_region)
            if curr_len > max_region_len:
                max_region_len = curr_len
                max_region = curr_genomic_region

    print(">> all files loaded")

    ###################################################################################################
    # Creating random regions
    ###################################################################################################

    # if a random proportion is set, create random regions
    if args.rand_proportion:

        # Create random coordinates and name it random_regions
        rand_region = max_region.random_regions(
            args.organism, multiply_factor=args.rand_proportion, chrom_X=True)
        rand_region.sort()
        rand_region.name = random_region_name

        # Add random regions to the list of regions to perform matching on
        regions_to_match.append(rand_region)

        # Writing random regions
        output_file_name = npath(
            os.path.join(output_location, random_region_name))
        rand_bed_file_name = output_file_name + ".bed"
        rand_region.write(rand_bed_file_name)

        # Verifying condition to write bb
        if args.bigbed:

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            try:
                # Converting to big bed
                bed_to_bb(rand_bed_file_name, chrom_sizes_file)

                # removing previously-created BED file
                os.remove(rand_bed_file_name)
            except Exception:
                err.throw_warning(
                    "DEFAULT_WARNING")  # FIXME: maybe error instead?

        print(">> random regions file created:", len(rand_region), "regions")

    ###################################################################################################
    # Creating PWMs
    ###################################################################################################

    if args.motif_dbs:
        ms = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True)
        # filter for dbs only if --motif_dbs is not set
        if 'database' in filter_values:
            del filter_values['database']
    else:
        if 'database' in filter_values:
            ms = MotifSet(preload_motifs=filter_values['database'])
        else:
            ms = MotifSet(preload_motifs="default")

    print(">> used database(s):",
          ",".join([str(db) for db in ms.motif_data.repositories_list]))

    # applying filtering pattern, taking a subset of the motif set
    if args.filter:
        ms = ms.filter(filter_values, search=args.filter_type)

    motif_list = ms.get_motif_list(args.pseudocounts, args.fpr)

    print(">> motifs loaded:", len(motif_list))

    # Performing normalized threshold strategy if requested
    if args.norm_threshold:
        threshold_list = [motif.threshold / motif.len for motif in motif_list]
        unique_threshold = sum(threshold_list) / len(threshold_list)
    else:
        unique_threshold = None

    scanner = scan.Scanner(7)
    pssm_list = []
    thresholds = []
    for motif in motif_list:
        if unique_threshold:
            thresholds.append(0.0)
            thresholds.append(0.0)
        else:
            thresholds.append(motif.threshold)
            thresholds.append(motif.threshold)

        pssm_list.append(motif.pssm)
        pssm_list.append(motif.pssm_rc)

    # Performing motif matching
    # TODO: we can expand this to use bg from sequence, for example,
    # or from organism.
    bg = tools.flat_bg(4)
    scanner.set_motifs(pssm_list, bg, thresholds)

    ###################################################################################################
    # Motif Matching
    ###################################################################################################

    # Creating genome file
    genome_file = Fastafile(genome_data.get_genome())

    print()

    # Iterating on list of genomic region sets
    for grs in regions_to_match:

        start = time.time()
        print(">> matching [",
              grs.name,
              "], ",
              len(grs),
              " regions... ",
              sep="",
              end='')
        sys.stdout.flush()

        # Initializing output bed file
        output_bed_file = os.path.join(output_location, grs.name + "_mpbs.bed")

        # must remove it because we append the MPBS
        if os.path.isfile(output_bed_file):
            os.remove(output_bed_file)

        # Iterating on genomic region set
        for genomic_region in grs:

            # Reading sequence associated to genomic_region
            sequence = str(
                genome_file.fetch(genomic_region.chrom, genomic_region.initial,
                                  genomic_region.final))

            grs_tmp = match_multiple(scanner, motif_list, sequence,
                                     genomic_region)

            # post-processing: if required, remove duplicate regions on opposing strands (keep highest score)
            if len(grs_tmp) > 1 and args.remove_strand_duplicates:
                grs_tmp.sort()
                seqs = grs_tmp.sequences
                seqs_new = []
                cur_pos = 0
                end_pos = len(seqs) - 1
                while cur_pos < end_pos:
                    gr = seqs[cur_pos]

                    new_pos = cur_pos + 1
                    while new_pos < end_pos:
                        gr2 = seqs[new_pos]

                        # if this sequence is unrelated, we move on
                        if gr.name != gr2.name or gr.chrom != gr2.chrom or gr.initial != gr2.initial or gr.final != gr2.final or gr.orientation == gr2.orientation:
                            break

                        if float(gr.data) < float(gr2.data):
                            gr = gr2

                        new_pos = new_pos + 1

                    # adding the currently-selected genomic region
                    seqs_new.append(gr)

                    # at the next loop, we start from the next right-handed sequences
                    cur_pos = new_pos

                # edge case: the last element was not considered
                # (when it is, cur_pos == end_pos+1)
                if cur_pos == end_pos:
                    seqs_new.append(seqs[cur_pos])

                grs_tmp.sequences = seqs_new

            grs_tmp.write(output_bed_file, mode="a")

        del grs.sequences[:]

        # Verifying condition to write bb
        if args.bigbed and args.normalize_bitscore:
            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed
            bed_to_bb(output_bed_file, chrom_sizes_file)

            # removing BED file
            os.remove(output_bed_file)

        secs = time.time() - start
        print("[", "%02.3f" % secs, " seconds]", sep="")