def test_match_multiple(self):
    """Check the hits reported by ``match_multiple`` for the CTCF motif.

    The motif is taken from the default motif set, restricted to the
    ``jaspar_vertebrates`` database and the name ``MA0139.1.CTCF``; the
    scan covers chr1:0-5022 of ``self.genome_file``.
    """
    motif_set = MotifSet(preload_motifs="default")
    motif_set = motif_set.filter(
        {'database': ["jaspar_vertebrates"], 'name': ["MA0139.1.CTCF"]},
        search="inexact")
    # the filter must leave exactly one motif
    self.assertEqual(len(motif_set), 1)

    motif = motif_set.get_motif_list(1, 0.0001)[0]

    # two scanner entries per motif: the matrix and its reverse complement,
    # both registered with the same threshold
    scanner = scan.Scanner(7)
    thresholds = [motif.threshold, motif.threshold]
    pssm_list = [motif.pssm, motif.pssm_rc]
    bg = tools.flat_bg(4)
    scanner.set_motifs(pssm_list, bg, thresholds)

    genomic_region = GenomicRegion("chr1", 0, 5022)

    # Reading sequence associated to genomic_region
    fetched = self.genome_file.fetch(genomic_region.chrom,
                                     genomic_region.initial,
                                     genomic_region.final)
    sequence = str(fetched)

    grs = match_multiple(scanner, [motif], sequence, genomic_region)

    expected = [
        GenomicRegion("chr1", 4270, 4289, name="MA0139.1.CTCF", orientation="+"),
        GenomicRegion("chr1", 4180, 4199, name="MA0139.1.CTCF", orientation="-"),
    ]
    self.assertSequenceEqual(grs.sequences, expected)
def test_match_multiple(self):
    """Check the hits reported by ``match_multiple`` for the CTCF motif.

    The motif is built directly from the ``MA0139.1.CTCF.pwm`` file found
    under the JASPAR vertebrates data directory (resolved relative to this
    test file); the scan covers chr1:0-5022 of ``self.genome_file``.
    """
    dirname = os.path.dirname(__file__)
    jasp_dir = "../../data/motifs/jaspar_vertebrates/"

    scanner = scan.Scanner(7)

    pwm_path = os.path.join(dirname, jasp_dir, "MA0139.1.CTCF.pwm")
    motif = Motif(pwm_path, 1, 0.0001, None)

    # two scanner entries per motif: the matrix and its reverse complement,
    # each with its own threshold
    thresholds = [motif.threshold, motif.threshold_rc]
    pssm_list = [motif.pssm, motif.pssm_rc]
    bg = tools.flat_bg(4)
    scanner.set_motifs(pssm_list, bg, thresholds)

    genomic_region = GenomicRegion("chr1", 0, 5022)

    # Reading sequence associated to genomic_region
    fetched = self.genome_file.fetch(genomic_region.chrom,
                                     genomic_region.initial,
                                     genomic_region.final)
    sequence = str(fetched)

    grs = match_multiple(scanner, [motif], sequence, genomic_region)

    expected = [
        GenomicRegion("chr1", 4270, 4289, name="MA0139.1.CTCF", orientation="+"),
        GenomicRegion("chr1", 4180, 4199, name="MA0139.1.CTCF", orientation="-"),
    ]
    self.assertSequenceEqual(grs.sequences, expected)
def _remove_strand_duplicates(seqs):
    """Collapse runs of adjacent hits that differ only by strand.

    Two hits are treated as duplicates when they share ``name``, ``chrom``,
    ``initial`` and ``final`` but have different ``orientation``. From each
    run of such hits only the one whose ``data`` (converted to ``float``) is
    largest is kept; on ties the earliest hit wins.

    The input is expected to be sorted already, so that duplicates are
    adjacent. The whole list is scanned, including its last element.

    Args:
        seqs: sorted list of hit objects exposing ``name``, ``chrom``,
            ``initial``, ``final``, ``orientation`` and ``data``.

    Returns:
        A new list with the surviving hits, in their original order.
    """
    kept = []
    total = len(seqs)
    cur_pos = 0
    while cur_pos < total:
        best = seqs[cur_pos]

        new_pos = cur_pos + 1
        while new_pos < total:
            other = seqs[new_pos]

            # if this sequence is unrelated, we move on
            if (best.name != other.name or best.chrom != other.chrom
                    or best.initial != other.initial or best.final != other.final
                    or best.orientation == other.orientation):
                break

            if float(best.data) < float(other.data):
                best = other

            new_pos += 1

        # adding the currently-selected genomic region
        kept.append(best)

        # at the next loop, we start from the first hit not yet examined
        cur_pos = new_pos

    return kept


def main(args):
    """
    Performs motif matching.

    The steps, in order:

    1. collect the region sets to scan: from an experimental matrix
       (``args.input_matrix``) or from input files (``args.input_files``),
       plus optional target/background promoter sets;
    2. create the output directory;
    3. optionally create random regions (``args.rand_proportion``);
    4. load the motifs and configure the scanner;
    5. for every region set, match all motifs against each region's
       sequence and append the hits to ``<set name>_mpbs.bed`` in the
       output directory (optionally converted to bigBed).

    Args:
        args: parsed command-line options. The attributes read here are
            ``filter``, ``filter_type``, ``output_location``, ``organism``,
            ``pseudocounts``, ``fpr``, ``input_matrix``, ``input_files``,
            ``target_genes_filename``, ``promoter_length``,
            ``promoter_make_background``, ``promoters_only``, ``rmdup``,
            ``rand_proportion``, ``bigbed``, ``motif_dbs``,
            ``norm_threshold``, ``remove_strand_duplicates`` and
            ``normalize_bitscore``.
    """

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing Error Handler
    err = ErrorHandler()

    # Additional Parameters
    matching_folder_name = "match"
    random_region_name = "random_regions"

    filter_values = parse_filter(args.filter)

    ###################################################################################################
    # Initializations
    ###################################################################################################

    # Output folder
    if args.output_location:
        output_location = args.output_location
    else:
        output_location = npath(matching_folder_name)
    print(">> output location:", output_location)

    # Default genomic data
    genome_data = GenomeData(args.organism)

    print(">> genome:", genome_data.organism)
    print(">> pseudocounts:", args.pseudocounts)
    print(">> fpr threshold:", args.fpr)

    ###################################################################################################
    # Reading Input Regions
    ###################################################################################################

    genomic_regions_dict = {}

    # get experimental matrix, if available
    if args.input_matrix:
        try:
            exp_matrix = ExperimentalMatrix()
            exp_matrix.read(args.input_matrix)

            # if the matrix is present, the (empty) dictionary is overwritten
            genomic_regions_dict = exp_matrix.objectsDict

            print(">>> experimental matrix loaded")

        except Exception:
            err.throw_error("MM_WRONG_EXPMAT")
    elif args.input_files:
        # get input files, if available
        for input_filename in args.input_files:
            name, _ = os.path.splitext(os.path.basename(input_filename))

            regions = GenomicRegionSet(name)
            regions.read(npath(input_filename))

            genomic_regions_dict[name] = regions

            print(">>> input file", name, "loaded:", len(regions), "regions")

    # we put this here because we don't want to create the output directory unless we
    # are sure the initialisation (including loading input files) worked
    try:
        if not os.path.isdir(output_location):
            os.makedirs(output_location)
    except Exception:
        err.throw_error("MM_OUT_FOLDER_CREATION")

    annotation = None
    target_genes = None
    # get promoter regions from list of genes (both target and background)
    # TODO: should be more clever, allow precomputed regions etc
    if args.target_genes_filename:
        annotation = AnnotationSet(args.organism, alias_source=args.organism,
                                   protein_coding=True, known_only=True)

        target_genes = GeneSet("target_genes")
        target_genes.read(args.target_genes_filename)

        # TODO: what do we do with unmapped genes? maybe just print them out

        target_regions = annotation.get_promoters(
            gene_set=target_genes, promoter_length=args.promoter_length)
        target_regions.name = "target_regions"
        target_regions.sort()
        output_file_name = npath(
            os.path.join(output_location, target_regions.name + ".bed"))
        target_regions.write(output_file_name)

        genomic_regions_dict[target_regions.name] = target_regions

        print(">>> target promoter file created:", len(target_regions), "regions")

    # we make a background in case it's requested, but also in case a list of target genes has not been
    # provided
    if args.promoter_make_background or (args.promoters_only and not args.target_genes_filename):
        if not annotation:
            annotation = AnnotationSet(args.organism, alias_source=args.organism,
                                       protein_coding=True, known_only=True)

        # background is made of all known genes minus the target genes (if any)
        background_genes = GeneSet("background_genes")
        background_genes.get_all_genes(organism=args.organism)

        if target_genes:
            background_genes.subtract(target_genes)

        background_regions = annotation.get_promoters(
            gene_set=background_genes, promoter_length=args.promoter_length)
        background_regions.name = "background_regions"
        background_regions.sort()
        output_file_name = npath(
            os.path.join(output_location, background_regions.name + ".bed"))
        background_regions.write(output_file_name)

        genomic_regions_dict[background_regions.name] = background_regions

        print(">>> background promoter file created:", len(background_regions), "regions")

    if not genomic_regions_dict:
        err.throw_error(
            "DEFAULT_ERROR",
            add_msg="You must either specify an experimental matrix, or at least a "
                    "valid input file, or one of the 'promoter test' options.")

    max_region_len = 0
    max_region = None
    regions_to_match = []

    # Iterating on experimental matrix objects
    for curr_genomic_region in genomic_regions_dict.values():

        # Only GenomicRegionSet objects are matched; anything else is skipped
        if isinstance(curr_genomic_region, GenomicRegionSet):

            if args.rmdup:
                # remove duplicates and sort regions
                curr_genomic_region.remove_duplicates(sort=True)
            else:
                # sort regions
                curr_genomic_region.sort()

            # Append label and GenomicRegionSet
            regions_to_match.append(curr_genomic_region)

            # Verifying max_region_len for random region generation
            curr_len = len(curr_genomic_region)
            if curr_len > max_region_len:
                max_region_len = curr_len
                max_region = curr_genomic_region

    print(">> all files loaded")

    ###################################################################################################
    # Creating random regions
    ###################################################################################################

    # if a random proportion is set, create random regions
    if args.rand_proportion:

        # Create random coordinates (based on the largest region set) and name it random_regions
        rand_region = max_region.random_regions(
            args.organism, multiply_factor=args.rand_proportion, chrom_X=True)
        rand_region.sort()
        rand_region.name = random_region_name

        # Add random regions to the list of regions to perform matching on
        regions_to_match.append(rand_region)

        # Writing random regions
        output_file_name = npath(
            os.path.join(output_location, random_region_name))
        rand_bed_file_name = output_file_name + ".bed"
        rand_region.write(rand_bed_file_name)

        # Verifying condition to write bb
        if args.bigbed:

            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            try:
                # Converting to big bed
                bed_to_bb(rand_bed_file_name, chrom_sizes_file)

                # removing previously-created BED file
                os.remove(rand_bed_file_name)
            except Exception:
                err.throw_warning(
                    "DEFAULT_WARNING")  # FIXME: maybe error instead?

        print(">> random regions file created:", len(rand_region), "regions")

    ###################################################################################################
    # Creating PWMs
    ###################################################################################################

    if args.motif_dbs:
        ms = MotifSet(preload_motifs=args.motif_dbs, motif_dbs=True)
        # filter for dbs only if --motif_dbs is not set
        if 'database' in filter_values:
            del filter_values['database']
    else:
        if 'database' in filter_values:
            ms = MotifSet(preload_motifs=filter_values['database'])
        else:
            ms = MotifSet(preload_motifs="default")

    print(">> used database(s):",
          ",".join(str(db) for db in ms.motif_data.repositories_list))

    # applying filtering pattern, taking a subset of the motif set
    if args.filter:
        ms = ms.filter(filter_values, search=args.filter_type)

    motif_list = ms.get_motif_list(args.pseudocounts, args.fpr)

    print(">> motifs loaded:", len(motif_list))

    # Performing normalized threshold strategy if requested.
    # The mean is only computed when there is at least one motif: an empty
    # motif list would otherwise raise ZeroDivisionError here.
    if args.norm_threshold and motif_list:
        threshold_list = [motif.threshold / motif.len for motif in motif_list]
        unique_threshold = sum(threshold_list) / len(threshold_list)
    else:
        unique_threshold = None

    scanner = scan.Scanner(7)
    pssm_list = []
    thresholds = []
    # two scanner entries per motif: the matrix and its reverse complement
    for motif in motif_list:
        if unique_threshold:
            thresholds.append(0.0)
            thresholds.append(0.0)
        else:
            thresholds.append(motif.threshold)
            thresholds.append(motif.threshold)

        pssm_list.append(motif.pssm)
        pssm_list.append(motif.pssm_rc)

    # Performing motif matching
    # TODO: we can expand this to use bg from sequence, for example,
    # or from organism.
    bg = tools.flat_bg(4)
    scanner.set_motifs(pssm_list, bg, thresholds)

    ###################################################################################################
    # Motif Matching
    ###################################################################################################

    # Creating genome file
    genome_file = Fastafile(genome_data.get_genome())

    print()

    # Iterating on list of genomic region sets
    for grs in regions_to_match:

        start = time.time()
        print(">> matching [", grs.name, "], ", len(grs), " regions... ", sep="", end='')
        sys.stdout.flush()

        # Initializing output bed file
        output_bed_file = os.path.join(output_location, grs.name + "_mpbs.bed")

        # must remove it because we append the MPBS
        if os.path.isfile(output_bed_file):
            os.remove(output_bed_file)

        # Iterating on genomic region set
        for genomic_region in grs:

            # Reading sequence associated to genomic_region
            sequence = str(
                genome_file.fetch(genomic_region.chrom, genomic_region.initial,
                                  genomic_region.final))

            grs_tmp = match_multiple(scanner, motif_list, sequence, genomic_region)

            # post-processing: if required, remove duplicate regions on opposing strands (keep highest score)
            if len(grs_tmp) > 1 and args.remove_strand_duplicates:
                # sorting first makes the strand duplicates adjacent
                grs_tmp.sort()
                grs_tmp.sequences = _remove_strand_duplicates(grs_tmp.sequences)

            grs_tmp.write(output_bed_file, mode="a")

        # the regions of this set are emptied in place once it has been matched
        del grs.sequences[:]

        # Verifying condition to write bb
        if args.bigbed and args.normalize_bitscore:
            # Fetching file with chromosome sizes
            chrom_sizes_file = genome_data.get_chromosome_sizes()

            # Converting to big bed
            bed_to_bb(output_bed_file, chrom_sizes_file)

            # removing BED file
            os.remove(output_bed_file)

        secs = time.time() - start
        print("[", "%02.3f" % secs, " seconds]", sep="")