def main(argv=sys.argv[1:]):
    """Command-line program

    Parses the command line, obtains one sequence (or one pre-made k-mer
    file) per chromosome, hands each to `chrom_worker` in a process pool,
    and concatenates the files named by the workers' return values into a
    single ``*_crossmap.bed`` file. The per-chromosome files are deleted
    once they have been copied.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), sp.get_parser()],
    )
    parser.add_argument(
        "-k",
        dest="read_length",
        metavar="READ_LENGTH",
        type=int,
        default=29,
        help="K-mer length to generate from input file. " + "(Default: 29)",
    )
    parser.add_argument(
        "--offset",
        type=int,
        default=14,
        help="Offset from 5' end of plus-strand read at which to attribute score (Default: 14)",
    )
    parser.add_argument(
        "--mismatches",
        metavar="N",
        type=int,
        default=0,
        help="Number of mismatches tolerated in alignment. " + "(Default: 0)",
    )
    parser.add_argument(
        "--bowtie",
        dest="bowtie",
        default="/usr/local/bin/bowtie",
        type=str,
        help="Location of bowtie binary (Default: ``/usr/local/bin/bowtie``)",
    )
    parser.add_argument(
        "--have_kmers",
        default=False,
        action="store_true",
        help="If specified, use k-mer files from previous run. "
             + " In this case 'sequence_file' should be the value "
             + "'outbase' from the k-mer files you want to use.",
    )
    parser.add_argument(
        "--save_kmers",
        default=False,
        action="store_true",
        help="Save k-mer files for reuse in a subsequent run.",
    )
    parser.add_argument(
        "-p",
        "--processes",
        type=int,
        default=2,
        metavar="N",
        help="Number of processes to use (should be <= number of chromosomes)",
    )
    parser.add_argument(
        "ebwt",
        type=str,
        help="Bowtie index of genome against which crossmap will be made. In most cases, should be generated from the same sequences that are in `sequence_file`.",
    )
    parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    # filenames
    base = "%s_%s_%s" % (args.outbase, args.read_length, args.mismatches)
    bed_file = "%s_crossmap.bed" % base

    if args.have_kmers:
        import glob

        # Reuse k-mer files from an earlier run: the sequence name is taken
        # from the last underscore-delimited field before "_kmers.fa".
        # NOTE(review): a sequence name that itself contains "_" would be
        # truncated by this pattern -- confirm against how the k-mer files
        # are named when they are written.
        seq_pat = re.compile(r".*_([^_]*)_kmers\.fa")
        seqs = {}
        for kmer_file in glob.glob(args.sequence_file + "*kmers.fa"):
            match = seq_pat.search(kmer_file)
            if match is None:
                # The glob is looser than the regex; skip strays instead of
                # crashing on `None.groups()`.
                printer.write("Skipping k-mer file with unrecognized name: %s" % kmer_file)
                continue
            seqs[match.group(1)] = kmer_file
    else:
        seqs = sp.get_seqdict_from_args(args, index=True)

    worker = functools.partial(chrom_worker, args=args)
    chroms = list(seqs.items())

    pool = multiprocessing.Pool(processes=args.processes)
    try:
        # chunksize of 1: each (name, sequence) pair is dispatched on its own
        bed_filenames = pool.map(worker, chroms, 1)
    finally:
        # Always release worker processes, even if a worker raised
        pool.close()
        pool.join()

    with open(bed_file, "w") as fout:
        for fragment in sorted(bed_filenames):
            # Close each fragment before deleting it so no handle is leaked
            with open(fragment, "r") as fin:
                shutil.copyfileobj(fin, fout)
            os.remove(fragment)

    printer.write("Done.")
    printer.write(
        BigBedMessage.replace("OUTFILE", bed_file.replace(".bed", "")).replace(
            "BOWTIE_INDEX", args.ebwt
        )
    )
def main(argv=sys.argv[1:]):
    """Command-line program

    Parses the command line, obtains one sequence (or one pre-made k-mer
    file) per chromosome, hands each to `chrom_worker` in a process pool,
    and concatenates the files named by the workers' return values into a
    single ``*_crossmap.bed`` file. The per-chromosome files are deleted
    once they have been copied.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), sp.get_parser()],
    )
    parser.add_argument(
        "-k",
        dest="read_length",
        metavar="READ_LENGTH",
        type=int,
        default=29,
        help="K-mer length to generate from input file. " + "(Default: 29)",
    )
    parser.add_argument(
        "--offset",
        type=int,
        default=14,
        help="Offset from 5' end of plus-strand read at which to attribute score (Default: 14)",
    )
    parser.add_argument(
        "--mismatches",
        metavar="N",
        type=int,
        default=0,
        help="Number of mismatches tolerated in alignment. " + "(Default: 0)",
    )
    parser.add_argument(
        "--bowtie",
        dest="bowtie",
        default="/usr/local/bin/bowtie",
        type=str,
        help="Location of bowtie binary (Default: ``/usr/local/bin/bowtie``)",
    )
    parser.add_argument(
        "--have_kmers",
        default=False,
        action="store_true",
        help="If specified, use k-mer files from previous run. "
             + " In this case 'sequence_file' should be the value "
             + "'outbase' from the k-mer files you want to use.",
    )
    parser.add_argument(
        "--save_kmers",
        default=False,
        action="store_true",
        help="Save k-mer files for reuse in a subsequent run.",
    )
    parser.add_argument(
        "-p",
        "--processes",
        type=int,
        default=2,
        metavar="N",
        help="Number of processes to use (should be <= number of chromosomes)",
    )
    parser.add_argument(
        "ebwt",
        type=str,
        help="Bowtie index of genome against which crossmap will be made. In most cases, should be generated from the same sequences that are in `sequence_file`.",
    )
    parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    # filenames
    base = "%s_%s_%s" % (args.outbase, args.read_length, args.mismatches)
    bed_file = "%s_crossmap.bed" % base

    if args.have_kmers:
        import glob

        # Reuse k-mer files from an earlier run: the sequence name is taken
        # from the last underscore-delimited field before "_kmers.fa".
        # NOTE(review): a sequence name that itself contains "_" would be
        # truncated by this pattern -- confirm against how the k-mer files
        # are named when they are written.
        seq_pat = re.compile(r".*_([^_]*)_kmers\.fa")
        seqs = {}
        for kmer_file in glob.glob(args.sequence_file + "*kmers.fa"):
            match = seq_pat.search(kmer_file)
            if match is None:
                # The glob is looser than the regex; skip strays instead of
                # crashing on `None.groups()`.
                printer.write("Skipping k-mer file with unrecognized name: %s" % kmer_file)
                continue
            seqs[match.group(1)] = kmer_file
    else:
        seqs = sp.get_seqdict_from_args(args, index=True)

    worker = functools.partial(chrom_worker, args=args)
    chroms = list(seqs.items())

    pool = multiprocessing.Pool(processes=args.processes)
    try:
        # chunksize of 1: each (name, sequence) pair is dispatched on its own
        bed_filenames = pool.map(worker, chroms, 1)
    finally:
        # Always release worker processes, even if a worker raised
        pool.close()
        pool.join()

    with open(bed_file, "w") as fout:
        for fragment in sorted(bed_filenames):
            # Close each fragment before deleting it so no handle is leaked
            with open(fragment, "r") as fin:
                shutil.copyfileobj(fin, fout)
            os.remove(fragment)

    printer.write("Done.")
    printer.write(
        BigBedMessage.replace("OUTFILE", bed_file.replace(".bed", "")).replace(
            "BOWTIE_INDEX", args.ebwt
        )
    )
def main(argv=sys.argv[1:]):
    """Command-line program

    Reads discovered splice junctions from a BED file and sorts each one
    into exactly one of four output BED files, checked in this order:

    ``repetitive``
        the junction is reported by `covered_by_repetitive` for the loaded
        mask hash
    ``known``
        a reference file was given and `find_known_in_range` returns at
        least one junction; those junctions are written instead of the query
    ``canonical``
        ``--slide_canonical`` was given and `find_canonicals_in_range`
        returns at least one junction; those junctions are written instead
        of the query
    ``untouched``
        none of the above applied; the query is written unchanged

    Junctions written to the ``known`` and ``canonical`` files are
    de-duplicated across the whole run using `get_junction_tuple`.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    mp = MaskParser()
    bp = BaseParser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), sp.get_parser(), mp.get_parser()],
    )
    parser.add_argument(
        "--maxslide",
        type=int,
        default=10,
        help="Maximum number of nt to search 5' and 3' of intron"
             + " boundaries (Default: 10)",
    )
    parser.add_argument(
        "--ref",
        type=str,
        metavar="ref.bed",
        default=None,
        help="Reference file describing known splice junctions",
    )
    parser.add_argument(
        "--slide_canonical",
        action="store_true",
        default=False,
        help="Slide junctions to canonical junctions if present within equal support region",
    )
    parser.add_argument(
        "infile",
        type=str,
        metavar="input.bed",
        help="BED file describing discovered junctions",
    )
    parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    printer.write("Opening genome from %s..." % args.sequence_file)
    genome = sp.get_seqdict_from_args(args)

    # load crossmap
    cross_hash = mp.get_genome_hash_from_args(args)

    # load ref junctions
    if args.ref is not None:
        printer.write("Loading reference junctions from %s" % args.ref)
        # list() drains the reader before the file is closed
        with open(args.ref) as ref_fh:
            known_hash = GenomeHash(list(BED_Reader(ref_fh)), do_copy=False)
    else:
        known_hash = GenomeHash()

    # set up variables
    # (sequence at one intron end, sequence at the other) pairs, per strand
    canonicals_plus = [("GT", "AG"), ("GC", "AG")]
    canonicals_minus = [("CT", "AC"), ("CT", "GC")]

    known_in_range = 0
    canonical_in_range = 0
    repetitive = 0
    untouched = 0
    c = 0

    # NOTE(review): a set would make the membership tests below O(1), but
    # that requires get_junction_tuple() to return something hashable --
    # confirm before switching.
    seen_already = []

    outfiles = {
        "repetitive": "%s_repetitive.bed" % args.outbase,
        "known": "%s_shifted_known.bed" % args.outbase,
        "canonical": "%s_shifted_canonical.bed" % args.outbase,
        "untouched": "%s_untouched.bed" % args.outbase,
    }
    outfiles = {K: argsopener(V, args, "w") for K, V in outfiles.items()}

    try:
        # process data
        printer.write("Opening junctions from %s..." % args.infile)
        for ivc in BED_Reader(CommentReader(opener(args.infile))):
            processed = False

            if c % 1000 == 0 and c > 0:
                printer.write("Processed: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
                              (c, known_in_range, canonical_in_range, repetitive, untouched))

            assert len(ivc) == 2
            strand = ivc.strand

            minus_range, plus_range = find_match_range(ivc, genome, args.maxslide)

            # see if either end of splice junction +- match_range lands in repetitive areas of genome
            if covered_by_repetitive(ivc, minus_range, plus_range, cross_hash):
                repetitive += 1
                outfiles["repetitive"].write(ivc.as_bed())
                processed = True

            # see if one or more known junctions in range
            if not processed and args.ref is not None:
                known_juncs = find_known_in_range(
                    ivc, minus_range, plus_range, known_hash.get_nearby_features(ivc)
                )
                if len(known_juncs) > 0:
                    known_in_range += 1
                    for my_known in known_juncs:
                        tup = get_junction_tuple(my_known)
                        if tup not in seen_already:
                            outfiles["known"].write(my_known.as_bed())
                            seen_already.append(tup)
                    # Counted as handled even if every hit was a duplicate
                    processed = True

            # see if one or more canonical junctions in range
            if not processed and args.slide_canonical:
                canonicals = canonicals_plus if strand == "+" else canonicals_minus
                canonical_juncs = find_canonicals_in_range(
                    ivc, minus_range, plus_range, genome, canonicals
                )
                if len(canonical_juncs) > 0:
                    canonical_in_range += 1
                    for can in canonical_juncs:
                        tup = get_junction_tuple(can)
                        if tup not in seen_already:
                            outfiles["canonical"].write(can.as_bed())
                            seen_already.append(tup)
                    # Counted as handled even if every hit was a duplicate
                    processed = True

            if not processed:
                outfiles["untouched"].write(ivc.as_bed())
                untouched += 1

            c += 1

        # save output
        printer.write("Totals: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
                      (c, known_in_range, canonical_in_range, repetitive, untouched))
    finally:
        # Flush and close every output file even if processing failed midway
        for v in outfiles.values():
            v.close()

    printer.write("Done.")