示例#1
0
def main(argv=sys.argv[1:]):
    """Command-line program

    Parses command-line arguments, hands each sequence to
    :py:func:`chrom_worker` in a pool of worker processes, and concatenates
    the per-sequence files whose names the workers return into a single
    ``<outbase>_<read_length>_<mismatches>_crossmap.bed`` file. The
    per-sequence files are deleted once they have been copied.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), sp.get_parser()],
    )
    parser.add_argument(
        "-k",
        dest="read_length",
        metavar="READ_LENGTH",
        type=int,
        default=29,
        help="K-mer length to generate from input file. (Default: 29)",
    )
    parser.add_argument(
        "--offset",
        type=int,
        default=14,
        help="Offset from 5' end of plus-strand read at which to attribute "
             "score (Default: 14)",
    )
    parser.add_argument(
        "--mismatches",
        metavar="N",
        type=int,
        default=0,
        help="Number of mismatches tolerated in alignment. (Default: 0)",
    )
    parser.add_argument(
        "--bowtie",
        dest="bowtie",
        default="/usr/local/bin/bowtie",
        type=str,
        help="Location of bowtie binary (Default: ``/usr/local/bin/bowtie``)",
    )
    parser.add_argument(
        "--have_kmers",
        default=False,
        action="store_true",
        help="If specified, use k-mer files from previous run. "
             " In this case 'sequence_file' should be the value "
             "'outbase' from the k-mer files you want to use.",
    )
    parser.add_argument(
        "--save_kmers",
        default=False,
        action="store_true",
        help="Save k-mer files for reuse in a subsequent run.",
    )
    parser.add_argument(
        "-p",
        "--processes",
        type=int,
        default=2,
        metavar="N",
        help="Number of processes to use (should be <= number of chromosomes)",
    )
    parser.add_argument(
        "ebwt",
        type=str,
        help="Bowtie index of genome against which crossmap will be made. "
             "In most cases, should be generated from the same sequences "
             "that are in `sequence_file`.",
    )
    parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    # filenames
    base = "%s_%s_%s" % (args.outbase, args.read_length, args.mismatches)
    bed_file = "%s_crossmap.bed" % base

    # NOTE: no existence check is made on args.sequence_file, because with
    # --have_kmers it is a filename prefix for k-mer files, not a file itself.
    if args.have_kmers:
        import glob

        # The sequence name is the underscore-free token immediately before
        # the "_kmers.fa" suffix. The dot is escaped and the pattern anchored
        # so that only a genuine suffix is accepted.
        seq_pat = re.compile(r".*_([^_]*)_kmers\.fa$")
        seqs = {}
        for kmer_file in glob.glob(args.sequence_file + "*kmers.fa"):
            match = seq_pat.search(kmer_file)
            # The glob is looser than the naming pattern; skip files that do
            # not follow it instead of crashing on a failed match.
            if match is not None:
                seqs[match.group(1)] = kmer_file
    else:
        seqs = sp.get_seqdict_from_args(args, index=True)

    worker = functools.partial(chrom_worker, args=args)
    chroms = seqs.items()

    pool = multiprocessing.Pool(processes=args.processes)
    try:
        # chunksize of 1: each (name, sequence) pair is dispatched on its own
        bed_filenames = pool.map(worker, chroms, 1)
    except BaseException:
        # Do not leave worker processes running if any job fails
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    # Merge the per-sequence outputs in sorted filename order, closing each
    # input before it is deleted.
    with open(bed_file, "w") as fout:
        for part_name in sorted(bed_filenames):
            with open(part_name, "r") as fin:
                shutil.copyfileobj(fin, fout)
            os.remove(part_name)

    printer.write("Done.")
    printer.write(
        BigBedMessage.replace("OUTFILE", bed_file.replace(".bed", "")).replace(
            "BOWTIE_INDEX", args.ebwt))
示例#2
0
def main(argv=sys.argv[1:]):
    """Command-line program

    Parses command-line arguments, hands each sequence to
    :py:func:`chrom_worker` in a pool of worker processes, and concatenates
    the per-sequence files whose names the workers return into a single
    ``<outbase>_<read_length>_<mismatches>_crossmap.bed`` file. The
    per-sequence files are deleted once they have been copied.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), sp.get_parser()],
    )
    parser.add_argument(
        "-k",
        dest="read_length",
        metavar="READ_LENGTH",
        type=int,
        default=29,
        help="K-mer length to generate from input file. (Default: 29)",
    )
    parser.add_argument(
        "--offset",
        type=int,
        default=14,
        help="Offset from 5' end of plus-strand read at which to attribute "
             "score (Default: 14)",
    )
    parser.add_argument(
        "--mismatches",
        metavar="N",
        type=int,
        default=0,
        help="Number of mismatches tolerated in alignment. (Default: 0)",
    )
    parser.add_argument(
        "--bowtie",
        dest="bowtie",
        default="/usr/local/bin/bowtie",
        type=str,
        help="Location of bowtie binary (Default: ``/usr/local/bin/bowtie``)",
    )
    parser.add_argument(
        "--have_kmers",
        default=False,
        action="store_true",
        help="If specified, use k-mer files from previous run. "
             " In this case 'sequence_file' should be the value "
             "'outbase' from the k-mer files you want to use.",
    )
    parser.add_argument(
        "--save_kmers",
        default=False,
        action="store_true",
        help="Save k-mer files for reuse in a subsequent run.",
    )
    parser.add_argument(
        "-p",
        "--processes",
        type=int,
        default=2,
        metavar="N",
        help="Number of processes to use (should be <= number of chromosomes)",
    )
    parser.add_argument(
        "ebwt",
        type=str,
        help="Bowtie index of genome against which crossmap will be made. "
             "In most cases, should be generated from the same sequences "
             "that are in `sequence_file`.",
    )
    parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    # filenames
    base = "%s_%s_%s" % (args.outbase, args.read_length, args.mismatches)
    bed_file = "%s_crossmap.bed" % base

    # NOTE: no existence check is made on args.sequence_file, because with
    # --have_kmers it is a filename prefix for k-mer files, not a file itself.
    if args.have_kmers:
        import glob

        # The sequence name is the underscore-free token immediately before
        # the "_kmers.fa" suffix. The dot is escaped and the pattern anchored
        # so that only a genuine suffix is accepted.
        seq_pat = re.compile(r".*_([^_]*)_kmers\.fa$")
        seqs = {}
        for kmer_file in glob.glob(args.sequence_file + "*kmers.fa"):
            match = seq_pat.search(kmer_file)
            # The glob is looser than the naming pattern; skip files that do
            # not follow it instead of crashing on a failed match.
            if match is not None:
                seqs[match.group(1)] = kmer_file
    else:
        seqs = sp.get_seqdict_from_args(args, index=True)

    worker = functools.partial(chrom_worker, args=args)
    chroms = seqs.items()

    pool = multiprocessing.Pool(processes=args.processes)
    try:
        # chunksize of 1: each (name, sequence) pair is dispatched on its own
        bed_filenames = pool.map(worker, chroms, 1)
    except BaseException:
        # Do not leave worker processes running if any job fails
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    # Merge the per-sequence outputs in sorted filename order, closing each
    # input before it is deleted.
    with open(bed_file, "w") as fout:
        for part_name in sorted(bed_filenames):
            with open(part_name, "r") as fin:
                shutil.copyfileobj(fin, fout)
            os.remove(part_name)

    printer.write("Done.")
    printer.write(
        BigBedMessage.replace("OUTFILE", bed_file.replace(".bed", "")).replace(
            "BOWTIE_INDEX", args.ebwt))
示例#3
0
def main(argv=sys.argv[1:]):
    """Command-line program

    Reads splice junctions from a BED file and sorts each one into exactly
    one of four output files, tested in this order:

    ``<outbase>_repetitive.bed``
        the junction, give or take its match range, lands in a repetitive
        area of the genome (per the crossmap / mask)

    ``<outbase>_shifted_known.bed``
        one or more reference junctions (``--ref``) lie within the match
        range; the reference junctions are written, each at most once

    ``<outbase>_shifted_canonical.bed``
        ``--slide_canonical`` was given and one or more canonical junctions
        lie within the match range; those junctions are written, each at
        most once

    ``<outbase>_untouched.bed``
        none of the above applied; the junction is written unchanged

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    mp = MaskParser()
    bp = BaseParser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), sp.get_parser(), mp.get_parser()],
    )
    parser.add_argument(
        "--maxslide",
        type=int,
        default=10,
        help="Maximum number of nt to search 5' and 3' of intron"
             " boundaries (Default: 10)",
    )
    parser.add_argument(
        "--ref",
        type=str,
        metavar="ref.bed",
        default=None,
        help="Reference file describing known splice junctions",
    )
    parser.add_argument(
        "--slide_canonical",
        action="store_true",
        default=False,
        help="Slide junctions to canonical junctions if present within equal support region",
    )
    parser.add_argument(
        "infile",
        type=str,
        metavar="input.bed",
        help="BED file describing discovered junctions",
    )
    parser.add_argument("outbase", type=str, help="Basename for output files")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    printer.write("Opening genome from %s..." % args.sequence_file)
    genome = sp.get_seqdict_from_args(args)

    # load crossmap
    cross_hash = mp.get_genome_hash_from_args(args)

    # load ref junctions
    if args.ref is not None:
        printer.write("Loading reference junctions from %s" % args.ref)
        # list() consumes the reader before the handle is closed
        with open(args.ref) as ref_fh:
            known_hash = GenomeHash(list(BED_Reader(ref_fh)), do_copy=False)
    else:
        known_hash = GenomeHash()

    # Dinucleotide pairs accepted as canonical on the plus strand
    canonicals_plus = [("GT", "AG"),
                       ("GC", "AG"),
                       ]

    # The same pairs as they read on the minus strand (reverse complements)
    canonicals_minus = [("CT", "AC"),
                        ("CT", "GC"),
                        ]

    known_in_range = 0
    canonical_in_range = 0
    repetitive = 0
    untouched = 0
    c = 0

    # Junction tuples already written to the "known"/"canonical" files.
    # A set keeps the membership test constant-time; this assumes
    # get_junction_tuple() returns a hashable tuple -- TODO confirm.
    seen_already = set()

    outfiles = {
        "repetitive": "%s_repetitive.bed" % args.outbase,
        "known": "%s_shifted_known.bed" % args.outbase,
        "canonical": "%s_shifted_canonical.bed" % args.outbase,
        "untouched": "%s_untouched.bed" % args.outbase,
    }
    outfiles = {K: argsopener(V, args, "w") for K, V in outfiles.items()}

    # try/finally guarantees the output files are closed even if a junction
    # fails to process
    try:
        # process data
        printer.write("Opening junctions from %s..." % args.infile)
        for ivc in BED_Reader(CommentReader(opener(args.infile))):
            processed = False

            # progress report every 1000 junctions
            if c % 1000 == 0 and c > 0:
                printer.write("Processed: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" %
                              (c, known_in_range, canonical_in_range, repetitive, untouched))

            # a junction must consist of exactly two segments
            assert len(ivc) == 2
            strand = ivc.strand

            minus_range, plus_range = find_match_range(ivc, genome, args.maxslide)

            # see if either end of splice junction +- match_range lands in
            # repetitive areas of genome
            if covered_by_repetitive(ivc, minus_range, plus_range, cross_hash):
                repetitive += 1
                outfiles["repetitive"].write(ivc.as_bed())
                processed = True

            # see if one or more known junctions in range
            if not processed and args.ref is not None:
                known_juncs = find_known_in_range(ivc,
                                                  minus_range,
                                                  plus_range,
                                                  known_hash.get_nearby_features(ivc))
                if len(known_juncs) > 0:
                    known_in_range += 1
                    for my_known in known_juncs:
                        tup = get_junction_tuple(my_known)
                        if tup not in seen_already:
                            outfiles["known"].write(my_known.as_bed())
                            seen_already.add(tup)

                    processed = True

            # see if one or more canonical junctions in range
            if not processed and args.slide_canonical:
                canonicals = canonicals_plus if strand == "+" else canonicals_minus
                canonical_juncs = find_canonicals_in_range(ivc,
                                                           minus_range,
                                                           plus_range,
                                                           genome,
                                                           canonicals)
                if len(canonical_juncs) > 0:
                    canonical_in_range += 1
                    for can in canonical_juncs:
                        tup = get_junction_tuple(can)
                        if tup not in seen_already:
                            outfiles["canonical"].write(can.as_bed())
                            seen_already.add(tup)

                    processed = True

            if not processed:
                outfiles["untouched"].write(ivc.as_bed())
                untouched += 1

            c += 1

        # report totals
        printer.write("Totals: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" %
                      (c, known_in_range, canonical_in_range, repetitive, untouched))
    finally:
        for v in outfiles.values():
            v.close()

    printer.write("Done.")