Example #1
import os
import glob
import sqlite3

# FastaReader and FastaWriter are the project's fasta-handling classes;
# the exact import path depends on the surrounding package.


def main():
    args = get_args()
    pth = os.path.join(args.fastas, "*.fasta")
    outf = FastaWriter(args.outfile)
    conn = sqlite3.connect(args.db)
    cur = conn.cursor()
    counter = 0
    for infile in glob.glob(pth):
        # the species name is encoded in the filename, e.g. genus-species.fasta
        sp = os.path.basename(infile).split('.')[0].replace('-', '_')
        species = sp.replace('_', ' ').capitalize()
        print "Working on {}".format(species)
        # three-letter genus abbreviation used to prefix the new identifiers
        partial = species.split(' ')[0].lower()[:3]
        for read in FastaReader(infile):
            # normalize the first two header fields, e.g. ">Node_1234_..." -> "node_1234"
            nn = read.identifier.split("_")[:2]
            nn = "{}_{}".format(nn[0].strip('>').lower(), nn[1].lower())
            # the species column name cannot be parameterized, so it is interpolated
            query = "SELECT uce FROM match_map WHERE {0} = '{1}(+)' OR {0} = '{1}(-)'".format(sp, nn)
            cur.execute(query)
            result = cur.fetchall()
            # reads without a match in match_map are skipped
            if result:
                # each read should map to exactly one UCE locus
                assert len(result) == 1, "More than 1 result"
                if args.fish:
                    uce = result[0][0].split('_')[0]
                else:
                    uce = result[0][0]
                read.identifier = """{3}{2} [organism={0}] [molecule=DNA] [moltype=genomic] [location=genomic] [note=ultra conserved element locus {1}] {0} ultra-conserved element locus {1}.""".format(species, uce, partial, counter)
                # write all matching reads to a common fasta
                outf.write(read)
                counter += 1
    outf.close()
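
The example leans on a get_args() helper that is not shown. A minimal sketch of the parser it implies, with flag names inferred from the attributes used above rather than taken from the project's actual CLI:

import argparse

def get_args():
    # hypothetical reconstruction: flags inferred from args.fastas,
    # args.outfile, args.db, and args.fish used in main() above
    parser = argparse.ArgumentParser(
        description="Rename UCE contigs using a match_map database"
    )
    parser.add_argument("--fastas", required=True,
        help="directory containing the per-species *.fasta files")
    parser.add_argument("--outfile", required=True,
        help="path for the combined output fasta")
    parser.add_argument("--db", required=True,
        help="sqlite database holding the match_map table")
    parser.add_argument("--fish", action="store_true",
        help="truncate compound locus names to their first element")
    return parser.parse_args()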
Example #2
import os
import glob
import sqlite3
import ConfigParser

# get_args, get_excludes, get_metadata, get_vouchers, get_remaps,
# get_species_name, get_node_name, get_new_identifier, and the
# FastaReader/FastaWriter classes come from the surrounding project.


def parse_fasta_and_write_new_file(results, contigs, output):
    for taxon, rows in results.iteritems():
        outp = FastaWriter(os.path.join(output, "{}.fasta".format(taxon)))
        # contig filenames use hyphens where taxon names use underscores
        inp = "{}.contigs.fasta".format(taxon.replace('_', '-'))
        fasta_file = FastaReader(os.path.join(contigs, inp))
        for fasta in fasta_file:
            # normalize the first two header fields, e.g. ">Node_1234_..." -> "node_1234"
            name = '_'.join(fasta.identifier.lstrip('>').split('_')[:2]).lower()
            if name in rows:
                outp.write(fasta)
        outp.close()
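
Here, results maps each taxon name to the collection of normalized contig names to keep. A hypothetical call (data invented for illustration):

results = {"gallus_gallus": set(["node_1", "node_17"])}
parse_fasta_and_write_new_file(results, "contigs", "filtered-fastas")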
def main():
    args = get_args()
    conf = ConfigParser.ConfigParser(allow_no_value=True)
    conf.read(args.conf)
    # get metadata from the conf file
    taxon_excludes = get_excludes(conf, "exclude taxa")
    locus_excludes = get_excludes(conf, "exclude loci")
    metadata = get_metadata(conf)
    vouchers = get_vouchers(conf)
    remap = get_remaps(conf)
    # get fasta and db locations
    pth = os.path.join(args.fastas, "*.fasta")
    outf = FastaWriter(args.outfile)
    conn = sqlite3.connect(args.db)
    cur = conn.cursor()
    counter = args.start_value
    # iterate over fasta files
    for infile in glob.glob(pth):
        sp, species, partial, oldname = get_species_name(infile, remap)
        if species.lower() in taxon_excludes:
            print "Skipping {0}".format(species)
            continue
        print "Working on {}".format(species)
        for read in FastaReader(infile):
            nodename = get_node_name(read)
            # the species column name cannot be parameterized, so it is interpolated
            query = "SELECT uce FROM match_map WHERE {0} = '{1}(+)' OR {0} = '{1}(-)'".format(oldname, nodename)
            cur.execute(query)
            result = cur.fetchall()
            # reads without a match in match_map are skipped
            if result:
                # ensure we get only 1 result
                assert len(result) == 1, "More than 1 result"
                # fish data encodes the locus as a compound name; keep only
                # the prefix (TODO: deprecate)
                if args.fish:
                    uce = result[0][0].split('_')[0]
                else:
                    uce = result[0][0]
                if uce not in locus_excludes:
                    read.identifier = get_new_identifier(species, uce, partial, counter, metadata, vouchers)
                    # write all matching reads to a common fasta
                    outf.write(read)
                    counter += 1
    outf.close()
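
Because the parser is built with allow_no_value=True, the exclusion sections can list bare names with no '=' sign. A hypothetical conf file covering the two sections read above (entries are illustrative; get_metadata and get_vouchers read further sections that are not shown):

[exclude taxa]
gallus gallus
taeniopygia guttata

[exclude loci]
uce-1234
uce-5678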
Example #4
import os
import sys

# mkdir_p and FastaWriter come from the surrounding project (see the sketch below).


def write_sequences(record, header, output, sample_map, count):
    # map the cluster tag to a sample name, if a mapping was provided
    if sample_map is not None:
        header.name = sample_map[header.cluster.lower()]
    else:
        header.name = header.cluster
    record.identifier += " name={}".format(header.name)
    # create the cluster-specific output directory if it does not exist
    outdir = os.path.join(output, header.name)
    mkdir_p(outdir)
    # append to the per-cluster fasta/qual pair so records accumulate across calls
    outf = FastaWriter(
        os.path.join(outdir, "{}.fasta".format(header.name)),
        os.path.join(outdir, "{}.qual".format(header.name)),
        mode="a",
    )
    # print a progress dot every 1000 records
    if count != 0 and count % 1000 == 0:
        sys.stdout.write(".")
        sys.stdout.flush()
    outf.write(record)
    outf.close()
    count += 1
    return count, header
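
mkdir_p() is defined elsewhere; a common recipe consistent with its use here (an assumption, not necessarily the project's exact helper) mirrors `mkdir -p` by tolerating an already-existing directory:

import errno
import os

def mkdir_p(path):
    # create path and any missing parents; ignore "already exists" errors
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise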
Example #7
import sys
import time
import ConfigParser
from multiprocessing import Process, Queue, JoinableQueue

# motd, get_args, Parameters, db, get_work, multiproc, singleproc,
# ListQueue, and FastaWriter come from the surrounding project.


def main():
    """Main loop"""
    start_time = time.time()
    motd()
    args = get_args()
    print 'Started: ', time.strftime("%a %b %d, %Y  %H:%M:%S", time.localtime(start_time))
    # build our configuration object w/ input params
    conf = ConfigParser.ConfigParser()
    conf.read(args.config)
    params = Parameters(conf)
    # create the db and tables, returning connection and cursor
    conn, cur = db.create_db_and_new_tables(params.db)
    # get the read count and split the work into units
    num_reads, work = get_work(params)
    # set up monolithic output files
    outf = FastaWriter(params.output_fasta, params.output_qual)
    # MULTICORE
    if params.multiprocessing and params.num_procs > 1:
        jobs = Queue()
        results = JoinableQueue()
        # each work unit is a batch of reads, which keeps per-item
        # queue overhead down
        for unit in work:
            jobs.put(unit)
        sys.stdout.write("Starting {} workers\n".format(params.num_procs))
        sys.stdout.flush()
        sys.stdout.write('Running')
        # start the worker processes
        for i in xrange(params.num_procs):
            Process(target=multiproc, args=(jobs, results, params)).start()
        # workers put single results on the results Queue so the db can
        # consume them at a fairly consistent rate rather than in spurts
        for unit in xrange(num_reads):
            tagged = results.get()
            results.task_done()
            db.insert_record_to_db(cur, tagged)
            if tagged.cluster:
                tagged.read.identifier += " cluster={0} outer={1} inner={2}".format(
                    tagged.cluster,
                    tagged.outer_type,
                    tagged.inner_type
                )
                outf.write(tagged.read)
        # put one None sentinel per worker so each process exits its loop
        for unit in xrange(params.num_procs):
            jobs.put(None)
        # join the results queue so outstanding tasks can finish
        results.join()
        # close up our queues
        jobs.close()
        results.close()
    # SINGLECORE
    else:
        # fake a multiprocessing queue, so stacking and accessing results
        # is identical to the multicore path
        results = ListQueue()
        singleproc(work, results, params)
        for tagged in results:
            db.insert_record_to_db(cur, tagged)
            if tagged.cluster:
                tagged.read.identifier += " cluster={0} outer={1} inner={2}".format(
                    tagged.cluster,
                    tagged.outer_type,
                    tagged.inner_type
                )
                outf.write(tagged.read)
    conn.commit()
    cur.close()
    conn.close()
    outf.close()
    end_time = time.time()
    pretty_end_time = time.strftime("%a %b %d, %Y  %H:%M:%S", time.localtime(end_time))
    print "\nEnded: {} (run time {} minutes)".format(pretty_end_time,
            round((end_time - start_time) / 60, 3))
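
ListQueue is defined elsewhere in the project; the single-core branch only needs it to accept put() calls from singleproc() and to be iterable, so a minimal sketch consistent with that usage is:

class ListQueue(list):
    """A list that accepts put() like a multiprocessing Queue."""

    def put(self, item):
        self.append(item)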