示例#1
0
    genome_path = dir_in+filename
    dbfile_path = "data/blast_db/"+rec_name

    while True:
        if not path.exists(dbfile_path+".nhr"):
            if file_ext == 'gbk':
                try:
                    print "converting,",
                    record = load_genbank(genome_path)
                except IOError:
                    print "failed to load Genbank file"
                    break
                else:
                    try:
                        genome_path = dir_in+rec_name+".fas"
                        write_fasta(genome_path, record)
                    except Exception:
                        print "failed to write Fasta file"
                        break
            try:
                print "making a DB,",
                make_blastDB(dbfile_path, genome_path, 'nucl')
            except IOError:
                print "failed to make DB"
                break
        try:
            # blastx against each genome DB
            outfile = blast_out+rec_name+".txt"
            prefs = {'evalue': 0.001, 'outfmt_pref': 6}
            print "blasting,",
            if blast_mode == 'n':
示例#2
0
    genome_path = dir_in + filename
    dbfile_path = "data/blast_db/" + rec_name

    while True:
        if not path.exists(dbfile_path + ".nhr"):
            if file_ext == 'gbk':
                try:
                    print "converting,",
                    record = load_genbank(genome_path)
                except IOError:
                    print "failed to load Genbank file"
                    break
                else:
                    try:
                        genome_path = dir_in + rec_name + ".fas"
                        write_fasta(genome_path, record)
                    except Exception:
                        print "failed to write Fasta file"
                        break
            try:
                print "making a DB,",
                make_blastDB(dbfile_path, genome_path, 'nucl')
            except IOError:
                print "failed to make DB"
                break
        try:
            # blastx against each genome DB
            outfile = blast_out + rec_name + ".txt"
            prefs = {'evalue': 0.001, 'outfmt_pref': 6}
            print "blasting,",
            if blast_mode == 'n':
示例#3
0
                # fetch contig records
                ctg_count = 0
                while ctg_count < ctg_num:
                    # TODO: better formatting
                    ctg_count += 1
                    if ctg_count < 10:
                        ctg_id = base_code + '0000' + str(ctg_count)
                    elif ctg_count < 100:
                        ctg_id = base_code + '000' + str(ctg_count)
                    elif ctg_count < 1000:
                        ctg_id = base_code + '00' + str(ctg_count)
                    else:  # shouldn't happen but hey...
                        ctg_id = base_code + '0' + str(ctg_count)
                    # fetch contig record
                    try:
                        fname = EFetcher(ctg_id[3:], seqdir)  # 3 if not NZ_
                    except Exception:
                        print "Error retrieving record"
                    else:
                        try:
                            records.append(load_genbank(fname))
                        except Exception:
                            print "Error loading record"

                write_fasta(data_dir + rec_id + ".fas", records)

            print "OK"
            break

# confirm complete stop
print "BatchFetcher has downloaded " + str(counter) + " records to file."
示例#4
0
# script to translate sequences in multifasta files into proteins

from sys import argv
from libs.common import load_multifasta, write_fasta
from Bio.SeqRecord import SeqRecord

# Build the input path from the two command-line arguments; the output
# path is the input path minus its last four characters plus "_aa.fas".
origin_dir = "data/"+argv[1]+"/"
in_file = origin_dir+argv[2]
outfile = in_file[:-4]+"_aa.fas"

# Translate every record of the multifasta, keeping each record's id.
proteins = [SeqRecord(id=nt_rec.id, seq=nt_rec.seq.translate())
            for nt_rec in load_multifasta(in_file)]

write_fasta(outfile, proteins)
示例#5
0
                # fetch contig records
                ctg_count = 0
                while ctg_count < ctg_num:
                    # TODO: better formatting
                    ctg_count += 1
                    if ctg_count < 10:
                        ctg_id = base_code+'0000'+str(ctg_count)
                    elif ctg_count < 100:
                        ctg_id = base_code+'000'+str(ctg_count)
                    elif ctg_count < 1000:
                        ctg_id = base_code+'00'+str(ctg_count)
                    else: # shouldn't happen but hey...
                        ctg_id = base_code+'0'+str(ctg_count)
                    # fetch contig record
                    try:
                        fname = EFetcher(ctg_id[3:], seqdir) # 3 if not NZ_
                    except Exception:
                        print "Error retrieving record"
                    else:
                        try:
                            records.append(load_genbank(fname))
                        except Exception:
                            print "Error loading record"

                write_fasta(data_dir+rec_id+".fas", records)

            print "OK"
            break

# confirm complete stop
print "BatchFetcher has downloaded " + str(counter) + " records to file."
示例#6
0
                # create file
                out_handle = open(outfile, 'w')
            else:
                counter +=1
                out_handle = open(outfile, 'a')
            out_handle.write("\t".join([str(item) for item in line])+"\n")
            # extract sequence to array
            rev_flag = False
            if line[8] < line[9]:
                q_start, q_stop = line[8]-1, line[9]
                rev_flag = False
            else:
                q_start, q_stop = line[9]-1, line[8]
                rev_flag = True
            master_seq = load_fasta("data/contigs_fas/"+subject+".fas")
            seq_bit = master_seq[q_start:q_stop]
            if rev_flag:
                seq_bit = seq_bit.reverse_complement()
            record = SeqRecord(id=subject+"_"+str(counter), seq=seq_bit.seq)
            if query not in records_dict.keys():
                records_dict[query] = [record]
            else:
                records_dict[query].append(record)
# write out sequences: one "<query>_nt.fas" file per query key
for query, query_records in records_dict.items():
    write_fasta(data_dir+query+"_nt.fas", query_records)



示例#7
0
## script to combine several fasta sequences into a single one in a specific order

from sys import argv
from libs.common import  load_fasta, write_fasta

origin_dir = "data/"+argv[1]+"/"
destin_file = origin_dir+argv[2]+".fas"
base_name = argv[3]

# adapt this part
# each entry is (contig number, flag); a truthy flag means the contig is
# reverse-complemented before it is joined
order = [
    (22, 0), (4, 0), (57, 1), (43, 1), (64, 0), (18, 0),
    (54, 0), (36, 1), (20, 1), (2, 1), (40, 1), (17, 1),
    (35, 1), (38, 1), (37, 1), (55, 1), (19, 1), (47, 1),
    (11, 0), (46, 0), (61, 0), (41, 1), (15, 0), (1, 1),
    (5, 1), (6, 0), (13, 1), (8, 0), (23, 0), (16, 1),
    (10, 0), (60, 0), (14, 0), (42, 0), (39, 0), (48, 0),
    (9, 1), (21, 0), (3, 1), (58, 1), (32, 0),
]

# run of N characters appended after each contig that follows the first
# NOTE(review): no spacer ends up between the first and second contigs and
# the merged sequence ends with a spacer -- confirm this is intended.
spacer = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"


def _load_oriented(number, flip):
    """Load contig file <base_name><number>.fas, reverse-complemented if flip."""
    contig = load_fasta(origin_dir+base_name+str(number)+".fas")
    if flip:
        contig = contig.reverse_complement()
    return contig


# the first contig seeds the merged record
first_number, first_flip = order[0]
record = _load_oriented(first_number, first_flip)

# every further contig is appended, followed by the spacer
for number, flip in order[1:]:
    record += _load_oriented(number, flip)
    record += spacer

record.id = argv[2]

write_fasta(destin_file, record)
示例#8
0
            outfile = data_dir + query + "_results.txt"
            if not path.exists(outfile):
                # create file
                out_handle = open(outfile, 'w')
            else:
                counter += 1
                out_handle = open(outfile, 'a')
            out_handle.write("\t".join([str(item) for item in line]) + "\n")
            # extract sequence to array
            rev_flag = False
            if line[8] < line[9]:
                q_start, q_stop = line[8] - 1, line[9]
                rev_flag = False
            else:
                q_start, q_stop = line[9] - 1, line[8]
                rev_flag = True
            master_seq = load_fasta("data/contigs_fas/" + subject + ".fas")
            seq_bit = master_seq[q_start:q_stop]
            if rev_flag:
                seq_bit = seq_bit.reverse_complement()
            record = SeqRecord(id=subject + "_" + str(counter),
                               seq=seq_bit.seq)
            if query not in records_dict.keys():
                records_dict[query] = [record]
            else:
                records_dict[query].append(record)
# write out sequences: one "<query>_nt.fas" file per query key
for query, query_records in records_dict.items():
    write_fasta(data_dir + query + "_nt.fas", query_records)
示例#9
0
    rev_flag = False
    if line[8] < line[9]:
        q_start, q_stop = line[8] - 1, line[9]
        rev_flag = False
    else:
        q_start, q_stop = line[9] - 1, line[8]
        rev_flag = True

    c_start, c_stop = q_start - capture_span, q_stop + capture_span

    master_seq = load_fasta("data/contigs_fas/" + subject + ".fas")

    if c_start < 0:
        c_start = 0
    if c_stop > len(master_seq.seq):
        c_stop = len(master_seq.seq)

    seq_bit = master_seq[c_start:c_stop]

    if rev_flag:
        seq_bit = seq_bit.reverse_complement()
    record = SeqRecord(id=subject, seq=seq_bit.seq, description=descript)
    records.append(record)

    rec_file = ctx_dir + subject + "_" + query + "_ctxt.fas"
    write_fasta(rec_file, record)

write_fasta(main_out, records)

### TODO: modify bb tools to accept multifasta as sole input
示例#10
0
    rev_flag = False
    if line[8] < line[9]:
        q_start, q_stop = line[8]-1, line[9]
        rev_flag = False
    else:
        q_start, q_stop = line[9]-1, line[8]
        rev_flag = True

    c_start, c_stop = q_start-capture_span, q_stop+capture_span

    master_seq = load_fasta("data/contigs_fas/"+subject+".fas")

    if c_start < 0:
        c_start = 0
    if c_stop > len(master_seq.seq):
        c_stop = len(master_seq.seq)

    seq_bit = master_seq[c_start:c_stop]

    if rev_flag:
        seq_bit = seq_bit.reverse_complement()
    record = SeqRecord(id=subject, seq=seq_bit.seq, description=descript)
    records.append(record)

    rec_file = ctx_dir+subject+"_"+query+"_ctxt.fas"
    write_fasta(rec_file, record)

write_fasta(main_out, records)

### TODO: modify bb tools to accept multifasta as sole input
示例#11
0
records = []
ensure_dir([data_dir])

filenames = from_dir(dir_in, re.compile(r'.*\.gbk'))

for filename in filenames:
    rec_name = filename[:filename.find(".gbk")]
    print '.',

    # load data
    record = load_genbank(dir_in + "/" + filename)

    # scan annotations
    for feat in record.features:
        if feat.type == feat_type:
            try:
                if feat_name in feat.qualifiers[feat_tag]:
                    print '\nfound', feat_name, 'in', rec_name
                    # extract sequence
                    new_rec = feat.extract(record)
                    new_rec.id = rec_name + '_' + feat_name
                    new_rec.description = "Extracted from " + new_rec.description
                    records.append(new_rec)

            except KeyError:
                pass

print ''
write_fasta(main_out, records)
示例#12
0
records = []
ensure_dir([data_dir])

filenames = from_dir(dir_in, re.compile(r'.*\.gbk'))

for filename in filenames:
    rec_name = filename[:filename.find(".gbk")]
    print '.',

    # load data
    record = load_genbank(dir_in+"/"+filename)

    # scan annotations
    for feat in record.features:
        if feat.type == feat_type:
            try:
                if feat_name in feat.qualifiers[feat_tag]:
                    print '\nfound', feat_name, 'in', rec_name
                    # extract sequence
                    new_rec = feat.extract(record)
                    new_rec.id = rec_name+'_'+feat_name
                    new_rec.description = "Extracted from "+new_rec.description
                    records.append(new_rec)

            except KeyError:
                pass

print ''
write_fasta(main_out, records)
示例#13
0
            try:
                records = [load_fasta(origin_file)]
            except IOError:
                print "failed to load file"
                break

        elif genome['input'] == 'mfas':
            try:
                records = load_multifasta(origin_file)
            except IOError:
                print "failed to load file"
                break

        else:
            print "input not recognized"
            break

        for record in records:
            try:
                write_fasta(destin_dir+record.id+".fas", record)
            except Exception:
                print "failed to write contig file"
                break
            else:
                print record.id,

        print "OK"
        break


示例#14
0
                feat for feat in record.features if feat.type == feat_type
            ]
            feat_cnt = 0

            # cycle through selected features
            for feat in select:

                feat_cnt += 1
                rec = feat.extract(record)
                rec.description = genome['name'] + '_' + feat_type + '_' + str(
                    feat_cnt)

                # initialize or update blast DB
                if init_DB:
                    ref_records = [value[0] for value in symbolDB.values()]
                    write_fasta(db_file, ref_records)
                    try:
                        make_blastDB(db_path, db_file, 'nucl')
                    except Exception:
                        print "failed to make blast DB"
                        exit()
                    init_DB = False

                # first go: add all features as new symbols
                if new_DB:
                    sym_cnt += 1
                    symbol = 'N' + str(sym_cnt)
                    rec.id = symbol
                    symbolDB[symbol] = [rec]
                    g_vector.append(symbol)
示例#15
0
<<<<<<< HEAD
from genomes import all as genomes
from libs.common import load_multifasta, write_fasta

for genome in genomes:
    print genome['file']
    file_path = "data/genomes/"+genome['file']
    outfile_path = "data/renamed/"+genome['file']
    contigs = load_multifasta(file_path)
    renamed = []
    counter = 1
    for contig in contigs:
        contig.id = genome['name']+"_"+str(counter)
        contig_path = "data/contigs/"+contig.id+".fas"
        write_fasta(contig_path, contig)
        renamed.append(contig)
        counter +=1
    write_fasta(outfile_path, renamed)
=======
from sys import argv
from libs.common import load_multifasta, write_fasta, ensure_dir

from genomes import all as genome_list

origin_dir = "data/"+argv[1]+"/"
destin_dir = "data/"+argv[2]+"/"

ensure_dir([destin_dir])

for genome in genome_list:
示例#16
0
## script to combine several fasta files into a single one

import re
from sys import argv
from libs.common import from_dir, load_fasta, load_genbank, write_fasta

origin_dir = "data/" + argv[1]
destin_file = origin_dir + "/" + argv[2] + ".fas"
file_ext = argv[3]

filenames = from_dir(origin_dir, re.compile(r'.*\.' + file_ext))

records = []

for filename in filenames:
    # load record
    if file_ext == 'fas':
        records.append(load_fasta(origin_dir + "/" + filename))
    elif file_ext == 'gbk':
        records.append(load_genbank(origin_dir + "/" + filename))

    print filename

write_fasta(destin_file, records)
示例#17
0
# script to translate sequences in multifasta files into proteins

from sys import argv
from libs.common import load_multifasta, write_fasta
from Bio.SeqRecord import SeqRecord

# Build the input path from the two command-line arguments; the output
# path is the input path minus its last four characters plus "_aa.fas".
origin_dir = "data/" + argv[1] + "/"
in_file = origin_dir + argv[2]
outfile = in_file[:-4] + "_aa.fas"


def _translated(nt_records):
    """Yield a SeqRecord holding the translation of each input record."""
    for nt_rec in nt_records:
        yield SeqRecord(id=nt_rec.id, seq=nt_rec.seq.translate())


proteins = list(_translated(load_multifasta(in_file)))

write_fasta(outfile, proteins)
示例#18
0
            record = load_genbank(seq_dir+genome['file'])
            select = [feat for feat in record.features
                      if feat.type == feat_type]
            feat_cnt = 0

            # cycle through selected features
            for feat in select:

                feat_cnt +=1
                rec = feat.extract(record)
                rec.description = genome['name']+'_'+feat_type+'_'+str(feat_cnt)

                # initialize or update blast DB
                if init_DB:
                    ref_records = [value[0] for value in symbolDB.values()]
                    write_fasta(db_file, ref_records)
                    try:
                        make_blastDB(db_path, db_file, 'nucl')
                    except Exception:
                        print "failed to make blast DB"
                        exit()
                    init_DB = False

                # first go: add all features as new symbols
                if new_DB:
                    sym_cnt +=1
                    symbol = 'N'+str(sym_cnt)
                    rec.id = symbol
                    symbolDB[symbol] = [rec]
                    g_vector.append(symbol)