예제 #1
0
set_lines = ["all = ["]

filenames = from_dir(seq_dir, re.compile(r'.*\..*'))

counter = 1

for filename in filenames:

    print filename,

    while True:

        if filename.find(".gbk") > 0:
            # process genbank
            try:
                record = load_genbank(seq_dir+filename)
            except IOError:
                print "failed to load Genbank file"
                break
            except Exception:
                print "failed to handle Genbank file"
                break
            else:
                print "...",
                seq_format = 'gbk'

        elif filename.find(".fas") > 0:
            # process fasta (for mfas, load first record)
            try:
                record = load_fasta(seq_dir+filename)
            except IOError:
예제 #2
0
        except Exception:
            print "Error retrieving record"
            break
        else:
            if rec_id[0:2] == 'NZ':  # disposition for WGS record sets

                print "fetching WGS dataset",

                # create a dedicated directory
                seqdir = data_dir + rec_id + "/"
                ensure_dir([seqdir])

                # open genome record stub to get the contig count
                fname = data_dir + rec_id + ".gbk"
                try:
                    stub = load_genbank(fname)
                except IOError:
                    print "Error loading", fname
                    break

                base_code = stub.annotations['wgs'][0][:10]  # 7 if not NZ_
                ctg_num = int(stub.annotations['wgs'][-1][10:])  # 7

                records = []

                # fetch contig records
                ctg_count = 0
                while ctg_count < ctg_num:
                    # TODO: better formatting
                    ctg_count += 1
                    if ctg_count < 10:
예제 #3
0
filenames = from_dir(dir_in, re.compile(r'.*\.'+file_ext))

for filename in filenames:

    rec_name = filename[:filename.find("."+file_ext)]
    print rec_name,

    genome_path = dir_in+filename
    dbfile_path = "data/blast_db/"+rec_name

    while True:
        if not path.exists(dbfile_path+".nhr"):
            if file_ext == 'gbk':
                try:
                    print "converting,",
                    record = load_genbank(genome_path)
                except IOError:
                    print "failed to load Genbank file"
                    break
                else:
                    try:
                        genome_path = dir_in+rec_name+".fas"
                        write_fasta(genome_path, record)
                    except Exception:
                        print "failed to write Fasta file"
                        break
            try:
                print "making a DB,",
                make_blastDB(dbfile_path, genome_path, 'nucl')
            except IOError:
                print "failed to make DB"
예제 #4
0
        except Exception:
            print "Error retrieving record"
            break
        else:
            if rec_id[0:2] == 'NZ': # disposition for WGS record sets

                print "fetching WGS dataset",

                # create a dedicated directory
                seqdir = data_dir+rec_id+"/"
                ensure_dir([seqdir])

                # open genome record stub to get the contig count
                fname = data_dir+rec_id+".gbk"
                try:
                    stub = load_genbank(fname)
                except IOError:
                    print "Error loading", fname
                    break

                base_code = stub.annotations['wgs'][0][:10] # 7 if not NZ_
                ctg_num = int(stub.annotations['wgs'][-1][10:]) # 7

                records = []

                # fetch contig records
                ctg_count = 0
                while ctg_count < ctg_num:
                    # TODO: better formatting
                    ctg_count += 1
                    if ctg_count < 10:
예제 #5
0
filenames = from_dir(dir_in, re.compile(r'.*\.' + file_ext))

for filename in filenames:

    rec_name = filename[:filename.find("." + file_ext)]
    print rec_name,

    genome_path = dir_in + filename
    dbfile_path = "data/blast_db/" + rec_name

    while True:
        if not path.exists(dbfile_path + ".nhr"):
            if file_ext == 'gbk':
                try:
                    print "converting,",
                    record = load_genbank(genome_path)
                except IOError:
                    print "failed to load Genbank file"
                    break
                else:
                    try:
                        genome_path = dir_in + rec_name + ".fas"
                        write_fasta(genome_path, record)
                    except Exception:
                        print "failed to write Fasta file"
                        break
            try:
                print "making a DB,",
                make_blastDB(dbfile_path, genome_path, 'nucl')
            except IOError:
                print "failed to make DB"
예제 #6
0
## script to combine several fasta files into a single one

import re
from sys import argv
from libs.common import from_dir, load_fasta, load_genbank, write_fasta

origin_dir = "data/" + argv[1]
destin_file = origin_dir + "/" + argv[2] + ".fas"
file_ext = argv[3]

filenames = from_dir(origin_dir, re.compile(r'.*\.' + file_ext))

records = []

for filename in filenames:
    # load record
    if file_ext == 'fas':
        records.append(load_fasta(origin_dir + "/" + filename))
    elif file_ext == 'gbk':
        records.append(load_genbank(origin_dir + "/" + filename))

    print filename

write_fasta(destin_file, records)
예제 #7
0
set_lines = ["all = ["]

filenames = from_dir(seq_dir, re.compile(r'.*\..*'))

counter = 1

for filename in filenames:

    print filename,

    while True:

        if filename.find(".gbk") > 0:
            # process genbank
            try:
                record = load_genbank(seq_dir + filename)
            except IOError:
                print "failed to load Genbank file"
                break
            except Exception:
                print "failed to handle Genbank file"
                break
            else:
                print "...",
                seq_format = 'gbk'

        elif filename.find(".fas") > 0:
            # process fasta (for mfas, load first record)
            try:
                record = load_fasta(seq_dir + filename)
            except IOError:
예제 #8
0
trn_file = origin_dir+"prodigal.trn"

ensure_dir([annot_gbk_dir, annot_aa_dir])

filenames = from_dir(origin_dir, re.compile(r'.*\.'+file_ext+'.*'))

for filename in filenames:
    rec_name = filename[:filename.find("."+file_ext)]

    print rec_name, "...",

    # load data
    if file_ext == 'fas':
        fas_file = origin_dir+"/"+filename
        gbk_file = fas2gbk(fas_file)
        record = load_genbank(gbk_file)
    else:
        gbk_file = origin_dir+"/"+filename
        fas_file = gbk2fas(gbk_file)
        record = load_genbank(gbk_file)

    # run prediction
    annot_aa = annot_aa_dir+rec_name+"_ann.fas"
    annot_gbk = annot_gbk_dir+rec_name+"_ann.gbk"
    if not path.exists(trn_file):
        train_prodigal(fas_file, trn_file, "-q")
    if not path.exists(annot_aa):
        run_prodigal(fas_file, annot_gbk, annot_aa, trn_file, "-q")

    # collect orfs
    record.features = []
예제 #9
0
## script to combine several fasta files into a single one

import re
from sys import argv
from libs.common import from_dir, load_fasta, load_genbank, write_fasta

origin_dir = "data/"+argv[1]
destin_file = origin_dir+"/"+argv[2]+".fas"
file_ext = argv[3]

filenames = from_dir(origin_dir, re.compile(r'.*\.'+file_ext))

records = []

for filename in filenames:
    # load record
    if file_ext == 'fas':
        records.append(load_fasta(origin_dir+"/"+filename))
    elif file_ext == 'gbk':
        records.append(load_genbank(origin_dir+"/"+filename))

    print filename

write_fasta(destin_file, records)
예제 #10
0
feat_tag = argv[4]
feat_name = argv[5]

main_out = data_dir + feat_name + "_seqs.fas"

records = []
ensure_dir([data_dir])

filenames = from_dir(dir_in, re.compile(r'.*\.gbk'))

for filename in filenames:
    rec_name = filename[:filename.find(".gbk")]
    print '.',

    # load data
    record = load_genbank(dir_in + "/" + filename)

    # scan annotations
    for feat in record.features:
        if feat.type == feat_type:
            try:
                if feat_name in feat.qualifiers[feat_tag]:
                    print '\nfound', feat_name, 'in', rec_name
                    # extract sequence
                    new_rec = feat.extract(record)
                    new_rec.id = rec_name + '_' + feat_name
                    new_rec.description = "Extracted from " + new_rec.description
                    records.append(new_rec)

            except KeyError:
                pass
예제 #11
0
feat_tag = argv[4]
feat_name = argv[5]

main_out = data_dir+feat_name+"_seqs.fas"

records = []
ensure_dir([data_dir])

filenames = from_dir(dir_in, re.compile(r'.*\.gbk'))

for filename in filenames:
    rec_name = filename[:filename.find(".gbk")]
    print '.',

    # load data
    record = load_genbank(dir_in+"/"+filename)

    # scan annotations
    for feat in record.features:
        if feat.type == feat_type:
            try:
                if feat_name in feat.qualifiers[feat_tag]:
                    print '\nfound', feat_name, 'in', rec_name
                    # extract sequence
                    new_rec = feat.extract(record)
                    new_rec.id = rec_name+'_'+feat_name
                    new_rec.description = "Extracted from "+new_rec.description
                    records.append(new_rec)

            except KeyError:
                pass
예제 #12
0
for genome in genome_list:

    print genome['name'],

    origin_file = origin_dir+genome['file']

    while True:

        if genome['input'] == 'cgbk':
            print "ignoring cgbk file"
            break

        elif genome['input'] == 'gbk':
            try:
                records = [load_genbank(origin_file)]
            except IOError:
                print "failed to load file"
                break

        elif genome['input'] == 'fas':
            try:
                records = [load_fasta(origin_file)]
            except IOError:
                print "failed to load file"
                break

        elif genome['input'] == 'mfas':
            try:
                records = load_multifasta(origin_file)
            except IOError:
예제 #13
0
    for genome in genomes:

        g_vector = []

        print genome['name'],

        while True:

            try:
                assert genome['input'] == 'gbk'
            except ValueError:
                print "bad format (skipping)"
                break

            # load genome file to extract features (to proteins in mfas file)
            record = load_genbank(seq_dir + genome['file'])
            select = [
                feat for feat in record.features if feat.type == feat_type
            ]
            feat_cnt = 0

            # cycle through selected features
            for feat in select:

                feat_cnt += 1
                rec = feat.extract(record)
                rec.description = genome['name'] + '_' + feat_type + '_' + str(
                    feat_cnt)

                # initialize or update blast DB
                if init_DB:
예제 #14
0
    for genome in genomes:

        g_vector = []

        print genome['name'],

        while True:

            try:
                assert genome['input'] == 'gbk'
            except ValueError:
                print "bad format (skipping)"
                break

            # load genome file to extract features (to proteins in mfas file)
            record = load_genbank(seq_dir+genome['file'])
            select = [feat for feat in record.features
                      if feat.type == feat_type]
            feat_cnt = 0

            # cycle through selected features
            for feat in select:

                feat_cnt +=1
                rec = feat.extract(record)
                rec.description = genome['name']+'_'+feat_type+'_'+str(feat_cnt)

                # initialize or update blast DB
                if init_DB:
                    ref_records = [value[0] for value in symbolDB.values()]
                    write_fasta(db_file, ref_records)