Example #1
def main(args=None):

    info()

    if args is None:
        args = sys.argv[1:]

    if "-h" in args or "--help" in args:
        usage()
        sys.exit(2)

    if "-id" in args:
        run_id = get_arg(args, "-id")
    else:
        run_id = str(int(time.time())) # use timestamp as unique run identifier

    if "-bm" in args:
        blast_mode = get_arg(args, "-bm")
    else:
        blast_mode = 'n' # nucleotide blast by default

    if "-resume" in args:
        step = int(get_arg(args, "-resume"))
        resume = True
    else:
        step = 0
        resume = False

    if "-short" in args:
        limit = 6
    else:
        limit = 100 # unnecessarily high cap

    if "-ctg" in args:
        ctg_subset = get_arg(args, "-ctg")
    else:
        ctg_subset = 'exclude'

    if "-g" in args:
        g_select = get_arg(args, "-g")
    else:
        g_select = None

    start_timestamp = str(datetime.now())

    # ensure existence of all directories
    ensure_dir(fixed_dirs.values())
    run_dirs_go = ["".join([r_root_dir, run_id, "/", rdir])
                   for rdir in run_dirs.values()]
    ensure_dir(run_dirs_go)

    # define pickle paths
    pickle_root = r_root_dir+run_id+"/"+run_dirs['pickles']+run_id
    ref_pickles = pickle_root+"_refs.p"
    genome_pickles = pickle_root+"_genomes.p"
    blast_pickles = pickle_root+"_blast.p"
    match_pickles = pickle_root+"_matches.p"

    # check for pickles
    run_refs = []
    run_gs = []
    run_blast = False
    run_matches = []

    if resume:
        # matches
        if step > 4:
            try:
                run_matches = pickle.load(open(match_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load matches pickle"
                run_matches = []
                step = 4
        # blast
        if step > 3:
            try:
                run_blast = pickle.load(open(blast_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load blast pickle"
                run_blast = False
                step = 3
        # genomes
        if step > 2:
            try:
                run_gs = pickle.load(open(genome_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load genomes pickle"
                run_gs = []
                step = 2
        # references
        if step > 1:
            try:
                run_refs = pickle.load(open(ref_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load refs pickle"
                run_refs = []
                step = 1
    else:
        step = 0     

    ## pipeline

    print "starting pipeline"

    print step, limit

    if resume:
        log_resume_run(run_id, base_root, project_id, start_timestamp, step)

    else:
        print "\n###", step, ". Set up logging & reporting ###\n"
        log_start_run(run_id, base_root, project_id, run_dirs, start_timestamp)
        save_datasumm(run_id, blast_mode, r_root_dir, run_dirs, genomes,
                      references, project_id, project_date, start_timestamp)
        init_reports(run_id, fixed_dirs, ctg_thresholds, start_timestamp)
        step += 1

    while step < limit:

        if step == 1:
            print "\n###", step, ". Prepare references ###\n"
            for ref in references:
                timestamp = str(datetime.now())
                ref_obj = process_ref(ref, ref_annot_flag, r_root_dir,
                                      fixed_dirs, run_dirs, run_id,
                                      timestamp, prot_db_name, project_id)
                run_refs.append(ref_obj)
            if os.path.exists(ref_pickles):
                os.remove(ref_pickles)
            pickle.dump(run_refs, open(ref_pickles, 'wb'))
            step += 1

        elif step == 2:
            print "\n###", step, ". Prepare genomes ###\n"
            for genome in genomes:
                unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds)
                make_genome_DB(genome, fixed_dirs)
            run_gs = add_refs_2g(genomes, references)
            if os.path.exists(genome_pickles):
                os.remove(genome_pickles)
            pickle.dump(run_gs, open(genome_pickles, 'wb'))
            step += 1

        elif step == 3:
            print "\n###", step, ". Blast reference segments against genomes ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                run_blast = basic_batch_blast(run_gs, ref, blast_mode,
                                              r_root_dir, run_dirs,
                                              fixed_dirs, blast_prefs,
                                              run_id, timestamp)  
                if os.path.exists(blast_pickles):
                    os.remove(blast_pickles)
            pickle.dump(run_blast, open(blast_pickles, 'wb'))
            step += 1

        elif step == 4:
            print "\n###", step, ". Collect Blast results ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                ref_hits, ctl_scores = glompX_blast_out(run_gs, ref,
                                                        blast_mode,
                                                        r_root_dir, run_dirs,
                                                        run_id, fixed_dirs,
                                                        blast_dtypes,
                                                        references,
                                                        min_nt_match, 
                                                        min_nt_score,
                                                        min_nt_idp,
                                                        min_aa_match,
                                                        min_aa_score,
                                                        min_aa_idp,
                                                        capture_span,
                                                        timestamp)
                ref_matches = {'ref': ref, 'run': run_id, 'hits': ref_hits,
                               'ctl': ctl_scores}
                run_matches.append(ref_matches)
            if os.path.exists(match_pickles):
                os.remove(match_pickles)
            pickle.dump(run_matches, open(match_pickles, 'wb'))
            step += 1

        elif step == 5:
            print "\n###", step, ". Make match results table & graphs ###\n"
            for match_dict in run_matches:
                timestamp = str(datetime.now())
                matches_table(match_dict, r_root_dir, run_dirs, timestamp)
            step += 1

        ### model evaluation and filtering goes here

        elif step == 6:
            print "\n###", step, ". Annotate matching contigs ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                annot_genome_contigs(ref, prot_db_name, fixed_dirs, r_root_dir,
                                 run_id, run_dirs, genomes, project_id,
                                 timestamp, blast_prefs)
            step += 1

        elif step == 7:
            print "\n###", step, ". Align contigs pairwise to reference ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                align_ctg2ref(ref, run_id, timestamp, r_root_dir, run_dirs, 
                              genomes, mauve_exec, max_size, chop_mode, mtype)
            step += 1

        elif step == 8:
            print "\n###", step, ". Construct backbone-based scaffolds ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                build_scaffolds(ref, r_root_dir, run_dirs, prox_D,
                                separator, genomes, run_id, timestamp,
                                mtype, ctg_subset)
            step += 1

        elif step == 9:
            print "\n###", step, ". Align constructs pairwise to reference ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                align_cstrct2ref(ref, run_id, timestamp, r_root_dir, run_dirs,
                     genomes, max_size, chop_mode, mtype, mauve_exec)
            step += 1

        elif step == 10:
            print "\n###", step, ". Generate maps ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                prep_maps(ref, run_id, timestamp, g_select, r_root_dir,
                          run_dirs, genomes, fixed_dirs, segtype, min_size,
                          fct_flags, fct_colors, idpt)
            step += 1

        else:
            # no handler for this step; stop the pipeline
            break

    stop_timestamp = str(datetime.now())
    log_end_run(run_id, base_root, project_id, stop_timestamp)
    print "\n### Nothing more to do! ###\n"
Example #2
    counter += 1

    while True:
        try:
            fname = EFetcher(rec_id, data_dir)
        except Exception:
            print "Error retrieving record"
            break
        else:
            if rec_id[0:2] == 'NZ':  # disposition for WGS record sets

                print "fetching WGS dataset",

                # create a dedicated directory
                seqdir = data_dir + rec_id + "/"
                ensure_dir([seqdir])

                # open genome record stub to get the contig count
                fname = data_dir + rec_id + ".gbk"
                try:
                    stub = load_genbank(fname)
                except IOError:
                    print "Error loading", fname
                    break

                base_code = stub.annotations['wgs'][0][:10]  # use [:7] if no NZ_ prefix
                ctg_num = int(stub.annotations['wgs'][-1][10:])  # use [7:] if no NZ_ prefix

                records = []

                # fetch contig records
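
The loop above is cut off just before the contig records are retrieved. One plausible continuation, sketched with Biopython's Entrez module instead of the project's EFetcher helper (the accession-padding scheme is an assumption inferred from the 'wgs' annotation format):

from Bio import Entrez, SeqIO

def fetch_wgs_contigs(stub, base_code, ctg_num, email="you@example.org"):
    # walk the accession range named in the record's 'wgs' annotation
    # and fetch each contig record from NCBI
    Entrez.email = email  # NCBI requires a contact address
    pad = len(stub.annotations['wgs'][0]) - len(base_code)
    contigs = []
    for index in range(1, ctg_num + 1):
        accession = base_code + str(index).zfill(pad)
        handle = Entrez.efetch(db="nucleotide", id=accession,
                               rettype="gb", retmode="text")
        contigs.append(SeqIO.read(handle, "genbank"))
        handle.close()
    return contigs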
Example #3
import re
from os import path
from sys import argv
from libs.common import load_multifasta, \
    ensure_dir, from_dir, read_array, blast_dtypes

data_dir = "data/"+argv[1]+"/"
dir_in = data_dir+argv[2]+"/"
infile = data_dir+argv[3] # must be a fasta file with query sequences
file_ext = argv[4]

if len(argv) > 5:
    blast_mode = argv[5]
else:
    blast_mode = 'n' # nucleotide blast by default

blast_out = data_dir+"blast_out/"

ensure_dir([blast_out])

queries = load_multifasta(infile)

filenames = from_dir(dir_in, re.compile(r'.*\.'+file_ext))

for filename in filenames:

    rec_name = filename[:filename.find("."+file_ext)]
    print rec_name,

    genome_path = dir_in+filename
    dbfile_path = "data/blast_db/"+rec_name

    while True:
        if not path.exists(dbfile_path+".nhr"):
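
The snippet ends at the check for dbfile_path + ".nhr", one of the index files that NCBI's makeblastdb writes for a nucleotide database, so the missing branch presumably builds the database. A sketch of that step calling makeblastdb directly (the original most likely wraps this in a libs.common helper such as make_blastDB):

from subprocess import call

def build_nt_blast_db(fasta_path, db_path):
    # create a nucleotide BLAST database; writes db_path + ".nhr/.nin/.nsq"
    call(["makeblastdb", "-in", fasta_path, "-dbtype", "nucl",
          "-out", db_path])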
Example #4
from sys import argv
from libs.common import load_genbank, write_fasta, make_blastDB, \
    local_tblastn_2file, read_array, blast_dtypes, ensure_dir
from libs.tetris import segment_finder
from Bio.SeqRecord import SeqRecord
import cPickle as pickle

from sets.NheA_ctxt_set import test as genomes

data_dir = 'data/'+argv[1]+'/'
seq_dir = data_dir+argv[2]+'/'
out_dir = data_dir+argv[3]+'/'
feat_type = argv[4]
threshold = int(argv[5])
min_com = int(argv[6]) # min number of non-core feats in common within groups

ensure_dir([out_dir])

db_file = out_dir+'ref_DB.fas'
db_path = out_dir+'refs'

core_genome_pickle = out_dir+'core_genome.pik'
cluster_set_file = out_dir+'clusters.py'

new_DB = True
init_DB = False

symbolDB = {}
vectorDB = {}
segmentDB = {}

# vectorDB is a dict that contains genome-keyed dicts,
Example #5
# script to capture sequences from the results of a batch blast

from sys import argv
from Bio.SeqRecord import SeqRecord
from libs.common import read_array, blast_dtypes, load_fasta, write_fasta, ensure_dir

data_dir = "data/" + argv[1] + "/"
main_in = data_dir + argv[2] + "_results.txt"
main_out = data_dir + argv[2] + "_ctxt.fas"
ctx_dir = data_dir + "context/"
capture_span = int(argv[3])

ensure_dir([ctx_dir])

records = []

rec_array = read_array(main_in, blast_dtypes)

descript = "Context of " + argv[2] + " (" + argv[3] + " bp either side)"

for line in rec_array:

    query = line[0]
    subject = line[1]

    print subject

    rev_flag = False
    if line[8] < line[9]:
        q_start, q_stop = line[8] - 1, line[9]
        rev_flag = False
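
The orientation logic is truncated just as rev_flag is set. A hypothetical sketch of the capture step itself: keep the hit plus capture_span bp of flanking sequence on each side, and reverse-complement hits that landed on the minus strand:

from Bio.SeqRecord import SeqRecord

def capture_context(subject_rec, start, stop, span, reverse, description):
    # take the hit region plus `span` bp of context, clamped to the
    # ends of the subject record
    lo = max(start - span, 0)
    hi = min(stop + span, len(subject_rec.seq))
    fragment = subject_rec.seq[lo:hi]
    if reverse:
        # minus-strand hit: report the plus-strand orientation
        fragment = fragment.reverse_complement()
    return SeqRecord(fragment, id=subject_rec.id, description=description)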
Example #6
# script to capture features based on annotation tags

import re
from sys import argv
from libs.common import load_genbank, write_fasta, ensure_dir, from_dir

data_dir = "data/" + argv[1] + "/"
dir_in = "data/" + argv[2] + "/"
feat_type = argv[3]
feat_tag = argv[4]
feat_name = argv[5]

main_out = data_dir + feat_name + "_seqs.fas"

records = []
ensure_dir([data_dir])

filenames = from_dir(dir_in, re.compile(r'.*\.gbk'))

for filename in filenames:
    rec_name = filename[:filename.find(".gbk")]
    print '.',

    # load data
    record = load_genbank(dir_in + "/" + filename)

    # scan annotations
    for feat in record.features:
        if feat.type == feat_type:
            try:
                if feat_name in feat.qualifiers[feat_tag]:
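
The inner loop breaks off inside the try. Given the variables in scope, the body most plausibly extracts the matching feature's sequence; a hypothetical sketch using Biopython's SeqFeature.extract, with the KeyError guard the try block implies:

from Bio.SeqRecord import SeqRecord

def capture_features(record, feat_type, feat_tag, feat_name, rec_name):
    # collect every feature of the requested type whose qualifier
    # tag mentions the requested name
    captured = []
    for feat in record.features:
        if feat.type == feat_type:
            try:
                if feat_name in feat.qualifiers[feat_tag]:
                    feat_seq = feat.extract(record.seq)
                    captured.append(SeqRecord(feat_seq, id=rec_name,
                                              description=feat_name))
            except KeyError:
                pass  # this feature carries no such qualifier tag
    return captured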
Example #7
import re
from os import path
from sys import argv
from libs.common import from_dir, ensure_dir, fas2gbk, gbk2fas, write_genbank, \
    load_genbank, train_prodigal, run_prodigal, load_multifasta
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Alphabet import generic_dna

origin_dir = "data/"+argv[1]+"/"
file_ext = argv[2]

annot_gbk_dir = origin_dir+"annot_gbk/"
annot_aa_dir = origin_dir+"annot_aa/"
trn_file = origin_dir+"prodigal.trn"

ensure_dir([annot_gbk_dir, annot_aa_dir])

filenames = from_dir(origin_dir, re.compile(r'.*\.'+file_ext+'.*'))

for filename in filenames:
    rec_name = filename[:filename.find("."+file_ext)]

    print rec_name, "...",

    # load data
    if file_ext == 'fas':
        fas_file = origin_dir+"/"+filename
        gbk_file = fas2gbk(fas_file)
        record = load_genbank(gbk_file)
    else:
        gbk_file = origin_dir+"/"+filename
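
train_prodigal and run_prodigal are project helpers; their command-line equivalents are straightforward. A sketch of the underlying Prodigal invocations (hypothetical wrappers; -i, -t, -o and -a are standard Prodigal flags):

from subprocess import call

def train_prodigal_cli(ref_fasta, trn_file):
    # first pass: let Prodigal fit gene models and write the training file
    call(["prodigal", "-i", ref_fasta, "-t", trn_file])

def run_prodigal_cli(genome_fasta, trn_file, out_gbk, out_faa):
    # second pass: predict genes using the trained model
    call(["prodigal", "-i", genome_fasta, "-t", trn_file,
          "-o", out_gbk, "-a", out_faa])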
Example #8
if len(argv) > 1 and argv[1] == '-h':
    print "Basic usage: \n", \
          "$ python main_script.py [step#]\n"
    exit()

if len(argv) < 2:
    step = 0
else:
    step = int(argv[1])

if step == 0:
    ### STEP 0: Ensure that all base directories exist ###
    print "\n###", step, ". Setting up the work environment ###"
    for dir_name in directories.keys():
        ensure_dir(directories[dir_name])
    step += 1

if step == 1:
    ### STEP 1: Trim & bin reads (based on results of FastQC) ###
    print "\n###", step, ". Trim & bin, then split for batching ###"
    for dataset in datasets:
        dataset['trim_files'] = []
        bin_counts = trim_illumina(dataset)
        print bin_counts # TO LOG
        dataset['mft_files'] = []
        for trim_file in dataset['trim_files']:
            mft_count = simple_q2a(dataset, trim_file)
            #print mft_count # TO LOG
        for mft_file in dataset['mft_files']:
            print mft_file
Example #9
file_ext = argv[3]

if len(argv) < 5:
    trim_ids = ''
else:
    trim_ids = argv[4]

blast_dir = origin_dir+"blast/"
hits_dir = origin_dir+"hits/"
remote_prot_db = "nr"

annot_gbk_dir = origin_dir+"annot_gbk/"
annot_aa_dir = origin_dir+"annot_aa/"
trn_file = origin_dir+"prodigal.trn"

ensure_dir([annot_gbk_dir, annot_aa_dir, blast_dir, hits_dir])

filenames = from_dir(seq_dir, re.compile(r'.*\.'+file_ext+'.*'))

for filename in filenames:
    rec_name = filename[:filename.find(trim_ids+"."+file_ext)]

    print rec_name, "..."

    # load data
    if file_ext == 'fas':
        fas_file = seq_dir+"/"+filename
        gbk_file = fas2gbk(fas_file)
        record = load_genbank(gbk_file)
    else:
        gbk_file = seq_dir+"/"+filename
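
The snippet sets remote_prot_db = "nr" but is truncated before the remote search runs. A hypothetical sketch of how a remote protein BLAST against nr can be issued with Biopython:

from Bio.Blast import NCBIWWW, NCBIXML

def remote_blastp(protein_seq, database="nr"):
    # submit one protein query to NCBI's online BLAST service
    # and parse the XML reply
    handle = NCBIWWW.qblast("blastp", database, protein_seq)
    result = NCBIXML.read(handle)
    handle.close()
    return result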
Example #10
## script to strip trailing tails from genome file names

import re
from sys import argv
from libs.common import from_dir, ensure_dir
from shutil import copyfile

origin_dir = "data/"+argv[1]+"/"
destin_dir = origin_dir+argv[2]+"/"
file_ext = argv[3]
tail = argv[4]

ensure_dir([destin_dir])

filenames = from_dir(origin_dir, re.compile(r'.*\.'+file_ext))

counter = 0

for filename in filenames:
    # identify strain name
    pattern = re.compile(r'^(.*)' + tail + r'\.' + file_ext + '$')
    capture = re.match(pattern, filename)
    # substitute new name
    if capture:
        counter += 1
        new_filename = capture.group(1)+".fas"
        # copy file
        copyfile(origin_dir+filename, destin_dir+new_filename)
        print capture.group(1)

Example #11
def main(args=None):

    info()

    if args is None:
        args = sys.argv[1:]

    if "-h" in args or "--help" in args:
        usage()
        sys.exit(2)

    if "-id" in args:
        run_id = get_arg(args, "-id")
    else:
        run_id = str(int(
            time.time()))  # use timestamp as unique run identifier

    if "-bm" in args:
        blast_mode = get_arg(args, "-bm")
    else:
        blast_mode = 'n'  # nucleotide blast by default

    if "-resume" in args:
        step = int(get_arg(args, "-resume"))
        resume = True
    else:
        step = 0
        resume = False

    if "-short" in args:
        limit = 6
    else:
        limit = 100  # unnecessarily high cap

    if "-filter" in args:
        threshold = int(get_arg(args, "-filter"))
        resume = True
        step = 6
        limit = 7
    else:
        threshold = 5  # reduce for small references

    if "-ctg" in args:
        ctg_subset = get_arg(args, "-ctg")
    else:
        ctg_subset = 'exclude'

    if "-g" in args:
        g_select = get_arg(args, "-g")
    else:
        g_select = None

    start_timestamp = str(datetime.now())

    # ensure existence of all directories
    ensure_dir(fixed_dirs.values())
    run_dirs_go = [
        "".join([r_root_dir, run_id, "/", rdir]) for rdir in run_dirs.values()
    ]
    ensure_dir(run_dirs_go)

    # define pickle paths
    pickle_root = r_root_dir + run_id + "/" + run_dirs['pickles'] + run_id
    ref_pickles = pickle_root + "_refs.p"
    genome_pickles = pickle_root + "_genomes.p"
    blast_pickles = pickle_root + "_blast.p"
    match_pickles = pickle_root + "_matches.p"
    norm_pickles = pickle_root + "_norm.p"

    # check for pickles
    run_refs = []
    run_gs = []
    run_blast = False
    run_matches = []
    run_norm_matches = {}

    if resume:
        # normalized matches
        if step > 5:
            try:
                run_norm_matches = pickle.load(open(norm_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load norm pickle"
                run_norm_matches = {}
                step = 5
        # matches
        if step > 4:
            try:
                run_matches = pickle.load(open(match_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load matches pickle"
                run_matches = []
                step = 4
        # blast
        if step > 3:
            try:
                run_blast = pickle.load(open(blast_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load blast pickle"
                run_blast = False
                step = 3
        # genomes
        if step > 2:
            try:
                run_gs = pickle.load(open(genome_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load genomes pickle"
                run_gs = []
                step = 2
        # references
        if step > 1:
            try:
                run_refs = pickle.load(open(ref_pickles, 'rb'))
            except IOError:
                print "WARNING: Could not load refs pickle"
                run_refs = []
                step = 1
    else:
        step = 0

    ## pipeline

    print "starting pipeline"

    print step, limit

    if resume:
        log_resume_run(run_id, base_root, project_id, start_timestamp, step)

    else:
        print "\n###", step, ". Set up logging & reporting ###\n"
        log_start_run(run_id, base_root, project_id, run_dirs, start_timestamp)
        save_datasumm(run_id, blast_mode, r_root_dir, run_dirs, genomes,
                      references, project_id, project_date, start_timestamp)
        init_reports(run_id, fixed_dirs, ctg_thresholds, start_timestamp)
        step += 1

    while step < limit:

        if step == 1:
            print "\n###", step, ". Prepare references ###\n"
            for ref in references:
                timestamp = str(datetime.now())
                ref_obj = process_ref(ref, ref_annot_flag, r_root_dir,
                                      fixed_dirs, run_dirs, run_id, timestamp,
                                      prot_db_name, project_id)
                run_refs.append(ref_obj)
            if os.path.exists(ref_pickles):
                os.remove(ref_pickles)
            pickle.dump(run_refs, open(ref_pickles, 'wb'))
            step += 1

        elif step == 2:
            print "\n###", step, ". Prepare genomes ###\n"
            for genome in genomes:
                unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds)
                make_genome_DB(genome, fixed_dirs)
            run_gs = add_refs_2g(genomes, references)
            if os.path.exists(genome_pickles):
                os.remove(genome_pickles)
            pickle.dump(run_gs, open(genome_pickles, 'wb'))
            step += 1

        elif step == 3:
            print "\n###", step, ". Blast reference segments against genomes ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                run_blast = basic_batch_blast(run_gs, ref, blast_mode,
                                              r_root_dir, run_dirs, fixed_dirs,
                                              blast_prefs, run_id, timestamp)
                if os.path.exists(blast_pickles):
                    os.remove(blast_pickles)
            pickle.dump(run_blast, open(blast_pickles, 'wb'))
            step += 1

        elif step == 4:
            print "\n###", step, ". Collect Blast results ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                ref_hits, ctl_scores = glompX_blast_out(
                    run_gs, ref, blast_mode, r_root_dir, run_dirs, run_id,
                    fixed_dirs, blast_dtypes, references, min_nt_match,
                    min_nt_score, min_nt_idp, min_aa_match, min_aa_score,
                    min_aa_idp, capture_span, timestamp)
                ref_matches = {
                    'ref': ref,
                    'run': run_id,
                    'hits': ref_hits,
                    'ctl': ctl_scores
                }
                run_matches.append(ref_matches)
            if os.path.exists(match_pickles):
                os.remove(match_pickles)
            pickle.dump(run_matches, open(match_pickles, 'wb'))
            step += 1

        elif step == 5:
            print "\n###", step, ". Make match results table & graphs ###\n"
            for ref_matches in run_matches:
                timestamp = str(datetime.now())
                ref_norm_matches = matches_table(ref_matches, r_root_dir,
                                                 run_dirs, timestamp)
                run_norm_matches[ref_matches['ref'].name] = ref_norm_matches
            if os.path.exists(norm_pickles):
                os.remove(norm_pickles)
            pickle.dump(run_norm_matches, open(norm_pickles, 'wb'))
            step += 1

        elif step == 6:
            print "\n###", step, ". Filter matching contigs ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                filter_contigs(ref, run_id, genomes,
                               run_norm_matches[ref.name], chop_size,
                               threshold, r_root_dir, run_dirs, fixed_dirs,
                               timestamp)
            step += 1

        elif step == 7:
            print "\n###", step, ". Annotate matching contigs ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                annot_genome_contigs(ref, prot_db_name, fixed_dirs, r_root_dir,
                                     run_id, run_dirs, genomes, project_id,
                                     timestamp, blast_prefs)
            step += 1

        elif step == 8:
            print "\n###", step, ". Align contigs pairwise to reference ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                align_ctg2ref(ref, run_id, timestamp, r_root_dir, run_dirs,
                              genomes, mauve_exec, max_size, chop_mode, mtype)
            step += 1

        elif step == 9:
            print "\n###", step, ". Construct backbone-based scaffolds ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                build_scaffolds(ref, r_root_dir, run_dirs, prox_D, separator,
                                genomes, run_id, timestamp, mtype, ctg_subset)
            step += 1

        elif step == 10:
            print "\n###", step, ". Align constructs pairwise to reference ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                align_cstrct2ref(ref, run_id, timestamp, r_root_dir, run_dirs,
                                 genomes, max_size, chop_mode, mtype,
                                 mauve_exec)
            step += 1

        elif step == 11:
            print "\n###", step, ". Generate maps ###\n"
            for ref in run_refs:
                timestamp = str(datetime.now())
                prep_maps(ref, run_id, timestamp, g_select, r_root_dir,
                          run_dirs, genomes, fixed_dirs, segtype, min_size,
                          fct_flags, fct_colors, idpt)
            step += 1

        else:
            # no handler for this step; stop the pipeline
            break

    stop_timestamp = str(datetime.now())
    log_end_run(run_id, base_root, project_id, stop_timestamp)
    print "\n### Nothing more to do! ###\n"