Пример #1
0
def fas2gbk(fas_file):
    """Convert a FastA file to Genbank format.

    The record is read with load_fasta, its sequence alphabet is set to
    generic_dna and it is written out with write_genbank.

    Args:
        fas_file: path of the FastA file to convert.

    Returns:
        Path of the Genbank file that was written: fas_file with
        everything from its last '.fas' onwards replaced by '.gbk', or
        fas_file + '.gbk' when the name contains no '.fas' at all.
    """
    record = load_fasta(fas_file)
    # rfind rather than find: only the extension must be cut, even when a
    # directory in the path also contains '.fas'. A name without '.fas'
    # yields -1, which must not be used as a slice end (it would drop the
    # last character of the name).
    ext_pos = fas_file.rfind('.fas')
    if ext_pos == -1:
        stem = fas_file
    else:
        stem = fas_file[:ext_pos]
    gbk_file = stem+'.gbk'
    record.seq.alphabet = generic_dna
    write_genbank(gbk_file, record)
    return gbk_file
Пример #2
0
    def __init__(self, genome, seq_dir):
        """Set up a genome entry and make sure both sequence formats exist.

        Whichever of the FastA / Genbank files is given, the other one is
        produced by conversion (fas2gbk or gbk2fas), so that self.fas and
        self.gbk are both set afterwards.

        Args:
            genome: dict with the keys 'name', 'offset', 'nudge', 'input'
                ('fas' or 'gbk') and 'file' (appended to seq_dir).
            seq_dir: path prefix that is prepended to genome['file'].

        Raises:
            SystemExit: with an error message (exit status 1) when
                genome['input'] is neither 'fas' nor 'gbk'.
        """
        self.name = genome['name']
        self.fas = None
        self.gbk = None
        self.offset = genome['offset']
        self.nudge = genome['nudge']+1
        self.invert = False

        in_format = genome['input']
        if in_format not in ('fas', 'gbk'):
            # sys.exit(message) writes the message to stderr and exits
            # with status 1, so calling scripts can detect the failure
            sys.exit("ERROR in input format: FastA or Genbank required")
        seq_file = seq_dir+genome['file']
        if in_format == 'fas':
            self.fas = seq_file
            self.gbk = fas2gbk(seq_file)
        else:
            self.gbk = seq_file
            self.fas = gbk2fas(seq_file)

        # sequence length is always taken from the FastA version
        self.len = len(load_fasta(self.fas).seq)
Пример #3
0
def annot_ctg(g_file, ctg_fas, annot_gbk, annot_aa, trn_file, prot_db,
              blast_out, l_tag_base, blast_prefs):
    """Do functional annotation of contig from Fasta file, return record.

    Gene prediction training, gene prediction and the protein Blast are
    each skipped when their output file (trn_file, annot_aa, blast_out)
    already exists.

    Args:
        g_file: sequence file handed to train_prodigal.
        ctg_fas: FastA file of the contig; input of run_prodigal and the
            source of the returned record.
        annot_gbk: Genbank path handed to run_prodigal.
        annot_aa: protein multi-FastA path handed to run_prodigal and
            read back to build the features.
        trn_file: Prodigal training file path.
        prot_db: protein database handed to local_blastp_2file.
        blast_out: Blast output path, read by collect_cogs.
        l_tag_base: prefix of the locus tags ('<l_tag_base>_<n>').
        blast_prefs: preferences handed to local_blastp_2file.

    Returns:
        The record loaded from ctg_fas, with one CDS SeqFeature per
        protein found in annot_aa.
    """
    # gene prediction
    if not path.exists(trn_file):
        train_prodigal(g_file, trn_file, "-q")
    if not path.exists(annot_aa):
        run_prodigal(ctg_fas, annot_gbk, annot_aa, trn_file, "-q")
    # blast the amino acids against COG
    if not path.exists(blast_out):
        local_blastp_2file(annot_aa, prot_db, blast_out, blast_prefs)
    # collect best hits
    rec_cogs = collect_cogs(blast_out)
    # consolidate annotated genbank file
    record = load_fasta(ctg_fas)
    record.features = []
    # feature details are parsed from the description line because
    # prodigal output fails to load as valid genbank; the pattern is the
    # same for every protein so it is compiled once
    defline_pattern = re.compile(
        r'.+#\s(\d+)\s#\s(\d+)\s#\s(\S*1)\s#\sID.+')
    for counter, aa_rec in enumerate(load_multifasta(annot_aa), 1):
        # Blast queries are looked up by their running number
        annotation = rec_cogs['Query_'+str(counter)]
        defline = aa_rec.description
        match = defline_pattern.match(defline)
        start_pos = int(match.group(1))
        end_pos = int(match.group(2))
        strand_pos = int(match.group(3))
        # NOTE(review): coordinates are used exactly as written in the
        # description line; if they are 1-based inclusive, the
        # FeatureLocation start is off by one - confirm
        feat_loc = FeatureLocation(start_pos, end_pos)
        l_tag = l_tag_base+"_"+str(counter)
        # consolidation feature annotations
        quals = {'note': defline, 'locus_tag': l_tag,
                 'fct': annotation, 'translation': aa_rec.seq}
        feature = SeqFeature(location=feat_loc,
                             strand=strand_pos,
                             id='cds_'+str(counter),
                             type='CDS',
                             qualifiers=quals)
        record.features.append(feature)
    return record
Пример #4
0
    def __init__(self, genome, seq_dir):
        """Set up a genome entry and make sure both sequence formats exist.

        The sequence file is looked up in seq_dir + genome['cat'] + '/'.
        Whichever of the FastA / Genbank files is given, the other one is
        produced by conversion (fas2gbk or gbk2fas), so that self.fas and
        self.gbk are both set afterwards.

        Args:
            genome: dict with the keys 'name', 'offset', 'nudge', 'cat',
                'input' ('fas' or 'gbk') and 'file'.
            seq_dir: path prefix of the sequence directories.

        Raises:
            SystemExit: with an error message (exit status 1) when
                genome['input'] is neither 'fas' nor 'gbk'.
        """
        self.name = genome['name']
        self.fas = None
        self.gbk = None
        self.offset = genome['offset']
        self.nudge = genome['nudge'] + 1
        self.invert = False

        self.dir = seq_dir + genome['cat'] + '/'

        in_format = genome['input']
        if in_format not in ('fas', 'gbk'):
            # sys.exit(message) writes the message to stderr and exits
            # with status 1, so calling scripts can detect the failure
            sys.exit("ERROR in input format: FastA or Genbank required")
        seq_file = self.dir + genome['file']
        if in_format == 'fas':
            self.fas = seq_file
            self.gbk = fas2gbk(seq_file)
        else:
            self.gbk = seq_file
            self.fas = gbk2fas(seq_file)

        # sequence length is always taken from the FastA version
        self.len = len(load_fasta(self.fas).seq)
Пример #5
0
def basic_batch_blast(genomes, run_ref, blast_mode, r_root_dir, run_dirs,
                      fixed_dirs, blast_prefs, run_id, timestamp):
    """Send batch jobs to Blast. Muxes to multiple reference DBs."""
    # load inputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    in_root = run_root+run_dirs['ref_seg_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Blast segs to genomes @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # do blast
    for seg in run_ref.segs:
        input_file = in_root+ref_n+"_"+seg['name']+".fas"
        # translate if required
        if blast_mode == 'tn':
            record = load_fasta(input_file)
            record.seq = record.seq.translate()
            input_file = in_root+ref_n+"_"+seg['name']+"_aa.fas" # substitute
            write_fasta(input_file, record)
        out_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg['name']+"/"
        ensure_dir([out_dir])
        print "\t", seg['name'],
        for genome in genomes:
            g_name = genome['name']
            db_path = fixed_dirs['blast_db_dir']+g_name
            outfile = out_dir+g_name+"_out.txt"
            print ".",
            if blast_mode == 'n':
                local_blastn_2file(input_file, db_path, outfile, blast_prefs)
            elif blast_mode == 'tx':
                local_tblastx_2file(input_file, db_path, outfile, blast_prefs)
            elif blast_mode == 'tn':
                local_tblastn_2file(input_file, db_path, outfile, blast_prefs)
        print ""
    run_ref.log("All OK")
    return "OK"
Пример #6
0
def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs,
                     run_id, fixed_dirs, blast_dtypes, references,
                     min_nt_match, min_nt_score, min_nt_idp, min_aa_match,
                     min_aa_score, min_aa_idp, capture_span, timestamp):
    """Collect Blast results and extract match contigs."""
    # load inputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    match_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    capture_root = run_root+run_dirs['capture_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # collect results
    ref_hits = {}
    control_scores = []
    run_ref.log("Segs/Gs\t")
    run_ref.log("\t".join([genome['name'] for genome in genomes]))
    for seg in run_ref.segs:
        seg_n = seg['name']
        print "\t", seg_n, "...",
        run_ref.log("".join(["\n", seg_n]))
        blast_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg_n+"/"
        capture_dir = capture_root+"/"+seg_n+"/"
        ensure_dir([blast_dir, capture_dir])
        ref_flag = True
        for genome in genomes:
            g_name = genome['name']
            print "|",
            # process
            if g_name not in ref_hits.keys():
                ref_hits[g_name] = {}
            matches_dir = match_root+g_name+"/"
            ensure_dir([matches_dir])
            blast_infile = blast_dir+g_name+"_out.txt"
            genome_ctg_dir = fixed_dirs['fas_contigs_dir']+g_name+"/"
            rec_array = read_array(blast_infile, blast_dtypes)
            if len(rec_array) > 0:  # take qualified hits
                p_cnt = 0
                n_cnt = 0
                if g_name in [ref['name'] for ref in references]:
                    copyfile(genome_ctg_dir+g_name+"_1.fas",
                             matches_dir+g_name+".fas")
                    if ref_flag:
                        # positive control TODO: better solution
                        control_scores.append(rec_array[0][11])
                        ref_flag = False
                for line in rec_array:
                    idp = line[2]
                    q_start, q_stop = line[8], line[9]
                    score = line[11]
                    length = abs(q_stop-q_start)
                    # check the blast mode to use the right thresholds
                    if blast_mode == 'n' or blast_mode == 'tx':
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    elif blast_mode == 'tn':
                        min_match = min_aa_match
                        min_score = min_aa_score
                        min_idp = min_aa_idp
                    else: # default to nucleotide mode
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    if length>min_match and score>min_score and idp>min_idp:
                        print "+",
                        p_cnt +=1
                        contig_id = line[1]
                        if contig_id not in ref_hits[g_name].keys():
                            ref_hits[g_name][contig_id] = {seg_n: score}
                        else:
                            ref_hits[g_name][contig_id][seg_n] = score
                        pattern = re.compile(r'('+contig_id+')\.fas')
                        for item in listdir(genome_ctg_dir):
                            match = re.match(pattern, item)
                            if match:
                                fas_file = matches_dir+match.group(1)+".fas"
                                if not path.exists(fas_file):
                                    copyfile(genome_ctg_dir+item, fas_file)
                        # context capture
                        capture_flag = False
                        while True:
                            try:
                                if int(seg_n) in run_ref.capture:
                                    capture_flag = True
                                else:
                                    break
                            except ValueError:
                                if seg_n in run_ref.capture:
                                    capture_flag = True
                                else:
                                    break
                            else:
                                break
                        if capture_flag:
                            # load the sequence
                            contig_file = matches_dir+contig_id+".fas"
                            contig_rec = load_fasta(contig_file)
                            # check orientation
                            if q_start < q_stop:
                                c_start = q_start-capture_span
                                c_stop = q_stop+capture_span
                            else:
                                c_start = q_stop-capture_span
                                c_stop = q_start+capture_span
                            print c_start, c_stop
                            # check limits
                            if c_start < 0:
                                c_start = 1
                            if c_stop > len(contig_rec.seq):
                                c_stop = len(contig_rec.seq)
                            # proceed
                            cxt_file = capture_dir+g_name+"_"+contig_id+".fas"
                            cxt_rec = SeqRecord(id=contig_id+"_"
                                                    +str(c_start)+"_"
                                                    +str(c_stop),
                                                seq=contig_rec.seq
                                                    [c_start:c_stop])
                            write_fasta(cxt_file, cxt_rec)
                    else:
                        print "-",
                        n_cnt +=1
                if n_cnt > 0:
                    logstring = "".join(["\t", str(p_cnt), " (",
                                         str(n_cnt), ")"])
                else:
                    logstring = "".join(["\t", str(p_cnt)])
                run_ref.log(logstring)
            else:
                print "-",
                run_ref.log("".join(["\t", "0"]))
        print ""
    return ref_hits, control_scores
Пример #7
0
def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs,
                     run_id, fixed_dirs, blast_dtypes, references,
                     min_nt_match, min_nt_score, min_nt_idp, min_aa_match,
                     min_aa_score, min_aa_idp, capture_span, timestamp):
    """Collect Blast results and extract match contigs."""
    # load inputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    match_root = run_root + run_dirs['match_out_dir'] + ref_n + "/"
    capture_root = run_root + run_dirs['capture_dir'] + ref_n + "/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # collect results
    ref_hits = {}
    control_scores = []
    run_ref.log("Segs/Gs\t")
    run_ref.log("\t".join([genome['name'] for genome in genomes]))
    for seg in run_ref.segs:
        seg_n = seg['name']
        print "\t", seg_n, "...",
        run_ref.log("".join(["\n", seg_n]))
        blast_dir = run_root + run_dirs[
            'blast_out_dir'] + ref_n + "/" + seg_n + "/"
        capture_dir = capture_root + "/" + seg_n + "/"
        ensure_dir([blast_dir, capture_dir])
        ref_flag = True
        for genome in genomes:
            g_name = genome['name']
            print "|",
            # process
            if g_name not in ref_hits.keys():
                ref_hits[g_name] = {}
            matches_dir = match_root + g_name + "/"
            ensure_dir([matches_dir])
            blast_infile = blast_dir + g_name + "_out.txt"
            genome_ctg_dir = fixed_dirs['fas_contigs_dir'] + g_name + "/"
            rec_array = read_array(blast_infile, blast_dtypes)
            if len(rec_array) > 0:  # take qualified hits
                p_cnt = 0
                n_cnt = 0
                if g_name in [ref['name'] for ref in references]:
                    copyfile(genome_ctg_dir + g_name + "_1.fas",
                             matches_dir + g_name + ".fas")
                    if ref_flag:
                        # positive control TODO: better solution
                        control_scores.append(rec_array[0][11])
                        ref_flag = False
                for line in rec_array:
                    idp = line[2]
                    q_start, q_stop = line[8], line[9]
                    score = line[11]
                    length = abs(q_stop - q_start)
                    # check the blast mode to use the right thresholds
                    if blast_mode == 'n' or blast_mode == 'tx':
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    elif blast_mode == 'tn':
                        min_match = min_aa_match
                        min_score = min_aa_score
                        min_idp = min_aa_idp
                    else:  # default to nucleotide mode
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    if length > min_match and score > min_score and idp > min_idp:
                        print "+",
                        p_cnt += 1
                        contig_id = line[1]
                        if contig_id not in ref_hits[g_name].keys():
                            ref_hits[g_name][contig_id] = {seg_n: score}
                        else:
                            ref_hits[g_name][contig_id][seg_n] = score
                        pattern = re.compile(r'(' + contig_id + ')\.fas')
                        for item in listdir(genome_ctg_dir):
                            match = re.match(pattern, item)
                            if match:
                                fas_file = matches_dir + match.group(
                                    1) + ".fas"
                                if not path.exists(fas_file):
                                    copyfile(genome_ctg_dir + item, fas_file)
                        # context capture
                        capture_flag = False
                        while True:
                            try:
                                if int(seg_n) in run_ref.capture:
                                    capture_flag = True
                                else:
                                    break
                            except ValueError:
                                if seg_n in run_ref.capture:
                                    capture_flag = True
                                else:
                                    break
                            else:
                                break
                        if capture_flag:
                            # load the sequence
                            contig_file = matches_dir + contig_id + ".fas"
                            contig_rec = load_fasta(contig_file)
                            # check orientation
                            if q_start < q_stop:
                                c_start = q_start - capture_span
                                c_stop = q_stop + capture_span
                            else:
                                c_start = q_stop - capture_span
                                c_stop = q_start + capture_span
                            print c_start, c_stop
                            # check limits
                            if c_start < 0:
                                c_start = 1
                            if c_stop > len(contig_rec.seq):
                                c_stop = len(contig_rec.seq)
                            # proceed
                            cxt_file = capture_dir + g_name + "_" + contig_id + ".fas"
                            cxt_rec = SeqRecord(
                                id=contig_id + "_" + str(c_start) + "_" +
                                str(c_stop),
                                seq=contig_rec.seq[c_start:c_stop])
                            write_fasta(cxt_file, cxt_rec)
                    else:
                        print "-",
                        n_cnt += 1
                if n_cnt > 0:
                    logstring = "".join(
                        ["\t", str(p_cnt), " (",
                         str(n_cnt), ")"])
                else:
                    logstring = "".join(["\t", str(p_cnt)])
                run_ref.log(logstring)
            else:
                print "-",
                run_ref.log("".join(["\t", "0"]))
        print ""
    return ref_hits, control_scores
Пример #8
0
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes,
                  mauve_exec, max_size, chop_mode, mtype):
    """Align contigs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/"
    segments_root = run_root + run_dirs['aln_seg_dir'] + ref_n + "/contigs/"
    q_ctgs_root = run_root + run_dirs['match_out_dir'] + ref_n + "/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        ctgs_fas_dir = q_ctgs_root + g_name + "/"
        mauve_dir = mauve_root + g_name + "/"
        aln_segs_root = segments_root + g_name + "/"
        ensure_dir([mauve_dir])
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_fas_dir)
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                q_contig = ctgs_fas_dir + item
                file_list = (ref_ctg_file, q_contig)
                mauve_outfile = mauve_dir + ctg_num + ".mauve"
                aln_segs_dir = aln_segs_root + ctg_num + "/"
                ensure_dir([aln_segs_dir])
                segfile = aln_segs_dir + ctg_num + "_" + ref_n + "_segs.txt"
                open(segfile, 'w').write('')
                # do Mauve alignment
                try:
                    open(ref_ctg_file, 'r')
                    open(q_contig, 'r')
                except IOError:
                    msg = "\nERROR: File missing, cannot align\n\t\t\t"
                    run_ref.log(msg)
                    print msg
                else:
                    align_mauve(file_list, mauve_outfile, mauve_exec)
                    try:
                        # parse Mauve output (without initial clumping)
                        coords = mauver_load2_k0(mauve_outfile + ".backbone",
                                                 0, mtype)
                        # chop segments that are too long
                        chop_array = chop_rows(coords, max_size, chop_mode,
                                               mtype)
                        # make detailed pairwise alignments of the segments
                        ref_rec = load_genbank(ref_ctg_file)
                        query_rec = load_fasta(q_contig)
                        iter_align(chop_array, ref_rec, query_rec,
                                   aln_segs_dir, segfile)
                    except IOError:
                        msg = "\nERROR: Mauve alignment failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
                    except Exception:
                        msg = "\nERROR: Iteration failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
        print ""
Пример #9
0
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs,
                  genomes, mauve_exec, max_size, chop_mode, mtype):
    """Align contigs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/contigs/"
    q_ctgs_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        ctgs_fas_dir = q_ctgs_root+g_name+"/"
        mauve_dir = mauve_root+g_name+"/"
        aln_segs_root = segments_root+g_name+"/"
        ensure_dir([mauve_dir])
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_fas_dir)
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                q_contig = ctgs_fas_dir+item
                file_list = (ref_ctg_file, q_contig)
                mauve_outfile = mauve_dir+ctg_num+".mauve"
                aln_segs_dir = aln_segs_root+ctg_num+"/"
                ensure_dir([aln_segs_dir])
                segfile = aln_segs_dir+ctg_num+"_"+ref_n+"_segs.txt"
                open(segfile, 'w').write('')
                # do Mauve alignment
                try:
                    open(ref_ctg_file, 'r')
                    open(q_contig, 'r')
                except IOError:
                    msg = "\nERROR: File missing, cannot align\n\t\t\t"
                    run_ref.log(msg)
                    print msg
                else:
                    align_mauve(file_list, mauve_outfile, mauve_exec)
                    try:
                        # parse Mauve output (without initial clumping)
                        coords = mauver_load2_k0(mauve_outfile+".backbone",
                                                 0, mtype)
                        # chop segments that are too long
                        chop_array = chop_rows(coords, max_size, chop_mode,
                                               mtype)
                        # make detailed pairwise alignments of the segments
                        ref_rec = load_genbank(ref_ctg_file)
                        query_rec = load_fasta(q_contig)
                        iter_align(chop_array, ref_rec, query_rec,
                                   aln_segs_dir, segfile)
                    except IOError:
                        msg = "\nERROR: Mauve alignment failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
                    except Exception:
                        msg = "\nERROR: Iteration failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
        print ""
Пример #10
0
def filter_contigs(run_ref, run_id, genomes, norm_matches, seg_size, threshold, r_root_dir, run_dirs, fixed_dirs, timestamp):
    """Filter contigs."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    fas_root = fixed_dirs['fas_contigs_dir']
    report_root = run_root+run_dirs['reports']+ref_n+"/"
    ensure_dir([report_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Filter contigs @", timestamp, "\n"])
    run_ref.log(logstring)
    # process
    
    # evaluate segment specificity using negative controls
    neg_controls = [genome['name'] for genome in genomes if ('ctrl' in genome.keys() and genome['ctrl'] == 'neg')]
    neg_dat = [norm_matches[g_name]['ctg_scores'] for g_name in neg_controls]
    neg_RA = np.vstack(neg_dat)
    neg_mean = nanmean(neg_RA, axis=0)
    # process the genomes we're testing
    test_genomes = [genome['name'] for genome in genomes if not ('ctrl' in genome.keys())]
    for g_name in test_genomes: 
        print "\t", g_name,
        ctg_hits = norm_matches[g_name]['ctg_scores']
        ctg_stats = {}
        #process individual contigs
        counter = 0
        for ctg_RA in ctg_hits:
            # identify this contig by name
            ctg_name = norm_matches[g_name]['ctg_names'][counter]
            counter += 1
            # subtract background signal from match scores
            recal_ctg_RA = np.subtract(ctg_RA, neg_mean)
            recal_ctg_RA = recal_ctg_RA.clip(min=0)
            # compute total similarity score
            s_score = np.sum(recal_ctg_RA)
            # compute clustering score (primitive)
            streak = False
            c_score = 0
            for hit in recal_ctg_RA:
                if hit == 0:
                    if streak == True:
                        c_score += -1
                        streak = False
                    else: 
                        c_score += 0
                elif hit > 0:
                    if streak == True:
                        c_score += 2
                    else: 
                        c_score += 1
                        streak = True
            # compute backbone vs. cargo burden
            ctg_rec = load_fasta(fas_root+g_name+"/"+ctg_name+".fas")
            bbone = np.sum(np.ma.make_mask(recal_ctg_RA))*seg_size
            if bbone > len(ctg_rec):
                bbone = len(ctg_rec)    # workaround for last segment being always a little short
            cargo = len(ctg_rec) - bbone
            # make inverted array mask (used for redundancy detection)
            ctg_mask = np.ma.getmaskarray(np.ma.masked_equal(recal_ctg_RA,0))
            # consolidate contig information
            ctg_stats[ctg_name] = {'s_score': s_score, 
                                    'c_score': c_score, 
                                    'vector': recal_ctg_RA, 
                                    'inv_mask':ctg_mask,
                                    'bbone': bbone,
                                    'cargo': cargo}
        # detect redundant contigs
        ### use np.ma.mask_or(m1, m2)
        ### if any elements returns false there is a redundancy between two contigs
        ### if so evaluate which has better c_score and s_score
        
        # compute overall stats for the genome
        gs_score = sum([ctg_stats[contig]['s_score'] for contig in ctg_stats])
        gc_score = sum([ctg_stats[contig]['c_score'] for contig in ctg_stats])
        g_bbone = sum([ctg_stats[contig]['bbone'] for contig in ctg_stats])
        g_cargo = sum([ctg_stats[contig]['cargo'] for contig in ctg_stats])
        print gs_score, gc_score, g_bbone, g_cargo,
        # 
        if gs_score > threshold:
            ## run plotters again 
            ## pass the genome on to the next step (others will be dropped)
            print "MATCH"
        else:
            print "(-)"
Пример #11
0
def batch_contig_annot(dataset):
    """Extract and annotate contigs."""
    # identify dataset contig file
    contigs_file = dirs['assembly_dir']+dataset['f_nick']+'/'+'contigs.fa'
    # locate the COG database
    cog_db = dirs['blast_db_dir']+'Cog_LE/Cog'
    # make the training file
    training_file = dirs['annot_dir']+dataset['f_nick']+'/'+'contigs.trn'
    #train_prodigal(contigs_file, training_file)
    # set output dirs
    fas_out_dir = dirs['annot_dir']+dataset['f_nick']+'/fasta/'
    gbk_out_dir = dirs['annot_dir']+dataset['f_nick']+'/predict/'
    aa_out_dir = dirs['annot_dir']+dataset['f_nick']+'/aa/'
    blast_out_dir = dirs['annot_dir']+dataset['f_nick']+'/rpsblast/'
    solid_out_dir = dirs['annot_dir']+dataset['f_nick']+'/genbank/'
    maps_out_dir = dirs['annot_dir']+dataset['f_nick']+'/maps/'
    ensure_dir(fas_out_dir)
    ensure_dir(gbk_out_dir)
    ensure_dir(aa_out_dir)
    ensure_dir(blast_out_dir)
    ensure_dir(solid_out_dir)
    # set phage hit collector
    contig_hits = {}
    sp_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\
                                   +dataset['f_nick']+'_kw_hits.html'
    all_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\
                                    +dataset['f_nick']+'_all_hits.html'
    sp_hit_list_handle = open(sp_hit_list, 'w')
    all_hit_list_handle = open(all_hit_list, 'w')
    sp_hit_list_handle.write("<ul>")
    all_hit_list_handle.write("<ul>")
    # load all contigs
    contigs_list = load_multifasta(contigs_file)
    # cycle through contigs
    ctg_count = 0
    gene_count = 0
    for contig in contigs_list:
        ctg_count +=1
        # use regex to acquire relevant record ID info
        pattern = re.compile(r'NODE_(\d*)_length_(\d*)_cov_(\d*)')
        match = pattern.match(contig.id)
        nick = match.group(1)+'_'+match.group(2)+'_'+match.group(3)
        contig.id = nick
        fasta_out = fas_out_dir+nick+'.fas'
        # write record to file
        write_fasta(fasta_out, contig)
        # create contig entry in dict
        contig_hits[nick] = []
        # run the annotation
        annot_gbk = gbk_out_dir+nick+'.gbk'
        annot_aa = aa_out_dir+nick+'.fas'
        #run_prodigal(fasta_out, annot_gbk, annot_aa, training_file)
        # blast the amino acids against COG
        print '\tblasting', dataset['f_nick'], nick
        blast_out = blast_out_dir+nick+'.xml'
        if path.isfile(blast_out):
            print "\t\talready blasted"
        else:
            local_rpsblast_2file(annot_aa, cog_db, blast_out, blast_prefs)
        # collect best hits
        rec_cogs = collect_cogs(blast_out)
        map_file = maps_out_dir+nick+'.pdf'
        # consolidate annotated genbank file
        record = load_fasta(fasta_out)
        aa_defs = load_multifasta(annot_aa)
        features = []
        counter = 1
        ctg_flag_1 = 0
        ctg_flag_2 = 0
        for protein in aa_defs:
            gene_count +=1
            # get feature details from description line
            # necessary because the prodigal output is not parser-friendly
            pattern = re.compile(r'\d+_\d+_\d+_\d+_\d+\s+\S+\s+(\d+)\s+\S+\s+(\d+)\s+\S+\s+(\S*\d)')
            match = pattern.match(protein.description)
            start_pos = int(match.group(1))
            end_pos = int(match.group(2))
            strand_pos = int(match.group(3))
            feat_loc = FeatureLocation(start_pos, end_pos)
            annotation = rec_cogs['Query_'+str(counter)]
            if ctg_flag_1 is 0:
                all_hit_list_handle.write("</ul><br><a href='"
                                          +"../../../../"
                                          +map_file
                                          +"'>Contig "
                                          +nick+"</a><ul>")
                ctg_flag_1 = 1
            all_hit_list_handle.write("<li>"+str(counter)
                                            +'. '+annotation+"</li>")
            # detect phage content in annotation
            phi_pattern = re.compile(r".+(COG\d+).+"
                                      "(phage|capsid|muramidase|tail|"
                                      "replication|helicase|polymerase|"
                                      "integrase|recombinase"
                                      "suppressor|hydrolase|transposase).+",
                                     re.IGNORECASE)
            phi_match = phi_pattern.match(annotation)
            if phi_match:
                hit_flag = 'on'
                hit_dict = {'CDS': counter,
                            'annot': annotation,
                            'COGs': phi_match.group}
                contig_hits[nick].append(hit_dict)
                # write out to summary file
                if ctg_flag_2 is 0:
                    sp_hit_list_handle.write("</ul><br><a href='"
                                             +"../../../../"
                                             +map_file
                                             +"'>Contig "
                                             +nick+"</a><ul>")
                    ctg_flag_2 = 1
                sp_hit_list_handle.write("<li>"+str(counter)
                                          +'. '+annotation+"</li>")
            else:
                hit_flag = 'off'
            # consolidation feature annotations
            quals = {'note': protein.description,
                     'fct': annotation,
                     'flag': hit_flag}
            feature = SeqFeature(location=feat_loc,
                                 strand=strand_pos,
                                 id=protein.id,
                                 type='CDS',
                                 qualifiers=quals)
            features.append(feature)
            counter +=1
        record.features = features
        record.description = dataset['f_nick']+'_contig_'+nick
        record.name = nick
        record.dbxrefs = ['Project:np1']
        record.seq.alphabet = generic_dna
        gbk_out = solid_out_dir+nick+'.gbk'
        write_genbank(gbk_out, record)
        # generate graphical map
        ContigDraw(nick, gbk_out, map_file)
    sp_hit_list_handle.write("</ul>")
    all_hit_list_handle.write("</ul>")
    sp_hit_list_handle.close()
    all_hit_list_handle.close()
    print "\t", gene_count, "predicted genes in", ctg_count, "contigs"