Example 1
def annot_ctg(g_file, ctg_fas, annot_gbk, annot_aa, trn_file, prot_db,
              blast_out, l_tag_base, blast_prefs):
    """Do functional annotation of contig from Fasta file, return record."""
    # gene prediction
    if not path.exists(trn_file):
        train_prodigal(g_file, trn_file, "-q")
    if not path.exists(annot_aa):
        run_prodigal(ctg_fas, annot_gbk, annot_aa, trn_file, "-q")
    # blast the amino acids against COG
    if not path.exists(blast_out):
        local_blastp_2file(annot_aa, prot_db, blast_out, blast_prefs)
    # collect best hits
    rec_cogs = collect_cogs(blast_out)
    # consolidate annotated genbank file
    record = load_fasta(ctg_fas)
    record.features = []
    aa_record = load_multifasta(annot_aa)
    counter = 1
    for aa_rec in aa_record:
        this_prot = 'Query_'+str(counter)
        annotation = rec_cogs[this_prot]
        # get feature details from description line
        # because prodigal output fails to load as valid genbank
        defline = aa_rec.description
        pattern = re.compile(r'.+#\s(\d+)\s#\s(\d+)\s#\s(\S*1)\s#\sID.+')
        match = pattern.match(defline)
        start_pos = int(match.group(1))
        end_pos = int(match.group(2))
        strand_pos = int(match.group(3))
        feat_loc = FeatureLocation(start_pos, end_pos)
        l_tag = l_tag_base+"_"+str(counter)
        # consolidate feature annotations
        quals = {'note': defline, 'locus_tag': l_tag,
                 'fct': annotation, 'translation': aa_rec.seq}
        feature = SeqFeature(location=feat_loc,
                             strand=strand_pos,
                             id='cds_'+str(counter),
                             type='CDS',
                             qualifiers=quals)
        record.features.append(feature)
        counter += 1
    return record
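
The regular expression above depends on Prodigal's protein defline layout (>id # start # end # strand # ID=...), which is parsed by hand because the GenBank output from Prodigal does not load cleanly. A minimal standalone sketch of that parsing step, using a hypothetical defline (the IDs and coordinates are made up for illustration):

import re

# hypothetical Prodigal-style protein defline
defline = "ctg1_1 # 337 # 2799 # -1 # ID=1_1;partial=00;start_type=ATG"
pattern = re.compile(r'.+#\s(\d+)\s#\s(\d+)\s#\s(\S*1)\s#\sID.+')
match = pattern.match(defline)
start_pos = int(match.group(1))   # 337
end_pos = int(match.group(2))     # 2799
strand_pos = int(match.group(3))  # -1, i.e. reverse strand

The (\S*1) group works because Prodigal reports the strand as 1 or -1, so matching any non-space run that ends in 1 captures either value.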
Example 2
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds):
    """Unpack genome files.

    Here, unpacking means extracting data and producing specific files to
    standardize how the information is made available to downstream analysis.
    Depending on the input file format, different unpacking methods are
    invoked. In all cases, this ensures that for each genome, there is a
    multifasta file of all the contigs together as well as a separate GenBank
    file for each contig.

    Supported input file formats are the following:
    - mfas: Basic whole-genome sequence in a multifasta file of contigs. This
    can be used to process a finished genome in a single Fasta file as well.
    - cgbk: All contigs concatenated in a single GenBank file (Genoscope,
    French WGS). This can be used to process a finished genome in a single
    GenBank file as well.
    # TODO: provide support for other possible input formats

    Unpacking 'cgbk' genomes involves an initial step to detect occurrences
    of the sequence separator and collect the start and stop coordinates of
    each contig. Each pair of coordinates can then be used to extract the
    contig sequence and create a SeqRecord for that contig, mirroring what
    SeqIO does when it parses a multifasta file.

    """
    # set up inputs
    infile = genome['file']  #TODO: make GUI input loader (upstream)
    inpath = fixed_dirs['ori_g_dir'] + infile
    g_name = genome['name']
    print " ", g_name, "...",
    # prep output destinations
    mfas_dir = fixed_dirs['mfas_contigs_dir']
    fas_dir = fixed_dirs['fas_contigs_dir'] + g_name + "/"
    ensure_dir([mfas_dir, fas_dir])
    mfas_file = mfas_dir + g_name + "_contigs.fas"
    records = []
    # select unpacking method
    if genome['input'] == 'fas':
        if not path.exists(inpath):
            raise Exception("Bad input file path")
        genome_recs = load_multifasta(inpath)
        # generate GenBank files
        counter = 0
        for rec in genome_recs:
            counter += 1
            ctg_num = str(counter)
            new_id = g_name + "_" + ctg_num  # workaround for long ids
            new_seq = rec.seq
            new_seq.alphabet = generic_dna
            new_rec = SeqRecord(seq=new_seq, id=new_id)
            records.append(new_rec)  # for multifasta output
            fas_file = fas_dir + new_id + ".fas"
            write_fasta(fas_file, new_rec)
    elif genome['input'] == 'gbk':
        # load in genome data
        genome_rec = load_genbank(inpath)
        g_string = genome_rec.seq
        # find split coordinates
        coord_pairs = multisplit_finder(g_string, separator)
        # split record
        counter = 0
        for (start, stop) in coord_pairs:
            counter += 1
            ctg_num = str(counter)
            new_record = genome_rec[start:stop]
            new_record.id = g_name + "_" + ctg_num
            records.append(new_record)  # for multifasta output
            fas_file = fas_dir + g_name + "_" + ctg_num + ".fas"
            write_fasta(fas_file, new_record)
    else:
        xmsg = "Input file format " + genome[
            'input'] + " unspecified/unsupported"
        raise Exception(xmsg)
    print counter, "contigs"
    # write master file
    write_fasta(mfas_file, records)
    # pass records to stats logger
    ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)
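
The 'gbk' (cgbk) branch above depends on a multisplit_finder helper that is not shown in these examples. A minimal sketch of the behaviour the caller expects, assuming the helper yields (start, stop) pairs for the stretches of sequence lying between occurrences of the separator:

def multisplit_finder(g_string, separator):
    """Yield (start, stop) coordinates of contigs between separator runs.

    Sketch only: the real helper is not shown here. g_string is the
    concatenated genome sequence and separator is the linker inserted
    between contigs (for example a run of Ns).
    """
    seq = str(g_string).upper()
    sep = str(separator).upper()
    start = 0
    while start < len(seq):
        hit = seq.find(sep, start)
        if hit == -1:
            yield (start, len(seq))   # last contig runs to the end
            break
        if hit > start:
            yield (start, hit)        # contig before this separator
        start = hit + len(sep)        # skip over the separator

Each (start, stop) pair is then used to slice the loaded GenBank record, which is what produces one SeqRecord per contig in the loop above.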
Example 3
def batch_contig_annot(dataset):
    """Extract and annotate contigs."""
    # identify dataset contig file
    contigs_file = dirs['assembly_dir']+dataset['f_nick']+'/'+'contigs.fa'
    # locate the COG database
    cog_db = dirs['blast_db_dir']+'Cog_LE/Cog'
    # make the training file
    training_file = dirs['annot_dir']+dataset['f_nick']+'/'+'contigs.trn'
    #train_prodigal(contigs_file, training_file)
    # set output dirs
    fas_out_dir = dirs['annot_dir']+dataset['f_nick']+'/fasta/'
    gbk_out_dir = dirs['annot_dir']+dataset['f_nick']+'/predict/'
    aa_out_dir = dirs['annot_dir']+dataset['f_nick']+'/aa/'
    blast_out_dir = dirs['annot_dir']+dataset['f_nick']+'/rpsblast/'
    solid_out_dir = dirs['annot_dir']+dataset['f_nick']+'/genbank/'
    maps_out_dir = dirs['annot_dir']+dataset['f_nick']+'/maps/'
    ensure_dir(fas_out_dir)
    ensure_dir(gbk_out_dir)
    ensure_dir(aa_out_dir)
    ensure_dir(blast_out_dir)
    ensure_dir(solid_out_dir)
    ensure_dir(maps_out_dir)
    # set phage hit collector
    contig_hits = {}
    sp_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\
                                   +dataset['f_nick']+'_kw_hits.html'
    all_hit_list = dirs['annot_dir']+dataset['f_nick']+'/'\
                                    +dataset['f_nick']+'_all_hits.html'
    sp_hit_list_handle = open(sp_hit_list, 'w')
    all_hit_list_handle = open(all_hit_list, 'w')
    sp_hit_list_handle.write("<ul>")
    all_hit_list_handle.write("<ul>")
    # load all contigs
    contigs_list = load_multifasta(contigs_file)
    # cycle through contigs
    ctg_count = 0
    gene_count = 0
    for contig in contigs_list:
        ctg_count += 1
        # use regex to acquire relevant record ID info
        pattern = re.compile(r'NODE_(\d*)_length_(\d*)_cov_(\d*)')
        match = pattern.match(contig.id)
        nick = match.group(1)+'_'+match.group(2)+'_'+match.group(3)
        contig.id = nick
        fasta_out = fas_out_dir+nick+'.fas'
        # write record to file
        write_fasta(fasta_out, contig)
        # create contig entry in dict
        contig_hits[nick] = []
        # run the annotation
        annot_gbk = gbk_out_dir+nick+'.gbk'
        annot_aa = aa_out_dir+nick+'.fas'
        #run_prodigal(fasta_out, annot_gbk, annot_aa, training_file)
        # blast the amino acids against COG
        print '\tblasting', dataset['f_nick'], nick
        blast_out = blast_out_dir+nick+'.xml'
        if path.isfile(blast_out):
            print "\t\talready blasted"
        else:
            local_rpsblast_2file(annot_aa, cog_db, blast_out, blast_prefs)
        # collect best hits
        rec_cogs = collect_cogs(blast_out)
        map_file = maps_out_dir+nick+'.pdf'
        # consolidate annotated genbank file
        record = load_fasta(fasta_out)
        aa_defs = load_multifasta(annot_aa)
        features = []
        counter = 1
        ctg_flag_1 = 0
        ctg_flag_2 = 0
        for protein in aa_defs:
            gene_count += 1
            # get feature details from description line
            # necessary because the prodigal output is not parser-friendly
            pattern = re.compile(r'\d+_\d+_\d+_\d+_\d+\s+\S+\s+(\d+)\s+\S+\s+(\d+)\s+\S+\s+(\S*\d)')
            match = pattern.match(protein.description)
            start_pos = int(match.group(1))
            end_pos = int(match.group(2))
            strand_pos = int(match.group(3))
            feat_loc = FeatureLocation(start_pos, end_pos)
            annotation = rec_cogs['Query_'+str(counter)]
            if ctg_flag_1 == 0:
                all_hit_list_handle.write("</ul><br><a href='"
                                          +"../../../../"
                                          +map_file
                                          +"'>Contig "
                                          +nick+"</a><ul>")
                ctg_flag_1 = 1
            all_hit_list_handle.write("<li>"+str(counter)
                                            +'. '+annotation+"</li>")
            # detect phage content in annotation
            phi_pattern = re.compile(r".+(COG\d+).+"
                                      "(phage|capsid|muramidase|tail|"
                                      "replication|helicase|polymerase|"
                                      "integrase|recombinase|"
                                      "suppressor|hydrolase|transposase).+",
                                     re.IGNORECASE)
            phi_match = phi_pattern.match(annotation)
            if phi_match:
                hit_flag = 'on'
                hit_dict = {'CDS': counter,
                            'annot': annotation,
                            'COGs': phi_match.group(1)}
                contig_hits[nick].append(hit_dict)
                # write out to summary file
                if ctg_flag_2 == 0:
                    sp_hit_list_handle.write("</ul><br><a href='"
                                             +"../../../../"
                                             +map_file
                                             +"'>Contig "
                                             +nick+"</a><ul>")
                    ctg_flag_2 = 1
                sp_hit_list_handle.write("<li>"+str(counter)
                                          +'. '+annotation+"</li>")
            else:
                hit_flag = 'off'
            # consolidate feature annotations
            quals = {'note': protein.description,
                     'fct': annotation,
                     'flag': hit_flag}
            feature = SeqFeature(location=feat_loc,
                                 strand=strand_pos,
                                 id=protein.id,
                                 type='CDS',
                                 qualifiers=quals)
            features.append(feature)
            counter += 1
        record.features = features
        record.description = dataset['f_nick']+'_contig_'+nick
        record.name = nick
        record.dbxrefs = ['Project:np1']
        record.seq.alphabet = generic_dna
        gbk_out = solid_out_dir+nick+'.gbk'
        write_genbank(gbk_out, record)
        # generate graphical map
        ContigDraw(nick, gbk_out, map_file)
    sp_hit_list_handle.write("</ul>")
    all_hit_list_handle.write("</ul>")
    sp_hit_list_handle.close()
    all_hit_list_handle.close()
    print "\t", gene_count, "predicted genes in", ctg_count, "contigs"