示例#1
0
def run_kr_stereochemistry_predictions(pksnrpscoregenes, domaindict,
                                       seq_record, options):
    """Collect PKS KR domain sequences and hand them to the KR analysis.

    For each feature in ``pksnrpscoregenes`` the domain entries stored in
    ``domaindict`` under the feature's gene id are scanned. Every entry whose
    first field is ``"PKS_KR"`` contributes the slice of the protein sequence
    between its second and third fields (converted to int), named
    ``<gene id>_KR<n>`` with ``n`` counting KR domains per gene from 1.

    When at least one KR domain was found, the sequences are written to
    ``ctg<record_idx>_krseqs.fasta`` inside
    ``options.raw_predictions_outputfolder`` and
    ``kr_analysis.run_kr_analysis`` is called within a
    ``TemporaryDirectory(change=True)`` context, with
    ``ctg<record_idx>_krpredoutput.txt`` in the same folder as output path.

    ``seq_record`` is accepted but not used by this function.

    Returns:
        A ``(krnames, krseqs)`` tuple of parallel lists.
    """
    logging.info("Predicting PKS KR activity and stereochemistry using KR " \
        "fingerprints from Starcevic et al.")
    krnames = []
    krseqs = []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        kr_count = 0
        for domain in domaindict[locus]:
            if domain[0] != "PKS_KR":
                continue
            kr_count += 1
            begin, stop = int(domain[1]), int(domain[2])
            kr_sequence = str(utils.get_aa_sequence(feature))[begin:stop]
            krnames.append(locus + "_KR" + str(kr_count))
            krseqs.append(kr_sequence)

    # Nothing to analyse: skip writing the FASTA file and the external run.
    if not krnames:
        return krnames, krseqs

    # Both files share the same folder and "ctg<idx>" prefix.
    prefix = options.raw_predictions_outputfolder + os.sep + \
        "ctg" + str(options.record_idx)
    utils.writefasta(krnames, krseqs, prefix + "_krseqs.fasta")
    with TemporaryDirectory(change=True):
        kr_analysis.run_kr_analysis(prefix + "_krseqs.fasta",
                                    prefix + "_krpredoutput.txt")
    return krnames, krseqs
示例#2
0
def extract_nterminus(da_dir, clusterpksgenes, seq_record, startergene,
                      feature_by_id):
    """Locate interacting residues in the N-termini of non-starter proteins.

    The first 50 residues of every gene in ``clusterpksgenes`` other than
    ``startergene`` are taken from ``feature_by_id``. Each fragment is written
    to ``input.fasta``, profile-aligned with ``muscle`` against
    ``<da_dir>/nterm.fasta`` into ``muscle.fasta``, and ``extractpositions``
    is then asked for positions ``[2, 15]`` relative to the
    ``EryAIII_5_6_ref`` sequence.

    ``seq_record`` is accepted but not used by this function.

    Returns:
        A dict mapping gene name to the value returned by
        ``extractpositions`` for that gene.
    """
    nterm_file = os.path.join(da_dir, 'nterm.fasta')
    # Gather all fragments first, then align them one at a time.
    candidates = [gene for gene in clusterpksgenes if gene != startergene]
    fragments = [str(utils.get_aa_sequence(feature_by_id[gene]))[:50]
                 for gene in candidates]
    query_fasta = "input.fasta"
    ntermintresdict = {}
    for gene, fragment in zip(candidates, fragments):
        utils.writefasta([gene], [fragment], query_fasta)
        utils.execute([
            "muscle", "-profile", "-quiet", "-in1", nterm_file, "-in2",
            "input.fasta", "-out", "muscle.fasta"
        ])
        ntermintresdict[gene] = extractpositions(
            "muscle.fasta", [2, 15], "EryAIII_5_6_ref", gene)
    return ntermintresdict
示例#3
0
def extract_cterminus(da_dir, clusterpksgenes, seq_record, endinggene,
                      feature_by_id):
    """Locate interacting residues in the C-termini of non-ending proteins.

    The last 100 residues of every gene in ``clusterpksgenes`` other than
    ``endinggene`` are taken from ``feature_by_id``. Each fragment is written
    to ``input.fasta``, profile-aligned with ``muscle`` against
    ``<da_dir>/cterm.fasta`` into ``muscle.fasta``, and ``extractpositions``
    is then asked for positions ``[55, 64]`` relative to the ``EryAII_ref``
    sequence.

    ``seq_record`` is accepted but not used by this function.

    Returns:
        A dict mapping gene name to the value returned by
        ``extractpositions`` for that gene.
    """
    cterm_file = os.path.join(da_dir, 'cterm.fasta')
    # Gather all fragments first, then align them one at a time.
    candidates = [gene for gene in clusterpksgenes if gene != endinggene]
    fragments = [str(utils.get_aa_sequence(feature_by_id[gene]))[-100:]
                 for gene in candidates]
    query_fasta = "input.fasta"
    ctermintresdict = {}
    for gene, fragment in zip(candidates, fragments):
        utils.writefasta([gene], [fragment], query_fasta)
        utils.execute([
            "muscle", "-profile", "-quiet", "-in1", cterm_file, "-in2",
            "input.fasta", "-out", "muscle.fasta"
        ])
        ctermintresdict[gene] = extractpositions(
            "muscle.fasta", [55, 64], "EryAII_ref", gene)
    return ctermintresdict
示例#4
0
def parse_subject(tabs, seqlengths, geneclustergenes, seq_record):
    """Build a ``Subject`` from one tab-split line of tabular BLAST output.

    Args:
        tabs: the fields of one BLAST line. Field 0 is the query header,
            field 1 the ``|``-separated subject header, field 2 the percent
            identity, field 3 the alignment length, field 10 the e-value and
            field 11 the bit score.
        seqlengths: mapping of query accession to protein length.
        geneclustergenes: collection of accessions; a subject found in it is
            prefixed with ``"h_"``.
        seq_record: record used to look up the query protein when its
            accession is missing from ``seqlengths``.

    Returns:
        A ``Subject`` built from the parsed values.

    Note:
        A line with fewer than 12 fields is only logged as an error; parsing
        still continues, so indexing the missing fields raises ``IndexError``.
    """
    if len(tabs) < 12:
        logging.error("Malformed blast pairing: %s", "\t".join(tabs))
    query = tabs[0]
    subject_parts = tabs[1].split("|")
    subject = subject_parts[4]
    if subject == "no_locus_tag":
        # Fall back to the seventh header field when no locus tag is given.
        subject = subject_parts[6]
    if subject in geneclustergenes:
        subject = "h_" + subject
    if len(subject_parts) > 6:
        locustag = subject_parts[6]
    else:
        locustag = ""
    genecluster = "{}_{}".format(subject_parts[0], subject_parts[1])
    start, end = subject_parts[2].split("-")[:2]
    strand = subject_parts[3]
    annotation = subject_parts[5]
    # Adding 0.5 before truncation rounds to the nearest integer.
    perc_ident = int(float(tabs[2]) + 0.5)
    evalue = str(tabs[10])
    blastscore = int(float(tabs[11]) + 0.5)
    query_key = query.split("|")[4]
    # "in" replaces dict.has_key(), which no longer exists in Python 3.
    if query_key in seqlengths:
        perc_coverage = (float(tabs[3]) / seqlengths[query_key]) * 100
    else:
        feature_by_id = utils.get_feature_dict_protein_id(seq_record)
        seqlength = len(utils.get_aa_sequence(feature_by_id[query_key]))
        perc_coverage = (float(tabs[3]) / seqlength) * 100
    return Subject(subject, genecluster, start, end, strand, annotation,
                   perc_ident, blastscore, perc_coverage, evalue, locustag)
示例#5
0
def run_minowa_predictor_pks_cal(pksnrpscoregenes, domaindict, seq_record,
                                 options):
    """Collect PKS CAL domain sequences and run the Minowa CAL predictor.

    For each feature in ``pksnrpscoregenes`` the domain entries stored in
    ``domaindict`` under the feature's gene id are scanned. Every entry whose
    first field is ``"CAL_domain"`` contributes the slice of the protein
    sequence between its second and third fields (converted to int), named
    ``<gene id>_CAL<n>`` with ``n`` counting CAL domains per gene from 1.

    When at least one CAL domain was found, the sequences are written to
    ``ctg<record_idx>_calseqs.fasta`` inside
    ``options.raw_predictions_outputfolder`` and
    ``minowa_CAL.run_minowa_cal`` is called within a
    ``TemporaryDirectory(change=True)`` context, with
    ``ctg<record_idx>_minowa_calpredoutput.txt`` in the same folder as output
    path.

    ``seq_record`` is accepted but not used by this function.

    Returns:
        A ``(calnames, calseqs)`` tuple of parallel lists.
    """
    calnames = []
    calseqs = []
    logging.info(
        "Predicting CAL domain substrate specificities by Minowa et al. method"
    )
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        cal_count = 0
        for domain in domaindict[locus]:
            if domain[0] != "CAL_domain":
                continue
            cal_count += 1
            begin, stop = int(domain[1]), int(domain[2])
            cal_sequence = str(utils.get_aa_sequence(feature))[begin:stop]
            calnames.append(locus + "_CAL" + str(cal_count))
            calseqs.append(cal_sequence)

    # Nothing to predict: skip writing the FASTA file and the external run.
    if not calnames:
        return calnames, calseqs

    # Both files share the same folder and "ctg<idx>" prefix.
    prefix = options.raw_predictions_outputfolder + os.sep + \
        "ctg" + str(options.record_idx)
    utils.writefasta(calnames, calseqs, prefix + "_calseqs.fasta")
    with TemporaryDirectory(change=True):
        minowa_CAL.run_minowa_cal(prefix + "_calseqs.fasta",
                                  prefix + "_minowa_calpredoutput.txt")
    return calnames, calseqs
示例#6
0
def find_lan_a_features(seq_record, cluster):
    """Return the CDS features of a cluster that are candidate precursors.

    Only CDS features lying entirely within ``cluster.location`` are
    considered. A feature is selected when either

    * its amino acid sequence is shorter than 80 residues, or
    * it has a ``sec_met`` qualifier whose first ``Domains detected:`` entry
      names, as its third whitespace-separated token, a domain listed in
      ``known_precursor_domains``.

    Returns:
        A list of the selected features, in the order yielded by
        ``utils.get_cds_features``.
    """
    lan_a_features = []
    for feature in utils.get_cds_features(seq_record):
        # Skip anything sticking out of the cluster on either side.
        if (feature.location.start < cluster.location.start
                or feature.location.end > cluster.location.end):
            continue

        # Short peptides qualify regardless of their annotation.
        if len(utils.get_aa_sequence(feature)) < 80:
            lan_a_features.append(feature)
            continue

        if 'sec_met' not in feature.qualifiers:
            continue

        # Only the first "Domains detected:" entry is inspected.
        domain_entry = next(
            (entry for entry in feature.qualifiers['sec_met']
             if entry.startswith('Domains detected:')), None)
        if domain_entry is None:
            continue

        if domain_entry.split()[2] in known_precursor_domains:
            lan_a_features.append(feature)

    return lan_a_features
示例#7
0
def fastaseqlengths(seq_record):
    """Map the accession of every CDS in ``seq_record`` to its protein length.

    Accessions come from ``utils.get_gene_acc`` and lengths from the string
    form of ``utils.get_aa_sequence``. If two CDS features share an
    accession, the later one wins.
    """
    lengths_by_accession = {}
    for cds in utils.get_cds_features(seq_record):
        protein = str(utils.get_aa_sequence(cds))
        lengths_by_accession[utils.get_gene_acc(cds)] = len(protein)
    return lengths_by_accession
示例#8
0
def generate_searchgtr_htmls(seq_records, options):
    """Write a SEARCHGTr submission form for each matching CDS.

    A CDS matches when ``utils.get_smcog_annotations`` assigns it one of the
    smCOGs in ``gtrcoglist``. For each match an HTML file
    ``<gene id>_searchgtr.html`` is written to the ``html`` sub-folder of
    ``options.full_outputfolder_path`` (created on demand). It consists of the
    first template part with ``"GlycTr"`` replaced by the gene id, then the
    gene id and protein sequence, then the second template part.

    Side effects:
        ``options.searchgtr_links`` is reset to a new dict and filled with
        ``"<record id>_<gene id>" -> "html/<gene id>_searchgtr.html"``
        entries (path separator per ``os.sep``).
    """
    # smCOG identifiers that get a SEARCHGTr form.
    gtrcoglist = ['SMCOG1045', 'SMCOG1062', 'SMCOG1102']
    searchgtrformtemplateparts = load_searchgtr_search_form_template()
    options.searchgtr_links = {}
    html_dir = options.full_outputfolder_path + os.sep + "html"
    for seq_record in seq_records:
        smcogdict, _ = utils.get_smcog_annotations(seq_record)
        for feature in utils.get_cds_features(seq_record):
            gene_id = utils.get_gene_id(feature)
            # "in" replaces dict.has_key(), which no longer exists in Python 3.
            if gene_id not in smcogdict:
                continue
            if smcogdict[gene_id] not in gtrcoglist:
                continue

            if not os.path.exists(html_dir):
                os.mkdir(html_dir)
            form_name = gene_id + "_searchgtr.html"
            options.searchgtr_links[seq_record.id + "_" + gene_id] = \
                "html" + os.sep + form_name

            specificformtemplate = searchgtrformtemplateparts[0].replace(
                "GlycTr", gene_id)
            # The context manager closes the file even if a write fails.
            with open(html_dir + os.sep + form_name, "w") as formfile:
                formfile.write(specificformtemplate)
                formfile.write("%s\n%s" %
                               (gene_id, utils.get_aa_sequence(feature)))
                formfile.write(searchgtrformtemplateparts[1])
示例#9
0
def _parse_domain(domain, feature, seq_record):
    """Turn an NRPS/PKS domain qualifier string into a JSON-ready dict.

    The first 17 characters of ``domain`` are dropped; the remainder is split
    on spaces into a type, a location (stripped of ``(``, ``)`` and ``.``,
    then split on ``-``) and a prediction string, which is passed to
    ``_parse_substrate_predictions``.

    The returned dict has the keys ``type``, ``start``, ``end``,
    ``predictions``, ``napdoslink``, ``blastlink`` and ``sequence``.
    ``napdoslink`` is only non-empty when the text contains ``"PKS_KS"`` or
    ``"Condensation"``.

    ``seq_record`` is accepted but not used by this function.

    Raises:
        ValueError: re-raised after debug logging when a coordinate cannot be
            converted to int while the result dict is being built.
    """
    text = domain[17:]
    type_, location, prediction_string = text.split(' ', 2)
    predictions = _parse_substrate_predictions(prediction_string)

    location = location.strip('().')
    coordinates = location.split('-')

    # The domain sequence spans from the first to the last coordinate.
    protein = str(utils.get_aa_sequence(feature))
    domainseq = protein[int(coordinates[0]):int(coordinates[-1])]

    # NaPDoS links exist for KS and C domains only; KS is checked first.
    napdos_prefixes = (
        ("PKS_KS",
         "http://napdos.ucsd.edu/cgi-bin/process_request.cgi?query_type=aa&amp;ref_seq_file=all_KS_public_12062011.faa&amp;Sequence=%3EKS_domain_from_antiSMASH%0D"),
        ("Condensation",
         "http://napdos.ucsd.edu/cgi-bin/process_request.cgi?query_type=aa&amp;ref_seq_file=all_C_public_12062011.faa&amp;Sequence=%3EC_domain_from_antiSMASH%0D"),
    )
    napdoslink = ""
    for marker, prefix in napdos_prefixes:
        if marker in text:
            napdoslink = prefix + domainseq
            break

    blastlink = "".join((
        "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&amp;PROGRAM=blastp&amp;BLAST_PROGRAMS=blastp&amp;QUERY=",
        domainseq,
        "&amp;LINK_LOC=protein&amp;PAGE_TYPE=BlastSearch",
    ))

    try:
        return {
            'type': type_,
            'start': int(coordinates[0]),
            'end': int(coordinates[1]),
            'predictions': predictions,
            'napdoslink': napdoslink,
            'blastlink': blastlink,
            'sequence': domainseq
        }
    except ValueError:
        # Leave a trace of the offending input before propagating.
        logging.debug('%r' % text)
        logging.debug('%r  %r' % (type_, location))
        logging.debug(coordinates)
        raise
示例#10
0
def generate_details_div(cluster,
                         seq_record,
                         options,
                         js_domains,
                         details=None):
    """Build the "Detailed annotation" div for one cluster.

    The cluster record is looked up via ``cluster['idx']``; if none is found,
    ``details`` is returned unchanged. Otherwise a ``details`` div with a
    heading is created when none was passed in.

    Every CDS of the cluster that carries ``sec_met`` qualifiers is scanned
    for entries starting with ``'NRPS/PKS Domain:'``; these are parsed with
    ``_parse_domain``. ORFs with at least one parsed domain are collected, and
    if any exist, a ``details-svg`` div is appended to ``details`` and the
    collected structure is appended to ``js_domains``.

    ``options`` is accepted but not used by this function.

    Returns:
        The ``details`` element (possibly ``None`` when no cluster record was
        found and none was passed in).
    """
    cluster_rec = utils.get_cluster_by_nr(seq_record, cluster['idx'])
    if cluster_rec is None:
        return details

    if details is None:
        details = pq('<div>')
        details.addClass('details')

        header = pq('<h3>')
        header.text('Detailed annotation')
        details.append(header)

    details_id = "cluster-%s-details" % cluster['idx']
    orfs = []
    for feature in utils.get_cluster_cds_features(cluster_rec, seq_record):
        qualifiers = feature.qualifiers
        if 'sec_met' not in qualifiers:
            continue

        # Prefer the annotated translation over translating on the fly.
        if 'translation' in qualifiers:
            sequence = qualifiers['translation'][0]
        else:
            sequence = str(utils.get_aa_sequence(feature))
        gene_id = utils.get_gene_id(feature)

        domains = []
        for qual in qualifiers['sec_met']:
            if qual.startswith('NRPS/PKS Domain:'):
                parsed = _parse_domain(qual, feature, seq_record)
                if len(parsed) > 0:
                    domains.append(parsed)

        # ORFs without any parsed domain are not reported.
        if domains:
            orfs.append({
                'id': gene_id,
                'sequence': sequence,
                'domains': domains,
            })

    js_cluster_domains = {
        'id': details_id,
        'orfs': orfs
    }
    if orfs:
        details_svg = pq('<div>')
        details_svg.addClass('details-svg')
        details_svg.attr('id', '%s-svg' % details_id)
        details.append(details_svg)

        js_domains.append(js_cluster_domains)

    return details
示例#11
0
    def test_get_aa_sequence(self):
        """Test utils.get_aa_sequence() returns a stored translation as-is"""
        feature = FakeFeature("CDS")
        feature.qualifiers['translation'] = ['MAGIC']

        translated = utils.get_aa_sequence(feature)
        self.assertEqual('MAGIC', translated)
示例#12
0
def filter_overlap(cdsfeatures):
    """Keep only the longest CDS of each group of overlapping CDS features.

    Groups come from ``find_overlapping_groups``; length is that of the amino
    acid sequence. On a tie the first feature of the group with the maximum
    length is kept.
    """
    def protein_length(feature):
        return len(utils.get_aa_sequence(feature))

    # max() returns the first maximal element, matching "first longest".
    return [max(group, key=protein_length)
            for group in find_overlapping_groups(cdsfeatures)]
示例#13
0
    def test_get_aa_sequence_to_stop(self):
        """Test utils.get_aa_sequence() cuts the translation at a stop codon"""
        feature = FakeFeature("CDS")
        feature.qualifiers['translation'] = ['MAGIC*SEQ']

        translated = utils.get_aa_sequence(feature, to_stop=True)
        self.assertEqual('MAGIC', translated)
示例#14
0
    def test_get_aa_sequence_gap(self):
        """Test utils.get_aa_sequence() drops gap characters"""
        feature = FakeFeature("CDS")
        feature.qualifiers['translation'] = ['MA-GIC']

        translated = utils.get_aa_sequence(feature)
        self.assertEqual('MAGIC', translated)
示例#15
0
def smcog_analysis(inputgenes, inputnr, seq_record, smcogdict, smcogsoutputfolder):
    """Run the smCOG tree pipeline on every feature in ``inputgenes``.

    For each feature the protein sequence is written to
    ``input<inputnr>.fasta``. If ``smcogdict`` holds a non-empty entry for the
    feature's gene id, the smCOG id (the part before the first ``":"`` of the
    entry's first element) is aligned with ``alignsmcogs``, and the alignment
    is then trimmed, turned into a tree, and converted for
    ``smcogsoutputfolder`` by the corresponding helpers.

    ``seq_record`` is accepted but not used by this function.
    """
    for feature in inputgenes:
        tag = utils.get_gene_id(feature)
        seq = str(utils.get_aa_sequence(feature))
        # Single-sequence query file used as input for the alignment.
        utils.writefasta([tag], [seq], "input" + str(inputnr) + ".fasta")
        # "in" replaces dict.has_key(), which no longer exists in Python 3.
        if tag in smcogdict and len(smcogdict[tag]) > 0:
            smcog = (smcogdict[tag][0][0]).split(":")[0]
            alignsmcogs(smcog, inputnr)
            # Generate trimmed alignment
            trimalignment(inputnr)
            # Draw phylogenetic tree
            drawtree(inputnr)
            # Convert tree to draw PNG image
            converttree(inputnr, smcogsoutputfolder, tag)
示例#16
0
def extract_nrps_genes(pksnrpscoregenes, domaindict, seq_record, extra_aa=0):
    """Extract the sequences of NRPS adenylation-type domains.

    For each feature in ``pksnrpscoregenes`` the domain entries stored in
    ``domaindict`` under the feature's gene id are scanned. Entries whose
    first field is ``"AMP-binding"`` or ``"A-OX"`` yield the protein slice
    from their second field to their third field plus ``extra_aa``, named
    ``<gene id>_A<n>`` with ``n`` counting such domains per gene from 1.

    ``seq_record`` is accepted but not used by this function.

    Returns:
        A ``(nrpsnames, nrpsseqs)`` tuple of parallel lists.
    """
    adenylation_types = ("AMP-binding", "A-OX")
    nrpsnames = []
    nrpsseqs = []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        domain_count = 0
        for domain in domaindict[locus]:
            if domain[0] not in adenylation_types:
                continue
            domain_count += 1
            begin = int(domain[1])
            stop = int(domain[2]) + extra_aa
            domain_sequence = str(utils.get_aa_sequence(feature))[begin:stop]
            nrpsnames.append(locus + "_A" + str(domain_count))
            nrpsseqs.append(domain_sequence)
    return nrpsnames, nrpsseqs
示例#17
0
def extract_pks_genes(pksnrpscoregenes, domaindict, seq_record):
    """Extract the sequences of PKS AT domains.

    For each feature in ``pksnrpscoregenes`` the domain entries stored in
    ``domaindict`` under the feature's gene id are scanned. Entries whose
    first field is ``"PKS_AT"`` yield the protein slice between their second
    and third fields (converted to int), named ``<gene id>_AT<n>`` with ``n``
    counting AT domains per gene from 1.

    ``seq_record`` is accepted but not used by this function.

    Returns:
        A ``(pksnames, pksseqs)`` tuple of parallel lists.
    """
    pksnames = []
    pksseqs = []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        at_count = 0
        for domain in domaindict[locus]:
            if domain[0] != "PKS_AT":
                continue
            at_count += 1
            begin, stop = int(domain[1]), int(domain[2])
            at_sequence = str(utils.get_aa_sequence(feature))[begin:stop]
            pksnames.append(locus + "_AT" + str(at_count))
            pksseqs.append(at_sequence)
    return pksnames, pksseqs
示例#18
0
def tresholdblasthitfilter(blastlines, minseqcoverage, minpercidentity, seqlengths, seq_record):
    """Drop BLAST lines that fail the identity or coverage thresholds.

    Args:
        blastlines: tab-separated BLAST lines. Field 0 is the query header
            (its fifth ``|``-separated part is the query accession), field 2
            the percent identity and field 3 the alignment length.
        minseqcoverage: coverage percentage a hit must exceed (strictly).
        minpercidentity: rounded percent identity a hit must exceed
            (strictly).
        seqlengths: mapping of query accession to protein length.
        seq_record: record used to look up the query protein when its
            accession is missing from ``seqlengths``.

    Returns:
        A new list with the lines that pass both thresholds, in input order.
    """
    passing = []
    # Built lazily and at most once; only needed for unknown accessions.
    feature_by_id = None
    for line in blastlines:
        tabs = line.split("\t")
        query_key = tabs[0].split("|")[4]
        # Adding 0.5 before truncation rounds to the nearest integer.
        perc_ident = int(float(tabs[2]) + 0.5)
        alignment_length = float(tabs[3])
        # "in" replaces dict.has_key(), which no longer exists in Python 3.
        if query_key in seqlengths:
            seqlength = seqlengths[query_key]
        else:
            if feature_by_id is None:
                feature_by_id = utils.get_feature_dict_protein_id(seq_record)
            seqlength = len(utils.get_aa_sequence(feature_by_id[query_key]))
        perc_coverage = (alignment_length / seqlength) * 100
        if perc_ident > minpercidentity and perc_coverage > minseqcoverage:
            passing.append(line)
    return passing
示例#19
0
def create_blast_inputs(genecluster, seq_record):
    """Prepare FASTA headers and sequences for a cluster's BLAST query.

    For every CDS of ``genecluster`` a header of the form
    ``input|c<cluster number>|<start>-<end>|<strand>|<accession>|<annotation>``
    is built, using the non-fuzzy start/end of the CDS location and ``+`` for
    strand ``1`` (``-`` otherwise).

    Returns:
        A ``(names, sequences, accessions)`` tuple of parallel lists.
    """
    queryclusternames = []
    queryclusterseqs = []
    queryclusterprotsnames = []
    for cds in utils.get_cluster_cds_features(genecluster, seq_record):
        strand = "+" if cds.strand == 1 else "-"
        cluster_tag = "c" + str(utils.get_cluster_number(genecluster))
        span = str(cds.location.nofuzzy_start) + "-" + \
            str(cds.location.nofuzzy_end)
        accession = utils.get_gene_acc(cds)
        fullname = "|".join([
            "input", cluster_tag, span, strand, accession,
            utils.get_gene_annotation(cds)
        ])
        queryclusterseqs.append(str(utils.get_aa_sequence(cds)))
        queryclusternames.append(fullname)
        queryclusterprotsnames.append(accession)

    return queryclusternames, queryclusterseqs, queryclusterprotsnames
示例#20
0
def _get_nrpspks_domains_ks(pksnrpsvars, seq_record, domain):
    """Collect named domain sequences from genes in trans-AT PKS clusters.

    Genes are taken from ``_get_transatpks_geneclusters``. For every core
    gene in that set, each entry of its ``domaindict`` list whose first field
    equals ``domain`` yields the protein slice between the entry's second and
    third fields, together with a ``|``-separated header::

        ><record id>|c|<cds start>-<cds end>|<strand>|<gene id>|<product>|
        <protein id>|<domain start>-<domain end>|KS<n>

    The characters ``: ' ( ) , ? ;`` are removed from the header. Spaces in
    the product are replaced by ``_`` and ``|`` is dropped, because ``|``
    serves as the field separator.

    Returns:
        A ``(ksnames, ksseqs)`` tuple of parallel lists; both are empty when
        no trans-AT PKS cluster is found.
    """
    transatpks_geneclusters = _get_transatpks_geneclusters(pksnrpsvars, seq_record)
    transatpks_genes = set()
    for cluster_genes in transatpks_geneclusters.values():
        transatpks_genes.update(cluster_genes)
    ksnames = []
    ksseqs = []
    if len(transatpks_geneclusters) < 1:
        return ksnames, ksseqs

    job_id = seq_record.id
    for feature in pksnrpsvars.pksnrpscoregenes:
        location = feature.location
        loc = '-'.join((str(location.nofuzzy_start),
                        str(location.nofuzzy_end)))
        strand_char = '+' if location.strand == 1 else '-'

        qualifiers = feature.qualifiers
        prot_id = ''
        if 'protein_id' in qualifiers:
            prot_id = qualifiers["protein_id"][0]
        product = ''
        if 'product' in qualifiers:
            product = qualifiers["product"][0].replace(' ', '_').replace('|', '')
            # We use | as a separator later
            assert '|' not in product, product

        gene_id = utils.get_gene_id(feature)
        if gene_id not in transatpks_genes:
            continue

        domain_count = 0
        for tab in pksnrpsvars.domaindict[gene_id]:
            if tab[0] != domain:
                continue
            domain_count += 1
            start = int(tab[1])
            end = int(tab[2])
            header = '|'.join([
                '>' + job_id, 'c', loc, strand_char, gene_id, product,
                prot_id, '-'.join((str(start), str(end))),
                'KS' + str(domain_count)
            ])
            name = re.sub(r'(\:|\'|\(|\)|\,|\?|\;)', '', header)
            seq = str(utils.get_aa_sequence(feature))[start:end]
            ksnames.append(name)
            ksseqs.append(seq)
    return ksnames, ksseqs
示例#21
0
def run_lantipred(seq_record, query, lant_class):
    """Predict the cleavage site of a candidate lanthipeptide precursor.

    The protein sequence of ``query`` (translated up to the first stop) is
    run against the HMM profile that belongs to ``lant_class``
    (``'Class-I'``, ``'Class-II'`` or ``'Class-III'``) via
    ``predict_cleavage_site``.

    On success the result gets ``leader`` and ``core`` attributes (the
    sequence before and after ``result.end``) and ``query`` is marked with a
    ``'Kind: biosynthetic'`` entry in its ``sec_met`` qualifier.

    ``seq_record`` is accepted but not used by this function.

    Returns:
        The annotated result object, or ``None`` when no cleavage site is
        predicted, the score is below ``thresh_dict[lant_class]``, or the
        core contains no cysteine.
    """
    hmmer_profiles = {
        'Class-I': 'class1.hmm',
        'Class-II': 'class2.hmm',
        'Class-III': 'class3.hmm',
    }

    query_sequence = utils.get_aa_sequence(query, to_stop=True)
    lan_a_fasta = ">%s\n%s" % (utils.get_gene_id(query), query_sequence)

    # Run the sequence against the profile of the requested class.
    profile = utils.get_full_path(__file__, hmmer_profiles[lant_class])
    result = predict_cleavage_site(profile, lan_a_fasta)

    if result is None:
        logging.debug('%r: No cleavage site predicted' %
                      utils.get_gene_id(query))
        return

    if thresh_dict[lant_class] > result.score:
        logging.debug('%r: Score %0.2f below threshold %0.2f for class %r' %
                      (utils.get_gene_id(query), result.score,
                       thresh_dict[lant_class], lant_class))
        return

    # With the cleavage position known, split into leader and core peptide.
    result.leader = query_sequence[:result.end]
    result.core = query_sequence[result.end:]
    if result.core.find('C') < 0:
        logging.debug(
            '%r: No Cysteine residues found in core, false positive' %
            utils.get_gene_id(query))
        return
    if 'sec_met' not in query.qualifiers:
        query.qualifiers['sec_met'] = []

    # Check each entry on its own. Searching the ';'-joined string for
    # ';Kind: biosynthetic' misses the marker when it is the first entry
    # (no leading ';'), which would append a duplicate.
    sec_met = query.qualifiers['sec_met']
    if not any(entry.startswith('Kind: biosynthetic') for entry in sec_met):
        sec_met.append('Kind: biosynthetic')

    return result
示例#22
0
def create_blast_inputs(genecluster, seq_record):
    """Prepare FASTA headers and sequences for a cluster's BLAST query.

    When the configured taxon is ``"plants"`` the cluster's CDS features are
    first reduced with ``filter_overlap``. For every remaining CDS a header of
    the form
    ``input|c<cluster number>|<start>-<end>|<strand>|<accession>|<annotation>``
    is built; ``<`` and ``>`` are stripped from the string form of the start
    and end positions, and the strand is ``+`` for ``1`` (``-`` otherwise).

    Returns:
        A ``(names, sequences, accessions)`` tuple of parallel lists.
    """
    def plain_position(position):
        # Drop the "<" / ">" markers from the position's string form.
        return str(position).replace(">", "").replace("<", "")

    options = config.get_config()
    cluster_cds = utils.get_cluster_cds_features(genecluster, seq_record)
    if options.taxon == "plants":
        cluster_cds = filter_overlap(cluster_cds)

    queryclusternames = []
    queryclusterseqs = []
    queryclusterprotsnames = []
    for cds in cluster_cds:
        strand = "+" if cds.strand == 1 else "-"
        cluster_tag = "c" + str(utils.get_cluster_number(genecluster))
        span = plain_position(cds.location.start) + "-" + \
            plain_position(cds.location.end)
        accession = utils.get_gene_acc(cds)
        fullname = "|".join([
            "input", cluster_tag, span, strand, accession,
            utils.get_gene_annotation(cds)
        ])
        queryclusterseqs.append(str(utils.get_aa_sequence(cds)))
        queryclusternames.append(fullname)
        queryclusterprotsnames.append(accession)

    return queryclusternames, queryclusterseqs, queryclusterprotsnames
示例#23
0
def filter_nonterminal_docking_domains(seq_record, pksnrpsvars):
    """Remove docking-domain hits that are not near a protein terminus.

    A hit whose first field is one of the docking domain names is kept only
    when it lies within 50 residues of either end of the protein: the smaller
    of its second and third fields is below 50, or the protein length minus
    the larger of those fields is below 50. All other hits are kept
    untouched.

    ``pksnrpsvars.domaindict`` is modified in place: the per-gene hit lists
    are filtered, and genes left without any hit are removed from the dict.
    """
    dockingdomains = [
        'NRPS-COM_Nterm', 'NRPS-COM_Cterm', 'PKS_Docking_Cterm',
        'PKS_Docking_Nterm'
    ]
    # Snapshot the keys: entries are deleted from the dict inside the loop,
    # which is an error while iterating a live keys view (Python 3).
    hitgenes = list(pksnrpsvars.domaindict.keys())
    feature_by_id = utils.get_feature_dict(seq_record)
    for hitgene in hitgenes:
        hitgenelength = len(utils.get_aa_sequence(feature_by_id[hitgene]))
        hits = pksnrpsvars.domaindict[hitgene]
        kept = [
            hit for hit in hits
            if hit[0] not in dockingdomains
            or hitgenelength - max(hit[1], hit[2]) < 50
            or min(hit[1], hit[2]) < 50
        ]
        # Slice assignment keeps the same list object, as the original
        # index-based deletion did.
        hits[:] = kept
        if not hits:
            del pksnrpsvars.domaindict[hitgene]
示例#24
0
def blastparse(blasttext, minseqcoverage, minpercidentity, seqlengths, seq_record):
    """Parse tabular BLAST output into per-query hit tables.

    The text is split into lines (the element after the final newline is
    dropped), then passed through ``uniqueblasthitfilter`` and
    ``tresholdblasthitfilter``. Every remaining line is parsed and grouped by
    its query header (field 0).

    Args:
        blasttext: raw tab-separated BLAST output.
        minseqcoverage: minimum coverage, forwarded to the threshold filter.
        minpercidentity: minimum identity, forwarded to the threshold filter.
        seqlengths: mapping of query accession to protein length.
        seq_record: record used for gene accessions and for protein lengths
            missing from ``seqlengths``.

    Returns:
        ``[blastdict, querylist, hitclusters]`` where

        * ``blastdict`` maps each query to ``[subjectlist, querydict]``;
          ``subjectlist`` holds the subject names in order of appearance and
          ``querydict`` maps a subject name to ``[genecluster, start, end,
          strand, annotation, perc_ident, blastscore, perc_coverage, evalue,
          locustag]``;
        * ``querylist`` holds the queries in order of first appearance;
        * ``hitclusters`` holds the subject gene clusters in order of first
          appearance. For taxon ``"plants"`` only clusters with at least one
          hit above 60% identity are kept.

    Note:
        Hits are stored under their query as soon as they are parsed. The
        earlier first/last/middle-line bookkeeping left ``blastdict`` empty
        for a single-line result, and dropped the previous query's hits when
        the final line started a new query.
    """
    options = config.get_config()
    # A set gives constant-time membership tests in the loop below.
    geneclustergenes = set(
        utils.get_gene_acc(cds)
        for cds in utils.get_withincluster_cds_features(seq_record))
    blastdict = {}
    querylist = []
    hitclusters = []
    percid_per_cluster = {}
    blastlines = blasttext.split("\n")[:-1]
    blastlines = uniqueblasthitfilter(blastlines)
    blastlines = tresholdblasthitfilter(blastlines, minseqcoverage, minpercidentity, seqlengths, seq_record)
    # Built lazily and at most once; only needed for unknown accessions.
    feature_by_id = None
    for line in blastlines:
        tabs = line.split("\t")
        query = tabs[0]
        subject_parts = tabs[1].split("|")
        subject = subject_parts[4]
        if subject == "no_locus_tag":
            subject = subject_parts[6]
        if subject in geneclustergenes:
            subject = "h_" + subject
        if len(subject_parts) > 6:
            locustag = subject_parts[6]
        else:
            locustag = ""
        subject_genecluster = subject_parts[0] + "_" + subject_parts[1]
        subject_span = subject_parts[2].split("-")
        subject_start = subject_span[0]
        subject_end = subject_span[1]
        subject_strand = subject_parts[3]
        subject_annotation = subject_parts[5]
        # Adding 0.5 before truncation rounds to the nearest integer.
        perc_ident = int(float(tabs[2]) + 0.5)
        evalue = str(tabs[10])
        blastscore = int(float(tabs[11]) + 0.5)
        query_key = query.split("|")[4]
        # "in" replaces dict.has_key(), which no longer exists in Python 3.
        if query_key in seqlengths:
            seqlength = seqlengths[query_key]
        else:
            if feature_by_id is None:
                feature_by_id = utils.get_feature_dict_protein_id(seq_record)
            seqlength = len(utils.get_aa_sequence(feature_by_id[query_key]))
        perc_coverage = (float(tabs[3]) / seqlength) * 100

        # Open a new hit table the first time a query is seen.
        if query not in blastdict:
            querylist.append(query)
            blastdict[query] = [[], {}]
        subjectlist, querydict = blastdict[query]
        subjectlist.append(subject)
        querydict[subject] = [subject_genecluster, subject_start, subject_end,
                              subject_strand, subject_annotation, perc_ident,
                              blastscore, perc_coverage, evalue, locustag]

        if subject_genecluster not in percid_per_cluster:
            hitclusters.append(subject_genecluster)
            percid_per_cluster[subject_genecluster] = []
        percid_per_cluster[subject_genecluster].append(perc_ident)

    # For plants, only keep hit clusters with at least one hit > 60% ID
    if options.taxon == "plants":
        hitclusters = [
            cluster for cluster in hitclusters
            if any(int(pid) > 60 for pid in percid_per_cluster[cluster])
        ]
    return [blastdict, querylist, hitclusters]
示例#25
0
文件: js.py 项目: chevrm/transPACT
def get_description(record, feature, type_, options):
    """Build the HTML description shown for a feature.

    A ``%``-style template is assembled piece by piece depending on the
    feature's qualifiers and on ``options`` (``smcogs``, ``input_type``,
    ``knownclusterblast``, ``searchgtr_links``), and then filled from a
    ``replacements`` dict.

    Args:
        record: the sequence record the feature belongs to; its ``id`` and
            length are used for the genomic context link.
        feature: the feature to describe.
        type_: feature type; ``'transport'`` adds a TransportDB BLAST link.
        options: run options, see above.

    Returns:
        The filled-in HTML string.
    """
    replacements = {
        'locus_tag': ", ".join(feature.qualifiers.get('locus_tag', ['-'])),
        'protein_id': ", ".join(feature.qualifiers.get('protein_id', ['-'])),
        'smcog': '-',
        'ecnumber': '-',
        'transport_blast_line': '',
        'smcog_tree_line': '',
        'searchgtr_line': '',
        'start': int(feature.location.start) + 1,
        'end': int(feature.location.end),
        'model_details': get_model_details(feature),
        'asf': ''
    }

    blastp_url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&" \
                 "PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=%s&" \
                 "LINK_LOC=protein&PAGE_TYPE=BlastSearch"
    genomic_context_url = "http://www.ncbi.nlm.nih.gov/projects/sviewer/?" \
                          "Db=gene&DbFrom=protein&Cmd=Link&noslider=1&"\
                          "id=%s&from=%s&to=%s"
    template = '<span class="svgene-tooltip-bold">%(product)s</span><br>\n'
    template += 'Locus-tag: %(locus_tag)s; Protein-ID: %(protein_id)s<br>\n'
    if 'EC_number' in feature.qualifiers:
        template += "EC-number(s): %(ecnumber)s<br>\n"
    if options.smcogs:
        template += "smCOG: %(smcog)s<br>\n"
    if options.input_type == 'nucl':
        template += "Location: %(start)s - %(end)s<br><br>\n"
    if 'sec_met' in feature.qualifiers:
        template += '<span class="bold">Signature pHMM hits:</span><br>\n%(model_details)s<br>\n'

    if options.knownclusterblast:

        mibig_homology_path = glob(
            os.path.join(options.full_outputfolder_path, "knownclusterblast",
                         "cluster*",
                         utils.get_gene_acc(feature) + '_mibig_hits.txt'))
        if mibig_homology_path:
            # Only the first matching hits file is linked.
            mibig_homology_file = mibig_homology_path[0]
            generate_html_table(mibig_homology_file)
            html_file = mibig_homology_file.split('.txt')[0] + '.html'
            # Make the link relative to the output folder.
            replacements['mibig_homology_path'] = html_file[
                len(options.full_outputfolder_path) + 1:]
            template += '<a href="%(mibig_homology_path)s" target="_new">MiBIG Hits</a><br><br>\n'
    template += """
%(transport_blast_line)s
%(searchgtr_line)s
<a href="%(blastp_url)s" target="_new">NCBI BlastP on this gene</a><br>
<a href="%(genomic_context_url)s" target="_new">View genomic context</a><br>
%(smcog_tree_line)s<br>"""
    # Evaluated once and reused for both the template and the replacement.
    asf_predictions = get_ASF_predictions(feature)
    if not asf_predictions == "":
        template += '<span class="bold">Active Site Finder results:</span><br>\n%(asf)s<br><br>\n'
    template += """AA sequence: <a href="javascript:copyToClipboard('%(sequence)s')">Copy to clipboard</a><br>"""

    if not options.smcogs:
        del replacements['smcog']
    if options.input_type == 'prot':
        del replacements['start']
        del replacements['end']

    replacements['product'] = feature.qualifiers.get('product', ['-'])[0]
    # Prefer the annotated translation over translating on the fly.
    if 'translation' in feature.qualifiers:
        sequence = feature.qualifiers['translation'][0]
    else:
        sequence = str(utils.get_aa_sequence(feature))
    replacements['blastp_url'] = blastp_url % sequence
    replacements['sequence'] = sequence
    if len(sequence) > 2000:
        len_seq = 30
    else:
        # Floor division keeps this an integer under Python 3 as well.
        len_seq = (len(sequence) // 80) + 1
    replacements['len_seq'] = len_seq
    replacements['genomic_context_url'] = genomic_context_url % \
                    ( record.id,
                      max(feature.location.start - 9999, 0),
                      min(feature.location.end + 10000, len(record)) )
    if 'EC_number' in feature.qualifiers:
        replacements['ecnumber'] = ", ".join(
            feature.qualifiers.get('EC_number', ['-']))
    else:
        del replacements['ecnumber']

    if options.smcogs:
        for note in feature.qualifiers.get('note', []):
            if note.startswith('smCOG:') and '(' in note:
                text = note[6:].split('(', 1)[0]
                smcog, desc = text.split(':', 1)
                desc = desc.replace('_', ' ')
                replacements['smcog'] = '%s (%s)' % (smcog, desc)
            elif note.startswith('smCOG tree PNG image:'):
                entry = '<a href="%s" target="_new">View smCOG seed phylogenetic tree with this gene</a>'
                url = note.split(':')[-1]
                replacements['smcog_tree_line'] = entry % url

    if type_ == 'transport':
        url = "http://blast.jcvi.org/er-blast/index.cgi?project=transporter;" \
              "program=blastp;database=pub/transporter.pep;" \
              "sequence=sequence%%0A%s" % sequence
        # NOTE(review): this anchor is never closed with </a>; left as-is to
        # keep the emitted HTML unchanged.
        transport_blast_line = '<a href="%s" target="_new">TransportDB BLAST on this gene<br>' % url
        replacements['transport_blast_line'] = transport_blast_line

    # "in" replaces dict.has_key(), which no longer exists in Python 3.
    searchgtr_key = record.id + "_" + utils.get_gene_id(feature)
    if searchgtr_key in options.searchgtr_links:
        url = options.searchgtr_links[searchgtr_key]
        # NOTE(review): this anchor is never closed with </a> either.
        searchgtr_line = '<a href="%s" target="_new">SEARCHGTr on this gene<br>' % url
        replacements['searchgtr_line'] = searchgtr_line
    if asf_predictions == "":
        del replacements['asf']
    else:
        replacements['asf'] = asf_predictions

    return template % replacements