示例#1
0
def run_minowa_predictor_pks_cal(pksnrpscoregenes, domaindict, seq_record,
                                 options):
    """Extract PKS CAL domain sequences and run the Minowa CAL predictor.

    For every feature in ``pksnrpscoregenes`` the domain entries stored in
    ``domaindict`` under the feature's gene id are scanned. Each entry whose
    first field is ``"CAL_domain"`` contributes the slice of the feature's
    amino-acid sequence between the entry's second and third fields, named
    ``<gene id>_CAL<n>`` with ``n`` counting CAL domains per gene from 1.

    When at least one domain was found, the sequences are written to
    ``ctg<record_idx>_calseqs.fasta`` in
    ``options.raw_predictions_outputfolder`` and
    ``minowa_CAL.run_minowa_cal`` is invoked on that file inside a
    ``TemporaryDirectory(change=True)`` context, with
    ``ctg<record_idx>_minowa_calpredoutput.txt`` as its second argument.

    ``seq_record`` is accepted but not used by this function.

    Returns:
        Tuple ``(calnames, calseqs)`` of parallel lists.
    """
    logging.info(
        "Predicting CAL domain substrate specificities by Minowa et al. method"
    )
    calnames = []
    calseqs = []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        domain_count = 0
        for domain in domaindict[locus]:
            if domain[0] != "CAL_domain":
                continue
            domain_count += 1
            first = int(domain[1])
            last = int(domain[2])
            domain_seq = str(utils.get_aa_sequence(feature))[first:last]
            calnames.append(locus + "_CAL" + str(domain_count))
            calseqs.append(domain_seq)

    # Nothing to predict when no CAL domain was found.
    if not calnames:
        return calnames, calseqs

    prefix = (options.raw_predictions_outputfolder + os.sep + "ctg" +
              str(options.record_idx))
    fasta_path = prefix + "_calseqs.fasta"
    result_path = prefix + "_minowa_calpredoutput.txt"
    utils.writefasta(calnames, calseqs, fasta_path)
    with TemporaryDirectory(change=True):
        minowa_CAL.run_minowa_cal(fasta_path, result_path)
    return calnames, calseqs
示例#2
0
def run_kr_stereochemistry_predictions(pksnrpscoregenes, domaindict,
                                       seq_record, options):
    """Extract PKS KR domain sequences and run the KR fingerprint analysis.

    For every feature in ``pksnrpscoregenes`` the domain entries stored in
    ``domaindict`` under the feature's gene id are scanned. Each entry whose
    first field is ``"PKS_KR"`` contributes the slice of the feature's
    amino-acid sequence between the entry's second and third fields, named
    ``<gene id>_KR<n>`` with ``n`` counting KR domains per gene from 1.

    When at least one domain was found, the sequences are written to
    ``ctg<record_idx>_krseqs.fasta`` in
    ``options.raw_predictions_outputfolder`` and
    ``kr_analysis.run_kr_analysis`` is invoked on that file inside a
    ``TemporaryDirectory(change=True)`` context, with
    ``ctg<record_idx>_krpredoutput.txt`` as its second argument.

    ``seq_record`` is accepted but not used by this function.

    Returns:
        Tuple ``(krnames, krseqs)`` of parallel lists.
    """
    logging.info("Predicting PKS KR activity and stereochemistry using KR " \
        "fingerprints from Starcevic et al.")
    krnames = []
    krseqs = []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        domain_count = 0
        for domain in domaindict[locus]:
            if domain[0] != "PKS_KR":
                continue
            domain_count += 1
            first = int(domain[1])
            last = int(domain[2])
            domain_seq = str(utils.get_aa_sequence(feature))[first:last]
            krnames.append(locus + "_KR" + str(domain_count))
            krseqs.append(domain_seq)

    # Nothing to analyse when no KR domain was found.
    if not krnames:
        return krnames, krseqs

    prefix = (options.raw_predictions_outputfolder + os.sep + "ctg" +
              str(options.record_idx))
    fasta_path = prefix + "_krseqs.fasta"
    result_path = prefix + "_krpredoutput.txt"
    utils.writefasta(krnames, krseqs, fasta_path)
    with TemporaryDirectory(change=True):
        kr_analysis.run_kr_analysis(fasta_path, result_path)
    return krnames, krseqs
示例#3
0
def run_minowa_predictor_pks_at(pksnames, pksseqs, options):
    """Run both PKS AT domain specificity predictors on the given sequences.

    The sequences are written to ``ctg<record_idx>_pksseqs.fasta`` in
    ``options.raw_predictions_outputfolder``. Two analyses are then run on
    that file, each inside its own ``TemporaryDirectory(change=True)``
    context:

    * ``PKS_analysis.run_pkssignature_analysis``, given
      ``ctg<record_idx>_pkssignatures.txt`` as its second argument;
    * ``minowa_AT.run_minowa_at``, given
      ``ctg<record_idx>_minowa_pkspredoutput.txt`` as its second argument.

    Nothing is returned.
    """
    out_dir = options.raw_predictions_outputfolder
    contig = "ctg" + str(options.record_idx)

    utils.writefasta(pksnames, pksseqs,
                     out_dir + os.sep + contig + "_pksseqs.fasta")

    # PKS signature analysis
    logging.info(
        "Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences"
    )
    signature_input = path.join(out_dir, contig + "_pksseqs.fasta")
    signature_output = path.join(out_dir, contig + "_pkssignatures.txt")
    with TemporaryDirectory(change=True):
        PKS_analysis.run_pkssignature_analysis(signature_input,
                                               signature_output)

    # Minowa method
    logging.info(
        "Predicting PKS AT domain substrate specificities by Minowa et al. method"
    )
    minowa_input = out_dir + os.sep + contig + "_pksseqs.fasta"
    minowa_output = out_dir + os.sep + contig + "_minowa_pkspredoutput.txt"
    with TemporaryDirectory(change=True):
        minowa_AT.run_minowa_at(minowa_input, minowa_output)
示例#4
0
def extract_cterminus(da_dir, clusterpksgenes, seq_record, endinggene,
                      feature_by_id):
    """Locate C-terminal docking residues for every gene but the ending one.

    For each key in ``clusterpksgenes`` other than ``endinggene``, the last
    100 residues of the corresponding feature's amino-acid sequence are
    written to ``input.fasta`` and profile-aligned with ``muscle`` against
    ``cterm.fasta`` from ``da_dir``; the alignment is written to
    ``muscle.fasta``. ``extractpositions`` is then called on that alignment
    with positions ``[55, 64]``, reference name ``"EryAII_ref"`` and the gene
    name, and its result is stored under the gene name.

    Both scratch files are created relative to the current working
    directory. ``seq_record`` is accepted but not used by this function.

    Returns:
        Dict mapping gene name to the value returned by ``extractpositions``.
    """
    cterm_file = os.path.join(da_dir, 'cterm.fasta')
    # Gather (gene, tail) pairs up front, then align them one at a time.
    tails = []
    for gene in clusterpksgenes:
        if gene == endinggene:
            continue
        full_seq = str(utils.get_aa_sequence(feature_by_id[gene]))
        tails.append((gene, full_seq[-100:]))

    query_fasta = "input.fasta"
    ctermintresdict = {}
    for gene, tail in tails:
        utils.writefasta([gene], [tail], query_fasta)
        utils.execute([
            "muscle", "-profile", "-quiet", "-in1", cterm_file, "-in2",
            "input.fasta", "-out", "muscle.fasta"
        ])
        ctermintresdict[gene] = extractpositions("muscle.fasta", [55, 64],
                                                 "EryAII_ref", gene)
    return ctermintresdict
示例#5
0
def extract_nterminus(da_dir, clusterpksgenes, seq_record, startergene,
                      feature_by_id):
    """Locate N-terminal docking residues for every gene but the starter.

    For each key in ``clusterpksgenes`` other than ``startergene``, the first
    50 residues of the corresponding feature's amino-acid sequence are
    written to ``input.fasta`` and profile-aligned with ``muscle`` against
    ``nterm.fasta`` from ``da_dir``; the alignment is written to
    ``muscle.fasta``. ``extractpositions`` is then called on that alignment
    with positions ``[2, 15]``, reference name ``"EryAIII_5_6_ref"`` and the
    gene name, and its result is stored under the gene name.

    Both scratch files are created relative to the current working
    directory. ``seq_record`` is accepted but not used by this function.

    Returns:
        Dict mapping gene name to the value returned by ``extractpositions``.
    """
    nterm_file = os.path.join(da_dir, 'nterm.fasta')
    # Gather (gene, head) pairs up front, then align them one at a time.
    heads = []
    for gene in clusterpksgenes:
        if gene == startergene:
            continue
        full_seq = str(utils.get_aa_sequence(feature_by_id[gene]))
        heads.append((gene, full_seq[:50]))

    query_fasta = "input.fasta"
    ntermintresdict = {}
    for gene, head in heads:
        utils.writefasta([gene], [head], query_fasta)
        utils.execute([
            "muscle", "-profile", "-quiet", "-in1", nterm_file, "-in2",
            "input.fasta", "-out", "muscle.fasta"
        ])
        ntermintresdict[gene] = extractpositions("muscle.fasta", [2, 15],
                                                 "EryAIII_5_6_ref", gene)
    return ntermintresdict
示例#6
0
def perform_clusterblast(options, seq_record, clusters, proteinlocations, proteinstrands, proteinannotations, proteintags):
    """Run a DIAMOND ClusterBlast search for each gene cluster of a record.

    For every cluster feature of ``seq_record`` the query proteins are
    written to ``input.fasta`` inside a ``TemporaryDirectory(change=True)``
    context, searched with ``run_diamond`` against ``plantgeneclusterprots``
    (when ``options.taxon == "plants"``) or ``geneclusterprots`` in
    ``options.clusterblastdir``, converted with ``convert_to_tabular``,
    parsed, scored, and handed to ``write_clusterblast_output`` bundled in a
    ``utils.Storage`` object.

    When ``options.debug`` is set and
    ``<options.dbgclusterblast>/clusterblast/cluster<N>.txt`` exists, the
    cluster is skipped entirely by this function.

    Non-zero return codes from DIAMOND or the conversion step are logged as
    errors; processing then continues with whatever ``input.out`` contains.

    Nothing is returned.
    """
    logging.info("Running DIAMOND gene cluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            # The debug path is only built when debugging is enabled, so the
            # concatenation never touches dbgclusterblast otherwise.
            if options.debug:
                debug_file = (options.dbgclusterblast + os.sep +
                              "clusterblast" + os.sep + "cluster" +
                              str(clusternumber) + ".txt")
                if os.path.exists(debug_file):
                    # Pass the whole path as a logging argument: the former
                    # '"...%s..." % a + b + c' applied % to the first piece
                    # only and appended the rest after "instead".
                    logging.debug(
                        "Skipping Clusterblast calculations, using results from %s instead",
                        debug_file)
                    continue

            logging.info("   Gene cluster " + str(clusternumber))
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(genecluster, seq_record)
            utils.writefasta(queryclusternames, queryclusterseqs, "input.fasta")

            if options.taxon == "plants":
                database = "plantgeneclusterprots"
            else:
                database = "geneclusterprots"
            out, err, retcode = run_diamond(
                "input.fasta", path.join(options.clusterblastdir, database),
                tempdir, options)
            if retcode != 0:
                logging.error("Running diamond failed: returned %s, stderr: %r, stdout: %r", retcode, err, out)
            out, err, retcode = convert_to_tabular(tempdir)
            if retcode != 0:
                logging.error("Converting daa failed: returned %s, stderr: %r, stdout: %r", retcode, err, out)

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

            write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput)
            logging.info("   DIAMOND search finished. Parsing results...")
            minseqcoverage = 10
            minpercidentity = 30
            blastdict, querylist, hitclusters = parse_blast(blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [utils.get_gene_acc(cds) for cds in utils.get_secmet_cds_features(seq_record)]
            rankedclusters, rankedclustervalues, _, hitclusterdata = score_clusterblast_output(blastdict, querylist, hitclusters, clusters, allcoregenes)

            # Store all clusterblast related data in a utils.Storage object.
            clusterblastStorage = utils.Storage()
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.clusters = clusters
            clusterblastStorage.hitclusterdata = hitclusterdata
            clusterblastStorage.rankedclusters = rankedclusters
            clusterblastStorage.rankedclustervalues = rankedclustervalues
            clusterblastStorage.proteintags = proteintags
            clusterblastStorage.proteinlocations = proteinlocations
            clusterblastStorage.proteinannotations = proteinannotations
            clusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options, seq_record, clusterblastStorage)
示例#7
0
def write_clusterblast_inputfiles(options, queryclusternames, queryclusterseqs):
    """Split the query sequences over ``options.cpus`` FASTA input files.

    File ``input<i>.fasta`` (``i`` from 0 to ``options.cpus - 1``) receives
    the i-th run of ``len(queryclusternames) // options.cpus`` consecutive
    entries. The last file takes everything from its start position to the
    end of the lists, so it also absorbs the remainder.
    """
    chunk = len(queryclusternames) // options.cpus
    final_part = options.cpus - 1
    for part in range(options.cpus):
        begin = part * chunk
        # An open-ended slice for the last file picks up the remainder.
        stop = None if part == final_part else begin + chunk
        utils.writefasta(queryclusternames[begin:stop],
                         queryclusterseqs[begin:stop],
                         "input" + str(part) + ".fasta")
示例#8
0
def internal_homology_blast(seq_record):
    """Search each gene cluster's proteins against themselves.

    Inside a ``TemporaryDirectory(change=True)`` context, the proteins of
    every cluster feature of ``seq_record`` are written to
    ``internal_input.fasta``, searched with ``run_internal_blastsearch`` and
    parsed with ``parse_blast`` using the thresholds 25 and 30. The running
    result dict is passed through ``find_internal_orthologous_groups``
    together with the parsed hits, the query names and the cluster number,
    and replaced by that function's return value.

    Returns:
        The dict produced by the last ``find_internal_orthologous_groups``
        call, or an empty dict when the record has no clusters.
    """
    # The returned configuration is not used further in this function.
    options = config.get_config()
    with TemporaryDirectory(change=True):
        logging.info("Finding internal homologs in each gene cluster..")
        internalhomologygroupsdict = {}
        for cluster_feature in utils.get_sorted_cluster_features(seq_record):
            number = utils.get_cluster_number(cluster_feature)
            names, seqs, _ = create_blast_inputs(cluster_feature, seq_record)
            utils.writefasta(names, seqs, "internal_input.fasta")
            raw_hits = run_internal_blastsearch()
            hits_by_query, _, _ = parse_blast(raw_hits, seq_record, 25, 30)
            internalhomologygroupsdict = find_internal_orthologous_groups(
                internalhomologygroupsdict, hits_by_query, names, number)
    return internalhomologygroupsdict
示例#9
0
def run_nrpspredictor(seq_record, nrpsnames, nrpsseqs, options):
    """Run the NRPSPredictor2 code prediction and SVM on the given sequences.

    Everything runs inside a ``TemporaryDirectory(change=True)`` context:

    1. ``nrpsnames``/``nrpsseqs`` are written to ``nrpsseqs.fasta``.
    2. ``nrpscodepred.run_nrpscodepred(options)`` is called; the Java step
       below reads ``input.sig``, which is presumably produced by that call.
    3. The NRPSPredictor2 jar is executed with ``java``; its ``-r`` argument
       is ``ctg<record_idx>_nrpspredictor2_svm.txt`` in
       ``options.raw_predictions_outputfolder``, and ``-b`` is ``'1'`` when
       ``options.eukaryotic`` is truthy, else ``'0'``.
    4. ``ctg<record_idx>_nrpspredictor2_codes.txt`` is moved from the working
       directory into ``options.raw_predictions_outputfolder``, replacing any
       previous copy.

    Anything the Java process writes to stderr is logged at debug level.
    ``seq_record`` is accepted but not used by this function. Nothing is
    returned.
    """
    with TemporaryDirectory(change=True):
        nrpsseqs_file = "nrpsseqs.fasta"
        NRPSPredictor2_dir = utils.get_full_path(__file__, "NRPSPredictor2")
        utils.writefasta(nrpsnames, nrpsseqs, nrpsseqs_file)
        # Get NRPSPredictor2 code predictions, output sig file for input for
        # the NRPSPredictor2 SVMs.
        nrpscodepred.run_nrpscodepred(options)

        # Run the NRPSPredictor2 SVM.
        datadir = path.join(NRPSPredictor2_dir, 'data')
        libdir = path.join(NRPSPredictor2_dir, 'lib')
        jarfile = path.join(NRPSPredictor2_dir, 'build', 'NRPSpredictor2.jar')
        classpath = [
            jarfile,
            '%s/java-getopt-1.0.13.jar' % libdir,
            '%s/Utilities.jar' % libdir,
            '%s/libsvm.jar' % libdir
        ]
        # os.pathsep is ':' on POSIX and ';' on Windows. The previous
        # sys.platform comparison left the separator undefined on every
        # platform other than "linux2", "darwin" and "win32".
        commands = [
            'java',
            '-Ddatadir=%s' % datadir, '-cp',
            os.pathsep.join(classpath),
            'org.roettig.NRPSpredictor2.NRPSpredictor2', '-i', 'input.sig',
            '-r',
            path.join(
                options.raw_predictions_outputfolder,
                "ctg" + str(options.record_idx) + '_nrpspredictor2_svm.txt'),
            '-s', '1', '-b', '1' if options.eukaryotic else '0'
        ]
        err = utils.execute(commands)[1]
        if err != '':
            logging.debug('running nrpspredictor2 gave error %r', err)

        # Move the code predictions into the output folder, removing any copy
        # already there first.
        codes_file = "ctg" + str(options.record_idx) + "_nrpspredictor2_codes.txt"
        try:
            os.remove(path.join(options.raw_predictions_outputfolder,
                                codes_file))
        except OSError:
            # Best effort: typically there is simply no previous file.
            pass
        shutil.move(codes_file, options.raw_predictions_outputfolder)
示例#10
0
def smcog_analysis(inputgenes, inputnr, seq_record, smcogdict, smcogsoutputfolder):
    """Run the smCOG alignment and tree pipeline for the given CDS features.

    For each feature in ``inputgenes`` the amino-acid sequence is written to
    ``input<inputnr>.fasta`` under the feature's gene id. When ``smcogdict``
    has a non-empty entry for that gene id, the smCOG identifier is taken
    from the text before the first ``":"`` in ``smcogdict[id][0][0]`` and the
    steps ``alignsmcogs``, ``trimalignment``, ``drawtree`` and
    ``converttree`` are run in that order.

    ``seq_record`` is accepted but not used by this function. Nothing is
    returned.
    """
    for feature in inputgenes:
        tag = utils.get_gene_id(feature)
        seq = str(utils.get_aa_sequence(feature))
        # Create the input FASTA file with the single query sequence that is
        # used as input for the MSA.
        utils.writefasta([tag], [seq], "input" + str(inputnr) + ".fasta")
        # "in" replaces dict.has_key(), which no longer exists in Python 3.
        if tag in smcogdict and len(smcogdict[tag]) > 0:
            smcog = (smcogdict[tag][0][0]).split(":")[0]
            alignsmcogs(smcog, inputnr)
            # Generate trimmed alignment
            trimalignment(inputnr)
            # Draw phylogenetic tree
            drawtree(inputnr)
            # Convert tree to draw PNG image
            converttree(inputnr, smcogsoutputfolder, tag)
示例#11
0
def internal_homology_blast(seq_record):
    """Search each gene cluster's proteins against themselves.

    Inside a ``TemporaryDirectory(change=True)`` context, the proteins of
    every cluster feature of ``seq_record`` are written to
    ``internal_input.fasta``, searched with ``run_internal_blastsearch`` and
    parsed with ``blastparse`` using the thresholds 25 and 30. The groups
    returned by ``find_internal_orthologous_groups`` for the parsed queries
    and the query names are stored per cluster.

    Returns:
        Dict mapping cluster number to the value returned by
        ``find_internal_orthologous_groups`` for that cluster.
    """
    with TemporaryDirectory(change=True):
        logging.debug("Finding internal homologs in each gene cluster..")
        internalhomologygroups = {}
        for cluster_feature in utils.get_sorted_cluster_features(seq_record):
            number = utils.get_cluster_number(cluster_feature)
            names, seqs, _ = create_blast_inputs(cluster_feature, seq_record)
            utils.writefasta(names, seqs, "internal_input.fasta")
            raw_hits = run_internal_blastsearch()
            queries, _ = blastparse(raw_hits, 25, 30, seq_record)
            internalhomologygroups[number] = find_internal_orthologous_groups(
                queries, names)
    return internalhomologygroups
示例#12
0
def run_minowa_predictor_nrps(pksnrpscoregenes, domaindict, seq_record,
                              options):
    """Run the Minowa predictor on the NRPS sequences of a record.

    The sequences come from ``extract_nrps_genes`` called with
    ``extra_aa=0``. They are written to ``ctg<record_idx>_nrpsseqs.fasta`` in
    ``options.raw_predictions_outputfolder``, and ``minowa_A.run_minowa_a``
    is invoked on that file inside a ``TemporaryDirectory(change=True)``
    context, with ``ctg<record_idx>_minowa_nrpspredoutput.txt`` as its second
    argument.

    Nothing is returned.
    """
    logging.info("Predicting NRPS A domain substrate specificities by Minowa " \
        "et al. method")
    domain_names, domain_seqs = extract_nrps_genes(
        pksnrpscoregenes, domaindict, seq_record, extra_aa=0)

    contig = "ctg" + str(options.record_idx)
    fasta_path = path.join(options.raw_predictions_outputfolder,
                           contig + "_nrpsseqs.fasta")
    result_path = path.join(options.raw_predictions_outputfolder,
                            contig + "_minowa_nrpspredoutput.txt")

    utils.writefasta(domain_names, domain_seqs, fasta_path)
    with TemporaryDirectory(change=True):
        minowa_A.run_minowa_a(fasta_path, result_path)
示例#13
0
def run_sandpuma(seq_record, nrpsnames, nrpsseqs, options):
    """Run SANDPUMA on the given NRPS sequences, or reuse earlier results.

    Five result files are placed in ``options.raw_predictions_outputfolder``,
    each named with the prefix ``ctg<record_idx>``.

    * When ``options.dbgsandpuma`` is not the empty string, the five files
      are copied from that directory and the function returns.
    * Otherwise the sequences are written to ``input_adomains.fasta`` inside
      a ``TemporaryDirectory(change=True)`` context,
      ``predictnrps_nodep_par.sh`` from the ``sandpuma`` directory is
      executed on it, and the files it leaves in the working directory are
      moved to the output folder under their final names.

    ``seq_record`` is accepted but not used by this function.

    Raises:
        RuntimeError: if the SANDPUMA script wrote anything to stderr.
    """
    contig = "ctg" + str(options.record_idx)
    # (file name in the SANDPUMA working directory, final file name)
    result_files = [
        ("query.rep", contig + "_nrpspredictor3_svm.txt"),
        ("ind.res.tsv", contig + "_ind.res.tsv"),
        ("pid.res.tsv", contig + "_pid.res.tsv"),
        ("sandpuma.tsv", contig + "_sandpuma.tsv"),
        ("ens.res.tsv", contig + "_ens.res.tsv"),
    ]

    # In debug mode, simply copy over previous predictions.
    if options.dbgsandpuma != '':
        for _, final_name in result_files:
            shutil.copy(
                path.join(options.dbgsandpuma, final_name),
                path.join(options.raw_predictions_outputfolder, final_name))
        return

    sandpumadir = utils.get_full_path(__file__, "sandpuma")
    with TemporaryDirectory(change=True):
        # Write the sequences to the FASTA file SANDPUMA reads.
        nrpsseqs_file = "input_adomains.fasta"
        utils.writefasta(nrpsnames, nrpsseqs, nrpsseqs_file)

        # Run SANDPUMA on the FASTA file.
        script = sandpumadir + os.sep + 'predictnrps_nodep_par.sh'
        sandpuma_command = [
            script, 'input_adomains.fasta', sandpumadir,
            str(options.cpus)
        ]
        err = utils.execute(sandpuma_command)[1]
        if err != '':
            logging.error('Running SANDPUMA gave an error')
            raise RuntimeError("Sandpuma failed to run: %s" % err)

        # Move the results out of the working directory.
        for produced_name, final_name in result_files:
            shutil.move(
                produced_name,
                options.raw_predictions_outputfolder + os.sep + final_name)
示例#14
0
def trimalignment(inputnr):
    """Trim ``muscle<inputnr>.fasta`` to its conserved core.

    The alignment is read from the current working directory. All columns
    before the first, and after the last, column whose top-ranked symbol is
    not a gap and occurs in more than ``nrseqs / 3`` sequences are removed.
    The trimmed sequences are written to ``trimmed_alignment<inputnr>.fasta``
    under their original names.

    If no column qualifies, the sequences are written untrimmed.

    Raises:
        IndexError: if the file contains no sequences, or if a sequence is
            longer than the first one.
        KeyError: if a sequence contains a symbol other than an upper-case
            letter (excluding ``O``) or ``-``.
    """
    # Read the whole file; the context manager closes the handle, which the
    # previous bare open() never did.
    with open("muscle" + str(inputnr) + ".fasta", "r") as musclefile:
        filetext = musclefile.read()
    filetext = filetext.replace("\r", "\n")
    lines = filetext.split("\n")

    # Combine all sequence lines of a record into a single line.
    nrlines = len(lines)
    # Drop the final element (presumably the empty string that follows the
    # file's trailing newline).
    lines = lines[:-1]
    lines2 = []
    seq = ""
    for index, line in enumerate(lines):
        if index == (nrlines - 2):
            # Last line: flush the sequence that is still being collected.
            seq = seq + line
            lines2.append(seq)
        # startswith() tolerates blank lines, which appear when CRLF line
        # endings are rewritten above; indexing line[0] raised IndexError.
        if line.startswith(">"):
            lines2.append(seq)
            seq = ""
            lines2.append(line)
        else:
            seq = seq + line
    # The first entry is the empty sequence flushed before the first header.
    lines = lines2[1:]

    # Retrieve names and seqs from the combined lines.
    seqs = []
    names = []
    for line in lines:
        if line.startswith(">"):
            names.append(line[1:])
        else:
            seqs.append(line)

    # Count, per alignment column, how often each symbol occurs.
    lenseqs = len(seqs[0])
    nrseqs = len(seqs)
    symbols = "ABCDEFGHIJKLMNPQRSTUVWXYZ-"
    conservationlist = [dict.fromkeys(symbols, 0) for _ in range(lenseqs)]
    for sequence in seqs:
        for column, residue in enumerate(sequence):
            conservationlist[column][residue] += 1

    # NOTE(review): nrseqs / 3 is integer division under Python 2 and true
    # division under Python 3; the expression is kept as it was.
    threshold = nrseqs / 3

    # Find the first conserved column. utils.sortdictkeysbyvaluesrev
    # presumably returns the symbols ordered by descending count.
    firstsharedaa = 0
    for nr, counts in enumerate(conservationlist):
        ranked = utils.sortdictkeysbyvaluesrev(counts)
        if ranked[0] != "-" and counts[ranked[0]] > threshold:
            firstsharedaa = nr
            break

    # Find the last conserved column by scanning from the end; the slice end
    # is exclusive, so lenseqs - nr keeps that column.
    lastsharedaa = lenseqs
    for nr, counts in enumerate(reversed(conservationlist)):
        ranked = utils.sortdictkeysbyvaluesrev(counts)
        if ranked[0] != "-" and counts[ranked[0]] > threshold:
            lastsharedaa = lenseqs - nr
            break

    # Shorten sequences to the detected conserved region and write them out.
    seqs = [sequence[firstsharedaa:lastsharedaa] for sequence in seqs]
    seedfastaname = "trimmed_alignment" + str(inputnr) + ".fasta"
    utils.writefasta(names, seqs, seedfastaname)
0
def perform_clusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND ClusterBlast search for all gene clusters of a record.

    The query proteins of every cluster feature are collected with
    ``create_blast_inputs``. If ``options.dbgclusterblast`` is truthy and
    ``clusterblastoutput.txt`` exists in that directory, its content is used
    as the search result. Otherwise all queries are written to
    ``input.fasta`` and searched with ``run_diamond`` against
    ``geneclusterprots`` in ``options.clusterblastdir``, and ``input.out`` is
    read back. A non-zero DIAMOND return code is logged as an error and
    processing continues.

    The raw result is passed to ``write_raw_clusterblastoutput`` and parsed
    with ``parse_all_clusters`` (thresholds 10 and 30). Each cluster is then
    scored with ``score_clusterblast_output`` and written with
    ``write_clusterblast_output``, using one ``utils.Storage`` object whose
    per-cluster attributes are overwritten on each iteration.

    Nothing is returned.
    """
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    # Resolved to an absolute path before entering the temporary directory
    # context below.
    debug_path = os.path.abspath(
        os.path.join(options.dbgclusterblast, "clusterblastoutput.txt"))
    with TemporaryDirectory(change=True) as tempdir:
        all_names, all_seqs, all_prots = [], [], []
        prots_by_cluster = []
        for cluster_feature in geneclusters:
            cluster_names, cluster_seqs, cluster_prots = create_blast_inputs(
                cluster_feature, seq_record)
            all_names.extend(cluster_names)
            all_seqs.extend(cluster_seqs)
            all_prots.extend(cluster_prots)
            prots_by_cluster.append(cluster_prots)

        reuse_results = options.dbgclusterblast and os.path.exists(debug_path)
        if not reuse_results:
            logging.debug("Running DIAMOND gene cluster search..")
            utils.writefasta(all_names, all_seqs, "input.fasta")
            database = path.join(options.clusterblastdir, "geneclusterprots")
            out, err, retcode = run_diamond("input.fasta", database, tempdir,
                                            options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)
            logging.debug("   DIAMOND search finished. Parsing results...")

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
        else:
            logging.debug(
                "Skipping DIAMOND calculations, using results from %s instead",
                debug_path)
            with open(debug_path, "r") as fh:
                blastoutput = fh.read()
            logging.debug("    Parsing results from given file...")

        write_raw_clusterblastoutput(options.full_outputfolder_path,
                                     blastoutput)

        minseqcoverage = 10
        minpercidentity = 30
        clusters_by_number, _ = parse_all_clusters(blastoutput, minseqcoverage,
                                                   minpercidentity, seq_record)

        clusterblastStorage = utils.Storage()
        clusterblastStorage.clusters = clusters
        clusterblastStorage.proteins = proteins

        for cluster_feature, cluster_prots in zip(geneclusters,
                                                  prots_by_cluster):
            number = utils.get_cluster_number(cluster_feature)
            queries_by_name = clusters_by_number.get(number, {})
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            ranking = score_clusterblast_output(clusters, allcoregenes,
                                                queries_by_name)

            # Per-cluster fields of the shared Storage object.
            clusterblastStorage.clusternumber = number
            clusterblastStorage.queryclusterprots = cluster_prots
            clusterblastStorage.ranking = ranking

            write_clusterblast_output(options, seq_record, clusterblastStorage)
示例#16
0
def perform_knownclusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND KnownClusterBlast search for all clusters of a record.

    The query proteins of every cluster feature are collected with
    ``clusterblast.create_blast_inputs``. If ``options.dbgclusterblast`` is
    truthy and ``knownclusterblastoutput.txt`` exists in that directory, its
    content is used as the search result. Otherwise the queries (with spaces
    in their names replaced by underscores) are written to ``input.fasta``
    inside a ``TemporaryDirectory(change=True)`` context and searched with
    ``clusterblast.run_diamond`` against ``knownclusterprots`` in
    ``options.knownclusterblastdir``; ``input.out`` is read back and passed
    to ``clusterblast.write_raw_clusterblastoutput``. A non-zero DIAMOND
    return code is only logged at debug level.

    The result is parsed with ``clusterblast.parse_all_clusters``
    (thresholds 40 and 45). Each cluster is scored and written with
    ``searchtype="knownclusters"``, using one ``utils.Storage`` object whose
    per-cluster attributes are overwritten on each iteration. Finally
    ``mibig_protein_homology`` is called with the raw result.

    Nothing is returned.
    """
    logging.debug("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    all_names, all_seqs, all_prots = [], [], []
    prots_by_cluster = []
    for cluster_feature in geneclusters:
        cluster_names, cluster_seqs, cluster_prots = (
            clusterblast.create_blast_inputs(cluster_feature, seq_record))
        all_names.extend(cluster_names)
        all_seqs.extend(cluster_seqs)
        all_prots.extend(cluster_prots)
        prots_by_cluster.append(cluster_prots)

    debug_path = os.path.join(options.dbgclusterblast,
                              "knownclusterblastoutput.txt")
    reuse_results = options.dbgclusterblast and os.path.exists(debug_path)
    if not reuse_results:
        with TemporaryDirectory(change=True) as tempdir:
            query_names = [name.replace(" ", "_") for name in all_names]
            utils.writefasta(query_names, all_seqs, "input.fasta")
            database = os.path.join(options.knownclusterblastdir,
                                    'knownclusterprots')
            out, err, retcode = clusterblast.run_diamond(
                "input.fasta", database, tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s", out, err,
                              retcode)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            clusterblast.write_raw_clusterblastoutput(
                options.full_outputfolder_path,
                blastoutput,
                searchtype="knownclusters")
    else:
        logging.debug("Skipping DIAMOND calculations, using previous results")
        with open(debug_path, "r") as fh:
            blastoutput = fh.read()

    minseqcoverage = 40
    minpercidentity = 45
    clusters_by_number, _ = clusterblast.parse_all_clusters(
        blastoutput, minseqcoverage, minpercidentity, seq_record)

    knownclusterblastStorage = utils.Storage()
    knownclusterblastStorage.clusters = clusters
    knownclusterblastStorage.proteins = proteins

    for cluster_feature, cluster_prots in zip(geneclusters, prots_by_cluster):
        number = utils.get_cluster_number(cluster_feature)
        queries_by_name = clusters_by_number.get(number, {})
        allcoregenes = [
            utils.get_gene_id(cds)
            for cds in utils.get_secmet_cds_features(seq_record)
        ]
        ranking = clusterblast.score_clusterblast_output(
            clusters, allcoregenes, queries_by_name)

        # Per-cluster fields of the shared Storage object.
        knownclusterblastStorage.clusternumber = number
        knownclusterblastStorage.queryclusterprots = cluster_prots
        knownclusterblastStorage.ranking = ranking
        clusterblast.write_clusterblast_output(options,
                                               seq_record,
                                               knownclusterblastStorage,
                                               searchtype="knownclusters")

    mibig_protein_homology(blastoutput, seq_record, geneclusters, clusters,
                           options)
示例#17
0
def perform_knownclusterblast(options, seq_record, clusters, proteinlocations,
                              proteinstrands, proteinannotations, proteintags):
    """Run a DIAMOND KnownClusterBlast search for each cluster of a record.

    For every cluster feature of ``seq_record`` the query proteins (with
    spaces in their names replaced by underscores) are written to
    ``input.fasta`` inside a ``TemporaryDirectory(change=True)`` context,
    searched with ``run_diamond`` against ``knownclusterprots`` in
    ``options.knownclusterblastdir``, converted with ``convert_to_tabular``,
    parsed (thresholds 40 and 45), scored, and handed to
    ``write_clusterblast_output`` with ``searchtype="knownclusters"``,
    bundled in a ``utils.Storage`` object.

    When ``options.debug`` is set and
    ``<options.dbgclusterblast>/knownclusterblast/cluster<N>.txt`` exists,
    the cluster is skipped entirely by this function.

    A non-zero DIAMOND return code is only logged at debug level; the result
    of ``convert_to_tabular`` is not checked.

    Nothing is returned.
    """
    logging.info("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            # The debug path is only built when debugging is enabled, so the
            # concatenation never touches dbgclusterblast otherwise.
            if options.debug:
                # The directory was previously misspelled "knwonclusterblast"
                # in the existence check, so the skip could never trigger for
                # the directory named in the log message.
                debug_file = (options.dbgclusterblast + os.sep +
                              "knownclusterblast" + os.sep + "cluster" +
                              str(clusternumber) + ".txt")
                if os.path.exists(debug_file):
                    # Pass the whole path as a logging argument: the former
                    # '"...%s..." % a + b + c' applied % to the first piece
                    # only and appended the rest after "instead".
                    logging.debug(
                        "Skipping KnownClusterblast calculations, using results from %s instead",
                        debug_file)
                    continue

            logging.info("   Gene cluster " + str(clusternumber))
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            utils.writefasta(
                [qcname.replace(" ", "_") for qcname in queryclusternames],
                queryclusterseqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.knownclusterblastdir,
                          'knownclusterprots'), tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s", out, err,
                              retcode)
            convert_to_tabular(tempdir)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput,
                                         searchtype="knownclusters")
            logging.info("   DIAMOND search finished. Parsing results...")
            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_id(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            rankedclusters, rankedclustervalues, _, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # Store all clusterblast related data in a utils.Storage object.
            knownclusterblastStorage = utils.Storage()
            knownclusterblastStorage.clusternumber = clusternumber
            knownclusterblastStorage.queryclusterprots = queryclusterprots
            knownclusterblastStorage.clusters = clusters
            knownclusterblastStorage.hitclusterdata = hitclusterdata
            knownclusterblastStorage.rankedclusters = rankedclusters
            knownclusterblastStorage.rankedclustervalues = rankedclustervalues
            knownclusterblastStorage.proteintags = proteintags
            knownclusterblastStorage.proteinlocations = proteinlocations
            knownclusterblastStorage.proteinannotations = proteinannotations
            knownclusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options,
                                      seq_record,
                                      knownclusterblastStorage,
                                      searchtype="knownclusters")