Example #1
    def __init__(self, seq_record, options):
        "Initialize ASF object"

        # Set options
        if 'activeSiteFinderConf' not in options:
            options.activeSiteFinderConf = path.join(utils.get_full_path(__file__, ''), "config", "SignatureResources.xml")

        if 'activeSiteFinderHMMDir' not in options:
            options.activeSiteFinderHMMDir = path.join(utils.get_full_path(__file__, ''), "hmm")

        # Assign variables
        try:
            XMLtree = ET.parse(options.activeSiteFinderConf)
        except ET.ParseError:
            logging.exception("Could not load/parse ActiveSiteFinder configuration file %s.", options.activeSiteFinderConf)
            sys.exit(1)

        XMLroot = XMLtree.getroot()

        HmmProfilesFilenameObj = XMLroot.findall(".//Execute/database")

        self.seq_record = seq_record
        self.options = options
        self.XMLtree = XMLtree
        self.XMLroot = XMLroot
        self.HmmProfilesFilenameObj = HmmProfilesFilenameObj
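
A minimal sketch of the shape SignatureResources.xml is expected to have, inferred purely from the XPath query ".//Execute/database" above (the element names around Execute are assumptions, not taken from the real antiSMASH resources):

# hypothetical configuration fragment, for illustration only
EXAMPLE_SIGNATURE_RESOURCES = """
<resources>
  <analysis name="example">
    <Execute>
      <database>example_profile.hmm</database>
    </Execute>
  </analysis>
</resources>
"""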
Example #2
def check_prereqs():
    failure_messages = []
    for binary_name, optional in _required_binaries:
        if utils.locate_executable(binary_name) is None and not optional:
            failure_messages.append("Failed to locate executable for %r" %
                                    binary_name)

    hmm_files = []
    # Check if hmmdetails.txt is readable and well-formatted;
    # enumerate keeps line numbers correct even when a line is skipped
    for lineno, line in enumerate(
            open(utils.get_full_path(__file__, "hmmdetails.txt"), "r"), 1):
        if line.count("\t") != 3:
            failure_messages.append(
                "Failed to use HMM profile from line %s due to misformatting:\n %r"
                % (lineno, line))
            continue
        hmm_files.append(line.split('\t')[3].strip())

    # Check if cluster_rules.txt is readable and well-formatted
    for lineno, line in enumerate(
            open(utils.get_full_path(__file__, "cluster_rules.txt"), "r"), 1):
        if line.count("\t") != 3:
            failure_messages.append(
                "Failed to use cluster rules from line %s due to misformatting:\n %r"
                % (lineno, line))

    hmm = utils.get_full_path(__file__, _markov_model)
    if utils.locate_file(hmm) is None:
        # try to generate file from all specified profiles in hmmdetails
        try:
            with open(hmm, 'w') as all_hmms_handle:
                for hmm_file in hmm_files:
                    with open(utils.get_full_path(__file__, hmm_file),
                              'r') as handle:
                        all_hmms_handle.write(handle.read())
        except (IOError, OSError):
            failure_messages.append('Failed to generate file {!r}'.format(hmm))

    for ext in _binary_extensions:
        binary = "{}{}".format(hmm, ext)
        if utils.locate_file(binary) is None:
            _, err, retcode = utils.run_hmmpress(hmm)
            if retcode != 0:
                failure_messages.append('Failed to hmmpress {!r}: {!r}'.format(
                    hmm, err))
            break

    return failure_messages
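
A hypothetical caller sketch (the surrounding start-up code is not part of this example): the messages are collected rather than raised, so a typical driver logs them and aborts.

failure_messages = check_prereqs()
if failure_messages:
    for msg in failure_messages:
        logging.error(msg)
    sys.exit(1)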
Example #3
def load_geneclusterproteins(accessiondict, searchtype):
    options = config.get_config()
    if 'clusterblastdir' not in options:
        options.clusterblastdir = path.dirname(
            utils.get_full_path(__file__, ''))
        options.subclusterblastdir = path.join(
            path.dirname(options.clusterblastdir), 'subclusterblast')
        options.knownclusterblastdir = path.join(
            path.dirname(options.clusterblastdir), 'knownclusterblast')
    else:
        options.subclusterblastdir = path.join(
            path.dirname(path.dirname(utils.get_full_path(__file__, ''))),
            'subclusterblast')
        options.knownclusterblastdir = path.join(
            path.dirname(path.dirname(utils.get_full_path(__file__, ''))),
            'knownclusterblast')
    #Load gene cluster database proteins info into memory
    if searchtype == "general":
        logging.debug("ClusterBlast: Loading gene cluster database proteins into " \
        "memory...")
        gclusterprotsfile = path.join(options.clusterblastdir,
                                      "geneclusterprots.fasta")
    elif searchtype == "subclusters":
        logging.debug("SubClusterBlast: Loading gene cluster database proteins into " \
        "memory...")
        gclusterprotsfile = path.join(options.subclusterblastdir,
                                      "subclusterprots.fasta")
    elif searchtype == "knownclusters":
        logging.debug("KnownClusterBlast: Loading gene cluster database proteins into " \
        "memory...")
        gclusterprotsfile = path.join(options.knownclusterblastdir,
                                      "knownclusterprots.fasta")
    else:
        raise ValueError("unknown searchtype %r" % searchtype)

    proteins = {}

    with open(gclusterprotsfile, 'r') as handle:
        for line in handle:
            line = line.rstrip("\n")
            if not line or line[0] != ">":
                continue
            tabs = line.split("|")
            locustag = tabs[4]
            if locustag in accessiondict:
                locustag = "h_" + locustag
            location = tabs[2]
            strand = tabs[3]
            annotations = tabs[5]
            name = tabs[6]
            proteins[name] = Protein(name, locustag, location, strand,
                                     annotations)
    return proteins
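
A sketch of the pipe-separated FASTA header layout this parser assumes; the field meanings are inferred from the indices above and the example values are hypothetical:

# fields: 0-1 unused | 2 location | 3 strand | 4 locus tag | 5 annotation | 6 name
example_header = ">db|acc|c1-3000|+|LOCUS_0001|hypothetical protein|PROT_0001"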
Example #4
def load_geneclusters(searchtype):
    #Load gene cluster database into memory
    options = config.get_config()
    if 'clusterblastdir' not in options:
        options.clusterblastdir = path.dirname(
            utils.get_full_path(__file__, ''))
        options.subclusterblastdir = path.join(
            path.dirname(options.clusterblastdir), 'subclusterblast')
        options.knownclusterblastdir = path.join(
            path.dirname(options.clusterblastdir), 'knownclusterblast')
    else:
        options.subclusterblastdir = path.join(
            path.dirname(path.dirname(utils.get_full_path(__file__, ''))),
            'subclusterblast')
        options.knownclusterblastdir = path.join(
            path.dirname(path.dirname(utils.get_full_path(__file__, ''))),
            'knownclusterblast')

    if searchtype == "general":
        logging.debug(
            "ClusterBlast: Loading gene clusters database into memory...")
        geneclustersfile = path.join(options.clusterblastdir,
                                     "geneclusters.txt")
    elif searchtype == "subclusters":
        logging.debug(
            "SubClusterBlast: Loading gene clusters database into memory...")
        geneclustersfile = path.join(options.subclusterblastdir,
                                     "subclusters.txt")
    elif searchtype == "knownclusters":
        logging.debug(
            "KnownClusterBlast: Loading gene clusters database into memory...")
        geneclustersfile = path.join(options.knownclusterblastdir,
                                     "knownclusters.txt")
    else:
        raise ValueError("unknown searchtype %r" % searchtype)
    with open(geneclustersfile, "r") as handle:
        lines = [line for line in handle.read().split("\n") if "\t" in line]
    clusters = {}
    for line in lines:
        tabs = line.split("\t")
        accession = tabs[0]
        clusterdescription = tabs[1]
        clusternr = tabs[2]
        clustertype = tabs[3]
        clustername = accession + "_" + clusternr
        clustertags = tabs[4].split(";")
        clusterprots = tabs[5].split(";")
        clusters[clustername] = [
            clusterprots, clusterdescription, clustertype, clustertags
        ]
    return clusters
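
A sketch of the tab-separated line layout load_geneclusters() assumes; the field meanings are inferred from the parsing above and the values are hypothetical:

example_line = "ABC12345\tExample gene cluster\t1\tnrps\ttagA;tagB\tprotA;protB"
# parses to:
# clusters["ABC12345_1"] == [["protA", "protB"], "Example gene cluster",
#                            "nrps", ["tagA", "tagB"]]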
Example #5
def load_geneclusterproteins(accessiondict, searchtype):
    options = config.get_config()
    if 'clusterblastdir' not in options:
        options.clusterblastdir = path.dirname(utils.get_full_path(__file__, ''))
        options.subclusterblastdir = path.join(path.dirname(options.clusterblastdir), 'subclusterblast')
        options.knownclusterblastdir = path.join(path.dirname(options.clusterblastdir), 'knownclusterblast')
    else:
        options.subclusterblastdir = path.join(path.dirname(path.dirname(utils.get_full_path(__file__, ''))), 'subclusterblast')
        options.knownclusterblastdir = path.join(path.dirname(path.dirname(utils.get_full_path(__file__, ''))), 'knownclusterblast')
    #Load gene cluster database proteins info into memory
    if searchtype == "general" and options.taxon == "plants":
        logging.info("ClusterBlast: Loading gene cluster database proteins into " \
        "memory...")
        gclusterprotsfile = path.join(options.clusterblastdir, "plantgeneclusterprots.fasta")
    elif searchtype == "general":
        logging.info("ClusterBlast: Loading gene cluster database proteins into " \
        "memory...")
        gclusterprotsfile = path.join(options.clusterblastdir, "geneclusterprots.fasta")
    elif searchtype == "subclusters":
        logging.info("SubClusterBlast: Loading gene cluster database proteins into " \
        "memory...")
        gclusterprotsfile = path.join(options.subclusterblastdir, "subclusterprots.fasta")
    elif searchtype == "knownclusters":
        logging.info("KnownClusterBlast: Loading gene cluster database proteins into " \
        "memory...")
        gclusterprotsfile = path.join(options.knownclusterblastdir, "knownclusterprots.fasta")
    else:
        raise ValueError("unknown searchtype %r" % searchtype)
    with open(gclusterprotsfile, "r") as handle:
        filetext = handle.read().replace("\r", "\n")
    lines = filetext.split("\n")
    proteinlocations = {}
    proteinstrands = {}
    proteinannotations = {}
    proteintags = {}
    for line in lines:
        if len(line) > 0 and line[0] == ">":
            tabs = line.split("|")
            protein = tabs[6]
            locustag = tabs[4]
            if locustag in accessiondict:
                locustag = "h_" + locustag
            proteintags[protein] = locustag
            location = tabs[2]
            proteinlocations[protein] = location
            strand = tabs[3]
            proteinstrands[protein] = strand
            annotation = tabs[5]
            proteinannotations[protein] = annotation
    return proteinlocations, proteinstrands, proteinannotations, proteintags
Example #6
    def test_labyrinthopeptin(self):
        "Test lantipeptide prediction for labyrinthopeptin"
        rec = seqio.read(utils.get_full_path(__file__, 'labyrinthopeptin.gbk'))
        self.assertEqual(7, len(rec.features))

        specific_analysis(rec, None)
        self.assertEqual(11, len(rec.features))
Example #7
    def test_nisin(self):
        "Test lantipeptide prediction for nisin A"
        rec = seqio.read(utils.get_full_path(__file__, 'nisin.gbk'))
        self.assertEqual(38, len(rec.features))

        specific_analysis(rec, None)
        self.assertEqual(40, len(rec.features))
        prepeptides = h._find_core_peptides(utils.get_cluster_by_nr(rec, 1),
                                            rec)
        self.assertEqual(1, len(prepeptides))
        prepeptide = prepeptides[0]
        leaders = h._find_leader_peptides(utils.get_cluster_by_nr(rec, 1), rec)
        self.assertEqual(1, len(leaders))
        leader = leaders[0]
        # real monoisotopic mass is 3351.51, but we overpredict a Dha
        self.assertAlmostEqual(3333.6, h._get_monoisotopic_mass(prepeptide))
        # real mw is 3354.5, see above
        self.assertAlmostEqual(3336.0, h._get_molecular_weight(prepeptide))
        self.assertEqual([3354.0, 3372.1, 3390.1, 3408.1],
                         h._get_alternative_weights(prepeptide))
        self.assertEqual(5, h._get_number_bridges(prepeptide))
        self.assertEqual("MSTKDFNLDLVSVSKKDSGASPR",
                         h._get_leader_peptide_sequence(leader))
        self.assertEqual("ITSISLCTPGCKTGALMGCNMKTATCHCSIHVSK",
                         h._get_core_peptide_sequence(prepeptide))
        self.assertEqual('Class I', h._get_core_peptide_class(prepeptide))
Example #8
def perform_docking_domain_analysis(options, clusterpksgenes, genecluster,
                                    seq_record, pksnrpsvars):
    feature_by_id = utils.get_feature_dict(seq_record)
    #log("Predicting PKS gene order by docking domain sequence " \
    #    "analysis", stdout=True)
    startergene, endinggene = find_first_and_last_genes(
        clusterpksgenes, pksnrpsvars.domainnamesdict)
    with TemporaryDirectory(change=True):
        dockinganalysis_dir = utils.get_full_path(__file__, "docking_analysis")
        ntermintresdict = extract_nterminus(dockinganalysis_dir,
                                            clusterpksgenes, seq_record,
                                            startergene, feature_by_id)
        ctermintresdict = extract_cterminus(dockinganalysis_dir,
                                            clusterpksgenes, seq_record,
                                            endinggene, feature_by_id)
    possible_orders = find_possible_orders(clusterpksgenes, startergene,
                                           endinggene)
    geneorders, possible_orders_scoredict = rank_biosynthetic_orders(
        ntermintresdict, ctermintresdict, startergene, endinggene,
        possible_orders)
    #Write html outfile with docking domain analysis output
    write_gene_orders_to_html(options, geneorders, possible_orders_scoredict,
                              genecluster, startergene, endinggene)
    #log("Predicting PKS gene order by docking domain sequence " \
    #    "analysis succeeded.", stdout=True)
    pksnrpsvars.dockingdomainanalysis.append(genecluster)
    return geneorders[0]
Example #9
    def test_microbisporicin(self):
        "Test lantipeptide prediction for microbisporicin"
        rec = seqio.read(utils.get_full_path(__file__, 'microbisporicin.gbk'))
        self.assertEqual(56, len(rec.features))

        specific_analysis(rec, None)
        self.assertEqual(58, len(rec.features))
        prepeptides = h._find_core_peptides(utils.get_cluster_by_nr(rec, 1),
                                            rec)
        self.assertEqual(1, len(prepeptides))
        prepeptide = prepeptides[0]
        leaders = h._find_leader_peptides(utils.get_cluster_by_nr(rec, 1), rec)
        self.assertEqual(1, len(leaders))
        leader = leaders[0]
        # NOTE: this is not the correct weight for microbisporicin
        # there are some additional modifications we do not predict yet
        self.assertAlmostEqual(2212.9, h._get_monoisotopic_mass(prepeptide))
        self.assertAlmostEqual(2214.5, h._get_molecular_weight(prepeptide))
        self.assertEqual(4, h._get_number_bridges(prepeptide))
        self.assertEqual("MPADILETRTSETEDLLDLDLSIGVEEITAGPA",
                         h._get_leader_peptide_sequence(leader))
        self.assertEqual("VTSWSLCTPGCTSPGGGSNCSFCC",
                         h._get_core_peptide_sequence(prepeptide))
        self.assertEqual('Class I', h._get_core_peptide_class(prepeptide))
        self.assertEqual(['AviCys', 'Cl', 'OH'],
                         h._get_core_peptide_extra_modifications(prepeptide))
Example #10
def check_prereqs(options):
    "Check if all required applications are around"
    failure_messages = []
    for binary_name, optional in _required_binaries:
        if utils.locate_executable(binary_name) is None and not optional:
            failure_messages.append("Failed to locate file: %r" % binary_name)

    for hmm in _markov_models:
        hmm = utils.get_full_path(__file__, hmm)
        if utils.locate_file(hmm) is None:
            failure_messages.append("Failed to locate file %r" % hmm)
            continue
        for ext in _binary_extensions:
            binary = "%s%s" % (hmm, ext)
            if utils.locate_file(binary) is None:
                command = ['hmmpress', hmm]
                try:
                    out, err, retcode = utils.execute(command)
                except OSError as e:
                    retcode = 1
                    err = str(e)
                if retcode != 0:
                    failure_messages.append("Failed to hmmpress %r: %r" % (hmm, err))
                break

    return failure_messages
Example #11
    def test_epicidin(self):
        "Test lantipeptide prediction for epicidin 280"
        rec = seqio.read(utils.get_full_path(__file__, 'epicidin_280.gbk'))
        self.assertEqual(21, len(rec.features))

        specific_analysis(rec, None)
        self.assertEqual(23, len(rec.features))
        prepeptides = h._find_core_peptides(utils.get_cluster_by_nr(rec, 1),
                                            rec)
        self.assertEqual(1, len(prepeptides))
        prepeptide = prepeptides[0]
        leaders = h._find_leader_peptides(utils.get_cluster_by_nr(rec, 1), rec)
        self.assertEqual(1, len(leaders))
        leader = leaders[0]
        self.assertAlmostEqual(3115.7, h._get_monoisotopic_mass(prepeptide))
        self.assertAlmostEqual(3117.7, h._get_molecular_weight(prepeptide))
        self.assertEqual([3135.7, 3153.7, 3171.7],
                         h._get_alternative_weights(prepeptide))
        self.assertEqual(3, h._get_number_bridges(prepeptide))
        self.assertEqual("MENKKDLFDLEIKKDNMENNNELEAQ",
                         h._get_leader_peptide_sequence(leader))
        self.assertEqual("SLGPAIKATRQVCPKATRFVTVSCKKSDCQ",
                         h._get_core_peptide_sequence(prepeptide))
        self.assertEqual('Class I', h._get_core_peptide_class(prepeptide))
        self.assertEqual(['Lac'],
                         h._get_core_peptide_extra_modifications(prepeptide))
Example #12
def generate_chemical_structure_preds(pksnrpsvars, seq_record, options):
    #Create directory to store structures
    options.structuresfolder = path.abspath(path.join(options.outputfoldername, "structures"))
    if not os.path.exists(options.structuresfolder):
        os.mkdir(options.structuresfolder)
    originaldir = os.getcwd()
    structure_drawing_dir = utils.get_full_path(__file__, '') + os.sep + "NRPeditor"
    os.chdir(structure_drawing_dir)
    #Combine predictions into a prediction of the final chemical structure and generate images
    geneclusters = utils.get_cluster_features(seq_record)
    for genecluster in geneclusters:
        smiles_string = "N/A"
        geneclusternr = utils.get_cluster_number(genecluster)
        if geneclusternr in pksnrpsvars.compound_pred_dict:
            # if product is ectoine, use a predefined SMILES string and generate the structure
            if pksnrpsvars.compound_pred_dict[geneclusternr] == "ectoine":
                smiles_string = "CC1=NCCC(N1)C(=O)O"
                with open("genecluster" + str(geneclusternr) + ".smi", "w") as smilesfile:
                    smilesfile.write(smiles_string)
                depictstatus = depict_smile(geneclusternr, options.structuresfolder)
                if depictstatus == "failed":
                    pksnrpsvars.failedstructures.append(geneclusternr)
                elif geneclusternr in pksnrpsvars.failedstructures:
                    pksnrpsvars.failedstructures.remove(geneclusternr)
            else:
                # use information on the peptide / polyketide sequence to generate a structure image
                residues = pksnrpsvars.compound_pred_dict[geneclusternr].replace("(","").replace(")","").replace(" + "," ").replace("-"," ")
                nrresidues = len(residues.split(" "))
                if nrresidues > 1:
                    if sys.platform == "win32" or sys.platform == "darwin":
                        structcommand = 'main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "'
                    else:  # linux and other POSIX platforms
                        structcommand = './main input 100 4000 1000 AA DDV DIM ' + str(nrresidues + 1) + ' "'
                    for i in [res for res in residues.split(" ") if len(res) > 1]:
                        structcommand = structcommand + i + " "
                    structcommand = structcommand + 'TE"'
                    smilesinfo = os.popen(structcommand).read()
                    smiles_string = (smilesinfo.split("core peptide: ")[1]).split("\ntermintype")[0]
                    if sys.platform != "win32":
                        # assign the result; str.replace() does not modify in place
                        smiles_string = smiles_string.replace("[X]","[*:X]")
                        smiles_string2 = ""
                        a = 1
                        for k in smiles_string:
                            if k == "X":
                                smiles_string2 = smiles_string2 + str(a)
                                a += 1
                            else:
                                smiles_string2 = smiles_string2 + k
                        smiles_string = smiles_string2
                    with open("genecluster" + str(geneclusternr) + ".smi", "w") as smilesfile:
                        smilesfile.write(smiles_string)
                    depictstatus = depict_smile(geneclusternr, options.structuresfolder)
                    if depictstatus == "failed":
                        pksnrpsvars.failedstructures.append(geneclusternr)
        _update_sec_met_entry(genecluster, smiles_string)
    os.chdir(originaldir)
Example #13
def alignsmcogs(smcog, inputnr):
    # Align to the reference multiple sequence alignment, output as FASTA file;
    # the muscle invocation is identical on every supported platform
    infile1 = utils.get_full_path(__file__, "%s_muscle.fasta" % str(smcog).lower())
    musclecommand = ["muscle", "-quiet", "-profile", "-in1", infile1,
                     "-in2", "input" + str(inputnr) + ".fasta",
                     "-out", "muscle" + str(inputnr) + ".fasta"]
    utils.execute(musclecommand)
Example #14
def converttree(inputnr, smcogsoutputfolder, tag):
    # Convert tree to XTG and draw PNG image using TreeGraph

    def _wait_or_kill(proc, description):
        # Poll the process, killing it after a 20 minute timeout;
        # poll() returns None while the process is still running
        starttime = time.time()
        while True:
            if (time.time() - starttime) > 1200:
                if sys.platform == "win32":
                    subprocess.Popen("taskkill /F /T /PID %i" % proc.pid, shell=True,
                                     stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                else:
                    os.kill(proc.pid, signal.SIGKILL)
                logging.info("Now in " + os.getcwd() + " " + description + " ran out of time")
                break
            if proc.poll() is not None:
                break
            time.sleep(2)

    command = ['java', '-Djava.awt.headless=true', '-jar', utils.get_full_path(__file__, 'TreeGraph.jar'),
               '-convert', 'tree%s.nwk' % inputnr, '-xtg', 'tree%s.xtg' % inputnr]
    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    _wait_or_kill(p, "TreeGraph -convert on tree " + str(inputnr))
    out, _ = p.communicate()
    output = out
    if "exception" not in output and "Exception" not in output:
        command = ['java', '-Djava.awt.headless=true', '-jar', utils.get_full_path(__file__, 'TreeGraph.jar'),
                   '-image', 'tree%s.xtg' % inputnr, "%s.png" % tag.split('.')[0]]
        p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        _wait_or_kill(p, "TreeGraph -image on tree " + str(inputnr))
        out, _ = p.communicate()
        output = out
        if "exception" not in output and "Exception" not in output:
            shutil.copy(tag.split(".")[0] + '.png', smcogsoutputfolder)
            os.remove(tag.split(".")[0] + ".png")
            os.remove("tree" + str(inputnr) + ".xtg")
            os.remove("trimmed_alignment" + str(inputnr) + ".fasta")
Example #15
def get_supported_cluster_types():
    "Get a list of all supported cluster types"
    with open(utils.get_full_path(__file__, 'cluster_rules.txt'), "r") as handle:
        clustertypes = [line.split("\t")[0] for line in handle]
    # skip first line containing the header
    return clustertypes[1:]
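
A hypothetical usage sketch: each returned entry is the first tab-separated column of one rule line, so callers can iterate the supported types directly.

for cluster_type in get_supported_cluster_types():
    print(cluster_type)  # e.g. "t1pks"; the value shown is illustrative only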
Example #16
def load_searchgtr_search_form_template():
    # Load the SEARCHGTR search form template and split it at the
    # FASTASEQUENCE placeholder
    with open(path.join(utils.get_full_path(__file__, ''), "searchgtr_form.html"), "r") as handle:
        searchgtrformtemplate = handle.read().replace("\r", "\n")
    return searchgtrformtemplate.split("FASTASEQUENCE")
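
A hypothetical usage sketch (not from the source): the two halves returned above would be joined around an actual sequence wherever the FASTASEQUENCE placeholder sat in the template.

form_parts = load_searchgtr_search_form_template()
search_form_html = form_parts[0] + "MKTAYIAKQR" + form_parts[1]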
Example #17
    def setUp(self):
        self.config = Namespace()
        config.set_config(self.config)
        self.config.gff3 = utils.get_full_path(__file__, "test_gff.gff")
        self.config.single_entries = False
        contig1 = FakeRecord(seq="A" * 2000)
        contig1.id = "CONTIG_1"
        contig2 = FakeRecord(seq="A" * 2000)
        contig2.id = "CONTIG_2"
        self.sequences = [contig1, contig2]
Example #18
    def test_sco_cluster3(self):
        "Test lantipeptide prediction for SCO cluster #3"
        rec = seqio.read(utils.get_full_path(__file__, 'sco_cluster3.gbk'))
        self.assertEqual(69, len(rec.features))

        specific_analysis(rec, None)
        self.assertEqual(71, len(rec.features))
        prepeptides = h._find_core_peptides(utils.get_cluster_by_nr(rec, 1),
                                            rec)
        self.assertEqual(1, len(prepeptides))
        prepeptide = prepeptides[0]
        self.assertEqual('Class I', h._get_core_peptide_class(prepeptide))
Example #19
def run_nrpspks_specific_hmmer(seq_record, withinclustergenes, pksnrpsvars):
    nrpspksfasta = utils.get_specific_multifasta(withinclustergenes)
    #Analyse for abMotifs
    abmotif_opts = ["-E", "0.25"]
    abmotif_results = utils.run_hmmscan(
        utils.get_full_path(__file__, "abmotifs.hmm"), nrpspksfasta,
        abmotif_opts)
    mhmmlengthsdict = utils.hmmlengths(
        utils.get_full_path(__file__, "abmotifs.hmm"))
    pksnrpsvars.motifdict = parse_hmmscan_results(abmotif_results,
                                                  mhmmlengthsdict)
    #Analyse for C/A/PCP/E/KS/AT/ATd/DH/KR/ER/ACP/TE/TD/COM/Docking/MT/CAL domains
    nrpspksdomain_opts = ["--cut_tc"]
    nrpspksdomain_results = utils.run_hmmscan(
        utils.get_full_path(__file__, "nrpspksdomains.hmm"), nrpspksfasta,
        nrpspksdomain_opts)
    hmmlengthsdict = utils.hmmlengths(
        utils.get_full_path(__file__, "nrpspksdomains.hmm"))
    pksnrpsvars.domaindict = parse_hmmscan_results(nrpspksdomain_results,
                                                   hmmlengthsdict)
    filter_nonterminal_docking_domains(seq_record, pksnrpsvars)
    #Analyse KS domains & PKS/NRPS protein domain composition to detect NRPS/PKS types
    kshmmlengthsdict = utils.hmmlengths(
        utils.get_full_path(__file__, "ksdomains.hmm"))
    ksdomain_results = utils.run_hmmscan(
        utils.get_full_path(__file__, "ksdomains.hmm"), nrpspksfasta,
        nrpspksdomain_opts)
    pksnrpsvars.ksdomaindict = parse_hmmscan_results(ksdomain_results,
                                                     kshmmlengthsdict)
Example #20
def filter_results(results, results_by_id):
    #Filter results by comparing scores of different models (for PKS systems)
    with open(utils.get_full_path(__file__, "filterhmmdetails.txt"),
              "r") as handle:
        filter_lines = handle.read().split("\n")
    for line in filter_lines:
        filterhmms = line.split(",")
        for cds in results_by_id.keys():
            cdsresults = results_by_id[cds]
            hmmhits = [hit.query_id for hit in cdsresults]
            #Check if multiple competing HMM hits are present
            competing_hits = set(hmmhits) & set(filterhmms)
            if len(competing_hits) > 1:
                #Identify overlapping hits
                overlapping_groups = []
                for hit in cdsresults:
                    for otherhit in [
                            cdsresult for cdsresult in cdsresults
                            if hit != cdsresult
                    ]:
                        overlap = len(
                            set(range(hit.hit_start, hit.hit_end))
                            & set(range(otherhit.hit_start, otherhit.hit_end)))
                        if overlap > 20:
                            added = False
                            for group in overlapping_groups:
                                if hit in group and otherhit in group:
                                    added = True
                                    break
                                elif hit in group and otherhit not in group:
                                    group.append(otherhit)
                                    added = True
                                    break
                                elif hit not in group and otherhit in group:
                                    group.append(hit)
                                    added = True
                                    break
                            if not added:
                                overlapping_groups.append([hit, otherhit])
                #Remove worst-scoring of overlapping hits
                for group in overlapping_groups:
                    highestscore = max([hit.bitscore for hit in group])
                    hit_with_highestscore = group[[
                        hit.bitscore for hit in group
                    ].index(highestscore)]
                    to_delete = [
                        hit for hit in group if hit != hit_with_highestscore
                    ]
                    for res in to_delete:
                        if res in results:
                            results.remove(res)
                        if res in results_by_id[cds]:
                            results_by_id[cds].remove(res)
    return results, results_by_id
Example #21
def run(seq_record, options):
    "run hmmscan against Pfam for all CDS features"
    if 'pfamdir' not in options:
        options.pfamdir = utils.get_full_path(__file__, '')

    query_sequence = utils.get_multifasta(seq_record)

    target_hmmfile = path.join(options.pfamdir, 'Pfam-A.hmm')

    logging.info('Running whole-genome pfam search')
    results = utils.run_hmmscan(target_hmmfile, query_sequence)

    _annotate(seq_record, options, results)
Example #22
def load_id_lines():
    sandpuma_dir = utils.get_full_path(__file__, 'sandpuma')
    fasta_file = path.join(sandpuma_dir, 'flat', 'fullset0_smiles.faa')

    id_lines = []

    with open(fasta_file, 'r') as fh:
        for line in fh:
            if not line.startswith(">"):
                continue

            id_lines.append(line.strip().lstrip(">"))

    return id_lines
Example #23
def get_supported_cluster_types():
    "Get a list of all supported cluster types"
    with open(utils.get_full_path(__file__, 'cluster_rules.txt'), "r") as handle:
        # skip the header line
        clustertypes = [line.split("\t")[0] for line in handle][1:]
    for fname in listdir(path.dirname(path.abspath(__file__))):
        dir_path = path.join(path.dirname(path.abspath(__file__)), fname)
        if path.isdir(dir_path):
            with open(path.join(dir_path, "cluster_rules.txt"), "r") as handle:
                clustertypes.extend([
                    fname + "/" + line.split("\t")[0] for line in handle
                ][1:])
    return clustertypes
Example #24
def filter_result_overlapping_genes(results, results_by_id, overlaps,
                                    feature_by_id):
    # filter results of overlapping genes (only gene with the best score can retain its result)
    filterhmm_list = []
    overlap_id_with_result = {}
    with open(utils.get_full_path(__file__, "filterhmmdetails.txt"),
              "r") as handle:
        filter_lines = handle.read().split("\n")
    for line in filter_lines:
        filterhmms = line.split(",")
        if filterhmms not in filterhmm_list:
            filterhmm_list.append(filterhmms)
    for cds in results_by_id.keys():
        if overlaps[1][cds] not in overlap_id_with_result.keys():
            overlap_id_with_result[overlaps[1][cds]] = [cds]
        elif cds not in overlap_id_with_result[overlaps[1][cds]]:
            overlap_id_with_result[overlaps[1][cds]].append(cds)
    for overlap_id in overlap_id_with_result.keys():
        best_hit_scores = {}
        for cds in overlap_id_with_result[overlap_id]:
            for hit in results_by_id[cds]:
                feature = feature_by_id[hit.hit_id]
                if (hit.query_id not in best_hit_scores) or (
                        best_hit_scores[hit.query_id] <
                        abs(feature.location.end - feature.location.start)):
                    best_hit_scores[hit.query_id] = abs(feature.location.end -
                                                        feature.location.start)
        for cds in overlap_id_with_result[overlap_id]:
            to_delete = []
            for hit in results_by_id[cds]:
                feature = feature_by_id[hit.hit_id]
                if (abs(feature.location.end - feature.location.start) <
                        best_hit_scores[hit.query_id]):
                    to_delete.append(hit)
                else:  # filter for filterhmmdetails.txt
                    for filterhmms in filterhmm_list:
                        if hit.query_id not in filterhmms:
                            continue
                        for similar_hit in filterhmms:
                            if similar_hit not in best_hit_scores.keys():
                                continue
                            if (abs(feature.location.end -
                                    feature.location.start) <
                                    best_hit_scores[similar_hit]):
                                to_delete.append(hit)
                                break
            for hit in to_delete:
                results.remove(hit)
                results_by_id[cds].remove(hit)
            # drop this gene's entry once it has no hits left
            if not results_by_id[cds]:
                del results_by_id[cds]
    return results, results_by_id
Example #25
def check_prereqs(options):
    "Check if all required applications are around"
    failure_messages = []
    for binary_name, optional in _required_binaries:
        if utils.locate_executable(binary_name) is None and not optional:
            failure_messages.append("Failed to locate file: %r" % binary_name)

    for file_name, optional in _required_files:
        full_path = path.join(utils.get_full_path(__file__, ''), file_name)
        if utils.locate_file(full_path) is None and not optional:
            failure_messages.append("Failed to locate file: %r" % file_name)

    return failure_messages
Example #26
File: js.py Project: chevrm/transPACT
def load_cog_annotations():
    "Load the smCOG type annotations from a file"
    type_keys = {
        'B': 'biosynthetic-additional',
        'T': 'transport',
        'R': 'regulatory',
        'O': 'other'
    }
    annotations = {}
    for line in open(utils.get_full_path(__file__, 'cog_annotations.txt'),
                     'r'):
        line = line.strip()
        if not line:
            continue
        # maxsplit=2 always yields three fields: id, description, type letter
        cog, _, type_ = line.split('\t', 2)
        annotations[cog] = type_keys.get(type_, 'other')

    return annotations
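
Inferred from the parsing above: cog_annotations.txt is expected to hold three tab-separated columns per line, with the single-letter type code last. The values below are hypothetical.

example_line = "SMCOG1000\texample description\tT"
# parses to: annotations["SMCOG1000"] == "transport"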
Example #27
def run_nrpspredictor(seq_record, nrpsnames, nrpsseqs, options):
    #NRPSPredictor: extract AMP-binding + 120 residues N-terminal of this domain, extract 8 Angstrom residues and insert this into NRPSPredictor
    with TemporaryDirectory(change=True):
        nrpsseqs_file = "nrpsseqs.fasta"
        NRPSPredictor2_dir = utils.get_full_path(__file__, "NRPSPredictor2")
        utils.writefasta(nrpsnames, nrpsseqs, nrpsseqs_file)
        #Get NRPSPredictor2 code predictions, output sig file for input for NRPSPredictor2 SVMs
        nrpscodepred.run_nrpscodepred(options)
        #Run NRPSPredictor2 SVM
        datadir = path.join(NRPSPredictor2_dir, 'data')
        libdir = path.join(NRPSPredictor2_dir, 'lib')
        jarfile = path.join(NRPSPredictor2_dir, 'build', 'NRPSpredictor2.jar')
        classpath = [
            jarfile,
            '%s/java-getopt-1.0.13.jar' % libdir,
            '%s/Utilities.jar' % libdir,
            '%s/libsvm.jar' % libdir
        ]
        # os.pathsep is ':' on POSIX and ';' on Windows, so the separator is
        # defined on every platform
        java_separator = os.pathsep
        commands = [
            'java',
            '-Ddatadir=%s' % datadir, '-cp',
            java_separator.join(classpath),
            'org.roettig.NRPSpredictor2.NRPSpredictor2', '-i', 'input.sig',
            '-r',
            path.join(
                options.raw_predictions_outputfolder,
                "ctg" + str(options.record_idx) + '_nrpspredictor2_svm.txt'),
            '-s', '1', '-b', options.eukaryotic and '1' or '0'
        ]
        out, err, retcode = utils.execute(commands)
        if err != '':
            logging.debug('running nrpspredictor2 gave error %r', err)
        #Copy NRPSPredictor results and move back to original directory
        try:
            os.remove(
                path.join(
                    options.raw_predictions_outputfolder, "ctg" +
                    str(options.record_idx) + "_nrpspredictor2_codes.txt"))
        except OSError:
            pass
        shutil.move(
            "ctg" + str(options.record_idx) + "_nrpspredictor2_codes.txt",
            options.raw_predictions_outputfolder)
Example #28
def check_prereqs():
    failure_messages = []
    for binary_name, optional in _required_binaries:
        if utils.locate_executable(binary_name) is None and not optional:
            failure_messages.append("Failed to locate executable for %r" %
                                    binary_name)

    for hmm in _markov_models:
        hmm = utils.get_full_path(__file__, hmm)
        if utils.locate_file(hmm) is None:
            failure_messages.append("Failed to locate file %r" % hmm)
            continue
        for ext in _binary_extensions:
            binary = "%s%s" % (hmm, ext)
            if utils.locate_file(binary) is None:
                _, err, retcode = utils.run_hmmpress(hmm)
                if retcode != 0:
                    failure_messages.append("Failed to hmmpress %r: %r" %
                                            (hmm, err))
                break
            else:
                binary_mtime = path.getmtime(binary)
                hmm_mtime = path.getmtime(hmm)
                if hmm_mtime > binary_mtime:
                    from glob import glob
                    try:
                        for f in glob("%s.h3?" % hmm):
                            logging.debug("removing outdated file %s", f)
                            os.remove(f)
                    except OSError as e:
                        failure_messages.append("Failed to remove outdated binary file for %s: %s" % \
                            (hmm, e))
                        break
                    _, err, retcode = utils.run_hmmpress(hmm)
                    if retcode != 0:
                        failure_messages.append("Failed to hmmpress %r: %r" %
                                                (hmm, err))
                        import datetime
                        failure_messages.append("HMM binary files outdated. %s (changed: %s) vs %s (changed: %s)" % \
                            (hmm, datetime.datetime.fromtimestamp(hmm_mtime),
                             binary, datetime.datetime.fromtimestamp(binary_mtime)))
                    break

    return failure_messages
Example #29
def generate_webpage(seq_records, options):
    d = pq(filename=utils.get_full_path(__file__, 'index.tpl'), parser='html')

    num = count_all_clusters(seq_records)
    set_title(d, seq_records[0].id, num)
    set_colourscheme(d, options)
    set_urls(d, options)
    set_version(d)
    set_download_links(d, seq_records[0].id, options)

    generate_searchgtr_htmls(seq_records, options)
    records = js.convert_records(seq_records, options)

    extra_data = dict(js_domains=[],
                      clusterblast_clusters=[],
                      subclusterblast_clusters=[],
                      knownclusterblast_clusters=[])

    if 'triggered_limit' in options and options.triggered_limit:
        add_truncation_notice(d, options)

    records_written = 0

    for i, record in enumerate(records):
        odd = True
        record['seq_id'] = utils.ascii_string(record['seq_id'])
        if len(record['clusters']) > 0:
            add_separator(d, record['seq_id'], record['orig_id'],
                          options)
        for cluster in record['clusters']:
            add_cluster(d, cluster, seq_records[i], options, extra_data, odd,
                        seq_records[0].id)
            records_written += 1
            odd = not odd

    if records_written == 0:
        add_no_result_note(d, options)

    write_geneclusters_js(records, options.outputfoldername, extra_data)

    with open(path.join(options.outputfoldername, 'index.html'), 'w') as h:
        h.write('<!doctype html>\n')
        h.write(d.outerHtml())
Example #30
def run(seq_record, options):
    "run hmmscan against Pfam for all CDS features"
    if 'pfamdir' not in options:
        options.pfamdir = utils.get_full_path(__file__, '')

    query_sequence = utils.get_multifasta(seq_record)

    target_hmmfile = path.join(options.pfamdir, 'Pfam-A.hmm')

    logging.info('Running whole-genome pfam search')

    if options.skip_cleanup:
        results_file = path.join(options.full_outputfolder_path, 'fullhmmer.txt')
        if path.exists(results_file):
            results = list(SearchIO.parse(results_file, 'hmmer3-text'))
        else:
            results = utils.run_hmmscan(target_hmmfile, query_sequence, results_file=results_file)
    else:
        results = utils.run_hmmscan(target_hmmfile, query_sequence)

    _annotate(seq_record, options, results)