Пример #1
0
def run_minowa_predictor_pks_at(pksnames, pksseqs, options):
    """Predict PKS AT domain substrate specificities.

    The given names and sequences are written to a FASTA file in the raw
    predictions output folder.  That file is then analysed twice, first by
    the PKS signature analysis and then by the Minowa AT predictor.  Each
    analysis is run from inside its own temporary working directory and is
    given an output path in the same raw predictions folder.

    Arguments:
        pksnames: the names passed to utils.writefasta
        pksseqs: the sequences passed to utils.writefasta
        options: provides raw_predictions_outputfolder and record_idx, which
                 together determine all file paths used here
    """
    out_dir = options.raw_predictions_outputfolder
    prefix = "ctg" + str(options.record_idx)
    fasta_name = prefix + "_pksseqs.fasta"

    # a single FASTA file is shared by both prediction methods
    utils.writefasta(pksnames, pksseqs, out_dir + os.sep + fasta_name)

    # PKS signature analysis
    logging.info(
        "Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences"
    )
    with TemporaryDirectory(change=True):
        PKS_analysis.run_pkssignature_analysis(
            path.join(out_dir, fasta_name),
            path.join(out_dir, prefix + "_pkssignatures.txt"))

    # Minowa method
    logging.info(
        "Predicting PKS AT domain substrate specificities by Minowa et al. method"
    )
    with TemporaryDirectory(change=True):
        minowa_AT.run_minowa_at(
            out_dir + os.sep + fasta_name,
            out_dir + os.sep + prefix + "_minowa_pkspredoutput.txt")
Пример #2
0
    def test__exit(self):
        """Check the calls traced when a TemporaryDirectory is exited"""
        # creating the object makes the directory, exiting removes it again
        expected_trace = """    Called tempfile.mkdtemp('', 'tmp', None)
    Called shutil.rmtree('/fake/tmp/dir')"""
        tmp = TemporaryDirectory()
        tmp.__exit__(None, None, None)
        assert_same_trace(self.tt, expected_trace)
Пример #3
0
    def test__exit(self):
        """Check the calls traced when a TemporaryDirectory is exited"""
        # creating the object makes the directory, exiting removes it again
        expected_trace = """    Called tempfile.mkdtemp('', 'tmp', None)
    Called shutil.rmtree('/fake/tmp/dir')"""
        tmp = TemporaryDirectory()
        tmp.__exit__(None, None, None)
        assert_same_trace(self.tt, expected_trace)
Пример #4
0
 def test__enter(self):
     """Check that entering a TemporaryDirectory returns the created path

     No 'change' argument is given, so the tracked working directory is
     expected to still be the old one afterwards.
     """
     tmp = TemporaryDirectory()
     entered = tmp.__enter__()
     self.assertEqual(entered, "/fake/tmp/dir")
     self.assertEqual(self.cwd, '/old/cur/dir')
     # only the directory creation should have been traced
     assert_same_trace(self.tt, """    Called tempfile.mkdtemp('', 'tmp', None)""")
Пример #5
0
 def test__enter(self):
     """Check that entering a TemporaryDirectory returns the created path

     No 'change' argument is given, so the tracked working directory is
     expected to still be the old one afterwards.
     """
     tmp = TemporaryDirectory()
     entered = tmp.__enter__()
     self.assertEqual(entered, "/fake/tmp/dir")
     self.assertEqual(self.cwd, '/old/cur/dir')
     # only the directory creation should have been traced
     assert_same_trace(self.tt, """    Called tempfile.mkdtemp('', 'tmp', None)""")
Пример #6
0
 def test_change_cwd(self):
     """Check that TemporaryDirectory(change=True) switches directory and back"""
     new_dir = "/fake/tmp/dir"
     previous_dir = '/old/cur/dir'
     expected_trace = """    Called tempfile.mkdtemp('', 'tmp', None)
 Called os.getcwd()
 Called os.chdir('/fake/tmp/dir')
 Called os.chdir('/old/cur/dir')
 Called shutil.rmtree('/fake/tmp/dir')"""
     tmp = TemporaryDirectory(change=True)

     # entering moves into the new directory and remembers the old one
     tmp.__enter__()
     self.assertEqual(self.cwd, new_dir)
     self.assertEqual(tmp.old_wd, previous_dir)

     # exiting moves back to where we started
     tmp.__exit__(None, None, None)
     self.assertEqual(self.cwd, previous_dir)
     assert_same_trace(self.tt, expected_trace)
Пример #7
0
def run_diamond(subcommand: str,
                opts: Optional[List[str]] = None) -> RunResult:
    """ Execute a single diamond subcommand.

        The configured diamond executable is invoked with the configured
        number of threads and a fresh temporary directory as its --tmpdir.

        Arguments:
            subcommand: the diamond subcommand to run
            opts: extra argument strings, appended after the standard ones

        Returns:
            the RunResult of the diamond run

        Raises:
            RuntimeError: if the run was not successful
    """
    config = get_config()
    with TemporaryDirectory() as temp_dir:
        command = [
            config.cb_diamond_executable,
            subcommand,
            "--threads", str(config.cpus),
            "--tmpdir", temp_dir,
        ] + list(opts or [])

        outcome = execute(command)
        if outcome.successful():
            return outcome
        # only the last 100 characters of stderr are included in the message
        raise RuntimeError("diamond failed to run: %s -> %s" % (subcommand, outcome.stderr[-100:]))
Пример #8
0
 def test_minimal(self):
     """Run antismash with --minimal while nrps_pks.run_on_record is patched to raise"""
     with TemporaryDirectory(change=True) as tempdir:
         arguments = ["--minimal", "--output-dir", tempdir]
         self.options = build_config(arguments, isolated=True,
                                     modules=antismash.get_all_modules())
         # any call into the patched function raises, failing the test
         blocker = patch.object(nrps_pks, "run_on_record",
                                side_effect=RuntimeError("shouldn't run"))
         with blocker:
             input_path = helpers.get_path_to_balhymicin_genbank()
             antismash.main.run_antismash(input_path, self.options)
Пример #9
0
 def test_depict_ectoine(self):
     """Generate an image from a SMILES string and check the expected files exist"""
     smiles = "CC1=NCCC(N1)C(=O)O"
     expected_files = ("genecluster0.smi",
                       "genecluster0.png",
                       "genecluster0_icon.png")
     with TemporaryDirectory(change=True) as temp:
         drawn = structure_drawer.generate_image(0, smiles, temp)
         assert drawn
         # the temporary directory is both the cwd and the structures directory
         for produced in expected_files:
             assert os.path.exists(produced)
Пример #10
0
def generate_trees(smcogs_dir: str, hmm_results: Dict[str, List[HSP]],
                   genes_within_clusters: List[CDSFeature],
                   nrpspks_genes: List[CDSFeature]) -> Dict[str, str]:
    """ Runs the smCOG tree analysis for every eligible cluster gene.

        A gene is eligible when its name is not among the names of
        nrpspks_genes and it has at least one entry in hmm_results.

        Arguments:
            smcogs_dir: passed through to smcog_tree_analysis for each gene
            hmm_results: maps gene name to its hits; the part of the first
                         hit's id before the first ':' is used as the smCOG
            genes_within_clusters: the candidate genes
            nrpspks_genes: genes that are never given a tree

        Returns:
            a dictionary mapping each found PNG filename, minus its '.png'
            extension, to the full filename
    """
    excluded_names = {feature.get_name() for feature in nrpspks_genes}
    logging.info("Calculating and drawing phylogenetic trees of cluster genes "
                 "with smCOG members")
    with TemporaryDirectory(change=True):
        selected = []
        for cds in genes_within_clusters:
            name = cds.get_name()
            if name in excluded_names or not hmm_results.get(name):
                continue
            selected.append(cds)
        args = [[cds, index, hmm_results[cds.get_name()][0].hit_id.split(":")[0], smcogs_dir]
                for index, cds in enumerate(selected)]
        subprocessing.parallel_function(smcog_tree_analysis, args)

    # NOTE(review): this relative glob runs after the temporary directory
    # context has been left, so it searches whichever directory is current at
    # that point - confirm that is where smcog_tree_analysis puts its images
    return {png.rsplit(".png", 1)[0]: png for png in glob.glob("*.png")}
Пример #11
0
def perform_docking_domain_analysis(options, clusterpksgenes, genecluster,
                                    seq_record, pksnrpsvars):
    """Rank possible PKS gene orders of a cluster using docking domain data.

    The first and last genes are determined, N- and C-terminal information is
    extracted from inside a temporary working directory, then all possible
    gene orders are ranked and written out as HTML.  The gene cluster is also
    appended to pksnrpsvars.dockingdomainanalysis.

    Arguments:
        options: passed through to write_gene_orders_to_html
        clusterpksgenes: the PKS genes of the cluster
        genecluster: the cluster being analysed
        seq_record: the record containing the cluster
        pksnrpsvars: provides domainnamesdict and collects analysed clusters
                     in dockingdomainanalysis

    Returns:
        the first entry of the gene orders from rank_biosynthetic_orders
    """
    features = utils.get_feature_dict(seq_record)
    first_gene, last_gene = find_first_and_last_genes(
        clusterpksgenes, pksnrpsvars.domainnamesdict)

    with TemporaryDirectory(change=True):
        data_dir = utils.get_full_path(__file__, "docking_analysis")
        nterm_results = extract_nterminus(data_dir, clusterpksgenes,
                                          seq_record, first_gene, features)
        cterm_results = extract_cterminus(data_dir, clusterpksgenes,
                                          seq_record, last_gene, features)

    candidate_orders = find_possible_orders(clusterpksgenes, first_gene,
                                            last_gene)
    geneorders, order_scores = rank_biosynthetic_orders(
        nterm_results, cterm_results, first_gene, last_gene,
        candidate_orders)

    # write html outfile with docking domain analysis output
    write_gene_orders_to_html(options, geneorders, order_scores,
                              genecluster, first_gene, last_gene)
    pksnrpsvars.dockingdomainanalysis.append(genecluster)
    return geneorders[0]
Пример #12
0
def check_diamond_db_compatible(database_file: str) -> bool:
    """ Checks whether a diamond database has the same format as a database
        freshly built with run_diamond_makedb.

        Arguments:
            database_file: the path to the database file to check

        Returns:
            True if the formats match, False if they differ or if the format
            of the given database could not be extracted
    """
    # build a tiny throwaway database to learn the current format
    with TemporaryDirectory(change=True):
        reference_fasta = "dummy.fa"
        reference_db = "dummy.dmnd"
        with open(reference_fasta, "w") as handle:
            handle.write(">test\nM\n")
        run_diamond_makedb(reference_db, reference_fasta)
        expected_format = _extract_db_format(reference_db)

    try:
        found_format = _extract_db_format(database_file)
    except ValueError:
        return False

    if found_format == expected_format:
        return True

    logging.debug(
        "Incompatible database format for %s. Expected %s but found %s.",
        database_file, expected_format, found_format)
    return False
Пример #13
0
def generate_image(cluster_number: int, smiles: str,
                   structures_dir: str) -> bool:
    """ Renders a SMILES string as a PNG image and an icon-sized PNG.

        All files are created in a temporary working directory and named
        after the cluster number. If the main image exists afterwards, the
        image, the icon and the SMILES file are copied to structures_dir.

        Arguments:
            cluster_number: the number used in the generated filenames
            smiles: the SMILES string to render
            structures_dir: the directory the generated files are copied to

        Returns:
            True if the files were generated and copied, False if the main
            image was missing after rendering
    """
    base = "genecluster%d" % cluster_number
    png = base + ".png"
    smi = base + ".smi"
    icon = base + "_icon.png"

    with TemporaryDirectory(change=True):
        with open(smi, "w") as handler:
            handler.write(smiles)

        indigo = Indigo()
        molecule = indigo.loadMoleculeFromFile(smi)
        renderer = IndigoRenderer(indigo)
        # the render-coloring option only exists once the renderer does
        indigo.setOption("render-coloring", True)
        renderer.renderToFile(molecule, png)

        indigo.setOption("render-image-size", 200, 150)
        renderer.renderToFile(molecule, icon)

        # indigo should raise an exception on failure, but check just in case
        if png not in os.listdir(os.getcwd()):
            return False

        # copy each result to the output directory, then drop the local copy
        for produced in (png, icon, smi):
            shutil.copy(produced, structures_dir)
            os.remove(produced)
    return True
Пример #14
0
    def test_classification_with_colon(self):
        """Check SMCOG descriptions that themselves contain ':' are handled"""
        # the SMCOG id and description are stored in one string separated
        # by ':', so a description containing ':' must not break parsing
        # the test gene is AQF52_5530 from CP013129.1
        translation = (
            "MDTHQREEDPVAARRDRTHYLYLAVIGAVLLGIAVGFLAPGVAVELKPLGTGFVN"
            "LIKMMISPIIFCTIVLGVGSVRKAAKVGAVGGLALGYFLVMSTVALAIGLLVGNL"
            "LEPGSGLHLTKEIAEAGAKQAEGGGESTPDFLLGIIPTTFVSAFTEGEVLQTLLV"
            "ALLAGFALQAMGAAGEPVLRGIGHIQRLVFRILGMIMWVAPVGAFGAIAAVVGAT"
            "GAAALKSLAVIMIGFYLTCGLFVFVVLGAVLRLVAGINIWTLLRYLGREFLLILS"
            "TSSSESALPRLIAKMEHLGVSKPVVGITVPTGYSFNLDGTAIYLTMASLFVAEAM"
            "GDPLSIGEQISLLVFMIIASKGAAGVTGAGLATLAGGLQSHRPELVDGVGLIVGI"
            "DRFMSEARALTNFAGNAVATVLVGTWTKEIDKARVTEVLAGNIPFDEKTLVDDHA"
            "PVPVPDQRAEGGEEKARAGV")
        cds = helpers.DummyCDS(0, len(translation))
        cds.translation = translation

        classification = smcogs.classify.classify_genes([cds])
        best_hit = classification[cds.get_name()][0]
        assert best_hit.hit_id == "SMCOG1212:sodium:dicarboxylate_symporter"

        record = helpers.DummyRecord(seq=translation)
        record.add_cds_feature(cds)
        record.add_cluster(helpers.DummyCluster(0, len(translation)))

        with TemporaryDirectory(change=True):
            results = smcogs.run_on_record(record, None, self.options)
            # this call crashes if the extra ':' separators are mishandled
            results.add_to_record(record)
            smcog_functions = cds.gene_functions.get_by_tool("smcogs")
            assert len(smcog_functions) == 1
            description = str(smcog_functions[0])
            assert description.startswith(
                "transport (smcogs) SMCOG1212:sodium:dicarboxylate_symporter"
                " (Score: 416; E-value: 2.3e-126)")
Пример #15
0
    def test_trees(self):
        """Run smcogs on the record, then check tree images, JSON round trips
        and the annotations added to the record's CDS features"""
        other = secmet.feature.GeneFunction.OTHER
        core = secmet.feature.GeneFunction.CORE
        with TemporaryDirectory(change=True):
            results = smcogs.run_on_record(self.record, None, self.options)
            assert len(results.tree_images) == 7
            for image in results.tree_images.values():
                image_path = os.path.join(results.relative_tree_path, image)
                assert os.path.exists(image_path)

            # the results must survive conversion to JSON and back
            serialised = results.to_json()
            rebuilt = smcogs.SMCOGResults.from_json(serialised, self.record)
            assert rebuilt.to_json() == serialised
            regenerated = smcogs.regenerate_previous_results(
                serialised, self.record, self.options)
            assert regenerated.to_json() == serialised

            # before adding to the record, genes with hits are unannotated
            for cds in self.record.get_cluster(0).cds_children:
                if not results.best_hits.get(cds.get_name()):
                    continue
                assert not cds.notes
                assert cds.gene_function in [other, core]

            results.add_to_record(self.record)
            for cds in self.record.get_cluster(0).cds_children:
                if cds.sec_met:
                    continue  # nothing is done with these, so nothing to check
                if not results.best_hits.get(cds.get_name()):
                    assert cds.gene_function == other
                    continue
                assert cds.get_name() in results.tree_images
                assert len(cds.notes) == 1
                assert cds.gene_function != other
Пример #16
0
def run_prodigal(record: Record, options: ConfigType) -> None:
    """ Run prodigal to annotate prokaryotic sequences

        The record is written as FASTA into a temporary working directory,
        prodigal is run on it with 'sco' output, and every parsed gene line
        is added to the record as a CDSFeature.

        Arguments:
            record: the Record to find genes in and add CDS features to
            options: the config; the optional 'prodigal' section may supply a
                     basedir for the executable, and genefinding_tool selects
                     whether '-p meta' is used

        Returns:
            None

        Raises:
            RuntimeError: if prodigal's stderr contains 'Error'
    """
    if "basedir" in options.get('prodigal', ''):
        basedir = options.prodigal.basedir
    else:
        basedir = ""
    with TemporaryDirectory(change=True):
        name = record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([record.to_biopython()], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        # '-p meta' is used when explicitly requested or for short records
        if options.genefinding_tool == "prodigal-m" or len(record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])

        err = execute(prodigal).stderr
        if 'Error' in err:
            logging.error("Failed to run prodigal: %r", err)
            raise RuntimeError("prodigal error: %s" % err)

        found = 0
        # the context manager ensures the result file is closed again, even
        # if adding a feature raises part way through
        with open(result_file, 'r') as results:
            for line in results:
                # only lines starting with '>' describe a gene
                if not line.startswith('>'):
                    continue

                # a wrong number of fields raises ValueError just like a
                # non-numeric coordinate, so both are reported and skipped
                try:
                    gene_name, start_chunk, end_chunk, prodigal_strand = (
                        line[1:].rstrip().split("_"))
                    start = int(start_chunk)
                    end = int(end_chunk)
                except ValueError:
                    logging.error('Malformatted prodigal output line %r',
                                  line.rstrip())
                    continue

                strand = 1 if prodigal_strand == "+" else -1
                if start > end:
                    strand = -1
                    start, end = end, start

                # NOTE(review): start is decremented by one, presumably to
                # convert 1-based prodigal coordinates - confirm
                loc = FeatureLocation(start - 1, end, strand=strand)
                translation = record.get_aa_translation_from_location(loc)
                feature = CDSFeature(loc,
                                     locus_tag='ctg%s_%s' %
                                     (record.record_index, gene_name),
                                     translation=translation,
                                     translation_table=record.transl_table)
                record.add_cds_feature(feature)
                found += 1
    logging.debug("prodigal found %d CDS features", found)
Пример #17
0
    def test_trees(self):
        """Run smcog_trees on the record, then check tree images, JSON round
        trips and the annotations added to the record's CDS features"""
        with TemporaryDirectory(change=True):
            # the tree building needs smcogs classifications to work with
            classification = genefunctions.smcogs.classify(
                self.record.id, self.record.get_cds_features(), self.options)
            classification.add_to_record(self.record)

            results = smcog_trees.run_on_record(self.record, None,
                                                self.options)
            assert len(results.tree_images) == 7
            for image in results.tree_images.values():
                image_path = os.path.join(results.relative_tree_path, image)
                assert os.path.exists(image_path)

            # the results must survive conversion to JSON and back
            serialised = results.to_json()
            rebuilt = smcog_trees.SMCOGTreeResults.from_json(serialised,
                                                             self.record)
            assert rebuilt.to_json() == serialised
            regenerated = smcog_trees.regenerate_previous_results(
                serialised, self.record, self.options)
            assert isinstance(regenerated, smcog_trees.SMCOGTreeResults), serialised
            assert regenerated.to_json() == serialised

        results.add_to_record(self.record)
        for cds in self.record.get_cds_features():
            functions = cds.gene_functions
            # skip genes nothing is done with, and genes without smcogs hits
            if functions.get_by_tool("rule-based-clusters") or not functions.get_by_tool("smcogs"):
                continue
            assert cds.get_name() in results.tree_images
            assert len(cds.notes) == 1
            assert cds.gene_function != secmet.qualifiers.GeneFunction.OTHER
Пример #18
0
 def test_record_to_json_and_back(self):
     """Check that results written to a file handle and read back are unchanged

     The JSON forms must be equal, the number of records must match, and the
     data streams created from the old and new records must agree line by
     line.
     """
     filename = get_path_to_nisin_genbank()
     # parse fully inside the context manager so the input file gets closed
     with open(filename) as handle:
         records = list(seqio.parse(handle, "genbank"))
     records = [
         Record.from_biopython(rec, taxon="bacteria") for rec in records
     ]
     rec_results = [{}, {}, {}]
     results = serialiser.AntismashResults(filename, records, rec_results,
                                           "dummy")
     json_handle = StringIO()
     results.write_to_file(json_handle)
     json_handle.seek(0)
     new_results = serialiser.AntismashResults.from_file(json_handle,
                                                         taxon="bacteria")
     assert results.to_json() == new_results.to_json()
     # check no records were lost
     assert len(new_results.records) == len(results.records)
     # check that the contents of the records is the same
     #  by converting to biopython and writing to genbanks
     original = self.create_data_stream(results.records)
     new = self.create_data_stream(new_results.records)
     oldvalue = original.getvalue()
     newvalue = new.getvalue()
     with TemporaryDirectory(change=True):
         # both values are written out with explicitly closed handles
         with open("old.json", "w") as handle:
             handle.write(oldvalue)
         with open("new.json", "w") as handle:
             handle.write(newvalue)
         for oldline, newline in zip(oldvalue.split('\n'),
                                     newvalue.split('\n')):
             assert oldline == newline
Пример #19
0
def perform_subclusterblast(options: ConfigType, record: Record, clusters: Dict[str, ReferenceCluster],
                            proteins: Dict[str, Protein]) -> GeneralResults:
    """ Runs the subclusterblast search for every region of a record, scoring
        and ranking the hits of each region separately

        Arguments:
            options: antismash Config
            record: the Record to analyse
            clusters: a dictionary mapping reference cluster name to ReferenceCluster
            proteins: a dictionary mapping reference protein name to Protein

        Returns:
            a GeneralResults instance holding one RegionResult for each
            region in the record
    """
    results = GeneralResults(record.id, search_type="subclusterblast")
    with TemporaryDirectory(change=True):
        core_gene_ids = get_core_gene_ids(record)
        for region in record.get_regions():
            # write the region's genes as search input, then run the search
            write_fastas_with_all_genes([region], "input.fasta",
                                        partitions=options.cpus)
            run_clusterblast_processes(options)
            raw_output = read_clusterblast_output(options)
            write_raw_clusterblastoutput(options.output_dir, raw_output, prefix="subclusterblast")

            # parse the raw output and score the reference clusters
            _queries, queries_by_cluster = blastparse(raw_output, record,
                                                      min_seq_coverage=40,
                                                      min_perc_identity=45)
            ranking = score_clusterblast_output(clusters, core_gene_ids, queries_by_cluster)
            logging.debug("Cluster at %s has %d subclusterblast results", region.location, len(ranking))

            # keep the ranking for this region
            results.add_region_result(RegionResult(region, ranking, proteins, "subclusterblast"),
                                      clusters, proteins)
    return results
Пример #20
0
def internal_homology_blast(record: secmet.Record) -> Dict[int, List[List[str]]]:
    """ Searches the proteins of each cluster against themselves and groups
        the queries, as determined by find_internal_orthologous_groups

        Arguments:
            record: the Record to generate groups from

        Returns:
            a dictionary mapping cluster_number to
                a list containing distinct groups represented by
                    lists of query ids
    """
    query_filename = "internal_input.fasta"
    groups_by_cluster = {}
    with TemporaryDirectory(change=True):
        logging.info("Finding internal homologs in each gene cluster...")
        for cluster in record.get_clusters():
            cluster_number = cluster.get_cluster_number()
            names, seqs = create_blast_inputs(cluster)
            # the same input file is rewritten for every cluster
            fasta.write_fasta(names, seqs, query_filename)
            blast_output = run_internal_blastsearch(query_filename)
            queries, _ = blastparse(blast_output, record, min_seq_coverage=25,
                                    min_perc_identity=30)
            groups_by_cluster[cluster_number] = find_internal_orthologous_groups(queries, names)
    return groups_by_cluster
Пример #21
0
def run_minowa_predictor_pks_cal(pksnrpscoregenes, domaindict, seq_record,
                                 options):
    """Predict CAL domain substrate specificities with the Minowa predictor.

    For every given gene, each entry of its domain details whose first item
    is "CAL_domain" contributes one sequence: the gene's amino acid sequence
    sliced with the entry's second and third items.  If any sequences were
    collected, they are written to a FASTA file in the raw predictions output
    folder and the predictor is run on it from a temporary working directory.

    Arguments:
        pksnrpscoregenes: the gene features to search for CAL domains
        domaindict: maps gene id to that gene's domain details
        seq_record: not used by this function
        options: provides raw_predictions_outputfolder and record_idx

    Returns:
        a tuple of the list of sequence names and the list of sequences
    """
    logging.info(
        "Predicting CAL domain substrate specificities by Minowa et al. method"
    )
    calnames = []
    calseqs = []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        cal_domains = (tab for tab in domaindict[locus] if tab[0] == "CAL_domain")
        # domains are numbered from 1 within each gene
        for count, tab in enumerate(cal_domains, start=1):
            start, end = int(tab[1]), int(tab[2])
            seq = str(utils.get_aa_sequence(feature))[start:end]
            calnames.append(locus + "_CAL" + str(count))
            calseqs.append(seq)

    if not calnames:
        return calnames, calseqs

    prefix = options.raw_predictions_outputfolder + os.sep + "ctg" + str(options.record_idx)
    fasta_path = prefix + "_calseqs.fasta"
    utils.writefasta(calnames, calseqs, fasta_path)
    with TemporaryDirectory(change=True):
        minowa_CAL.run_minowa_cal(fasta_path, prefix + "_minowa_calpredoutput.txt")
    return calnames, calseqs
Пример #22
0
def generate_trees(smcogs_dir: str, genes_within_clusters: List[CDSFeature],
                   nrpspks_genes: List[CDSFeature]) -> Dict[str, str]:
    """ Runs the smCOG tree analysis for every eligible cluster gene.

        A gene is eligible when its name is not among the names of
        nrpspks_genes and it has at least one gene function from the
        "smcogs" tool.

        Arguments:
            smcogs_dir: passed through to smcog_tree_analysis for each gene
            genes_within_clusters: the candidate genes
            nrpspks_genes: genes that are never given a tree

        Returns:
            a dictionary mapping each found PNG filename, minus its '.png'
            extension, to the full filename
    """
    excluded_names = {feature.get_name() for feature in nrpspks_genes}
    logging.info("Calculating and drawing phylogenetic trees of cluster genes "
                 "with smCOG members")
    selected = [cds for cds in genes_within_clusters
                if cds.get_name() not in excluded_names
                and cds.gene_functions.get_by_tool("smcogs")]

    with TemporaryDirectory(change=True):
        # the smCOG is the part of the first function's description before ':'
        args = [[cds, index,
                 cds.gene_functions.get_by_tool("smcogs")[0].description.split(":")[0],
                 smcogs_dir]
                for index, cds in enumerate(selected)]
        subprocessing.parallel_function(smcog_tree_analysis, args)

    # NOTE(review): this relative glob runs after the temporary directory
    # context has been left, so it searches whichever directory is current at
    # that point - confirm that is where smcog_tree_analysis puts its images
    return {png.rsplit(".png", 1)[0]: png for png in glob.glob("*.png")}
def run_diamond(query_file: str, database_file: str, mode: str = "blastp",
                opts: Optional[List[str]] = None) -> str:
    """ Compares a query file to a database with diamond.

        Diamond is given the configured number of threads and a fresh
        temporary directory as its --tmpdir.

        Arguments:
            query_file: the path of query sequence file
            database_file: the path of the database to compare to
            mode: the diamond mode to use (defaults to blastp)
            opts: extra argument strings, appended after the standard ones

        Returns:
            the stdout of the diamond run

        Raises:
            RuntimeError: if the run was not successful
    """
    with TemporaryDirectory() as temp_dir:
        command = [
            "diamond",
            mode,
            "--db", database_file,
            "--threads", str(get_config().cpus),
            "--query", query_file,
            "--tmpdir", temp_dir,
        ] + list(opts or [])
        outcome = execute(command)
        if outcome.successful():
            return outcome.stdout
        # only the last 100 characters of stderr are included in the message
        raise RuntimeError("diamond failed to run: %s -> %s" % (command, outcome.stderr[-100:]))
Пример #24
0
def run_kr_stereochemistry_predictions(pksnrpscoregenes, domaindict,
                                       seq_record, options):
    """Predict PKS KR domain activity and stereochemistry.

    For every given gene, each entry of its domain details whose first item
    is "PKS_KR" contributes one sequence: the gene's amino acid sequence
    sliced with the entry's second and third items.  If any sequences were
    collected, they are written to a FASTA file in the raw predictions output
    folder and the KR analysis is run on it from a temporary working
    directory.

    Arguments:
        pksnrpscoregenes: the gene features to search for KR domains
        domaindict: maps gene id to that gene's domain details
        seq_record: not used by this function
        options: provides raw_predictions_outputfolder and record_idx

    Returns:
        a tuple of the list of sequence names and the list of sequences
    """
    logging.info("Predicting PKS KR activity and stereochemistry using KR " \
        "fingerprints from Starcevic et al.")
    krnames = []
    krseqs = []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        kr_domains = (tab for tab in domaindict[locus] if tab[0] == "PKS_KR")
        # domains are numbered from 1 within each gene
        for count, tab in enumerate(kr_domains, start=1):
            start, end = int(tab[1]), int(tab[2])
            seq = str(utils.get_aa_sequence(feature))[start:end]
            krnames.append(locus + "_KR" + str(count))
            krseqs.append(seq)

    if not krnames:
        return krnames, krseqs

    prefix = options.raw_predictions_outputfolder + os.sep + "ctg" + str(options.record_idx)
    fasta_path = prefix + "_krseqs.fasta"
    utils.writefasta(krnames, krseqs, fasta_path)
    with TemporaryDirectory(change=True):
        kr_analysis.run_kr_analysis(fasta_path, prefix + "_krpredoutput.txt")
    return krnames, krseqs
Пример #25
0
 def test_minimal(self):
     """Run antismash with --minimal and check the tracker recorded nothing"""
     with TemporaryDirectory(change=True) as tempdir:
         arguments = ["--minimal", "--output-dir", tempdir]
         self.options = build_config(arguments, isolated=True,
                                     modules=antismash.get_all_modules())
         input_path = helpers.get_path_to_balhymicin_genbank()
         antismash.main.run_antismash(input_path, self.options)
     # an empty trace means nothing tracked was called
     minimock.assert_same_trace(self.tracker, "")
Пример #26
0
 def test_bad_partitions(self):
     """Check invalid partition counts are rejected with the right error"""
     # each case: expected error type, expected message, values to try
     cases = [
         (ValueError, "Partitions must be greater than 0", [-10, -1, 0]),
         (TypeError, "Partitions must be an int greater than 0", ["str", None, 1.5]),
     ]
     with TemporaryDirectory(change=True):
         for error, message, bad_values in cases:
             for partitions in bad_values:
                 with self.assertRaisesRegex(error, message):
                     core.write_fastas_with_all_genes(self.clusters, "test", partitions=partitions)
Пример #27
0
 def test_single_partition(self):
     """Check a single partition writes exactly one file with every gene in it"""
     self.dummy_cluster.cds_children = [DummyCDS(1, 3)] * 3
     with TemporaryDirectory(change=True):
         files = core.write_fastas_with_all_genes(self.clusters, "test.fasta", partitions=1)
         assert files == ["test.fasta"]
         assert os.path.exists("test.fasta")
         expected = "".join(">L{0}\nS{0}\n".format(i) for i in range(len(self.clusters)*3))
         # read through a context manager so the handle is closed again
         with open("test.fasta") as handle:
             contents = handle.read()
         assert contents == expected
Пример #28
0
def run_and_regenerate_results_for_module(input_file,
                                          module,
                                          options,
                                          expected_record_count=1,
                                          callback=None):
    """ Runs antismash end to end over the given file with the given options
        and returns the given modules regenerated results

        The run uses a temporary directory as output directory; the
        configured output directory is restored afterwards, whether or not
        the run succeeded.

        Arguments:
            input_file: the path of the file to run antismash over
            module: the module to return regenerated results for
            options: the antismash config to run with
            expected_record_count: the number of records and of results that
                                   the run must produce
            callback: if supplied, it will be called with the output
                      directory path as an argument before the output
                      directory is cleared

        Returns:
            the regenerated results of the module if expected_record_count
            is 1, otherwise a list with the regenerated results of the module
            for each record
    """
    with TemporaryDirectory(change=True) as tempdir:
        orig_output = options.output_dir
        update_config({"output_dir": tempdir})
        # the original output directory must be restored no matter how the
        # run ends, including when the pre-run check below fails
        try:
            json_filename = os.path.join(
                options.output_dir,
                os.path.basename(input_file).rsplit('.', 1)[0] + ".json")
            assert not os.path.exists(json_filename)
            antismash.main.run_antismash(input_file, options)
        finally:
            update_config({"output_dir": orig_output})
        results = serialiser.AntismashResults.from_file(
            json_filename, options.taxon)
        # remove things that were added by results, because otherwise the add isn't tested by detection
        # result regeneration
        # this should eventually include every feature and qualifier created by antismash
        for record in results.records:
            record.clear_antismash_domains()
            record.clear_cds_motifs()
        if callback:
            callback(tempdir)
    # not the responsibility of modules, but if it's wrong then everything is
    assert len(results.results) == expected_record_count
    assert len(results.records) == expected_record_count
    # ensure all detection stages add their relevant parts
    modules_to_regenerate = antismash.main.get_detection_modules()
    # don't try and regenerate twice
    if module not in modules_to_regenerate:
        modules_to_regenerate.append(module)
    if expected_record_count == 1:
        regenerated = antismash.main.regenerate_results_for_record(
            results.records[0], options, modules_to_regenerate,
            results.results[0])
        final = regenerated[module.__name__]
        assert isinstance(final, module_results.ModuleResults)
    else:
        # with multiple records, only the requested module is regenerated
        regenerated = [
            antismash.main.regenerate_results_for_record(
                record, options, [module], res)
            for record, res in zip(results.records, results.results)
        ]
        final = [result[module.__name__] for result in regenerated]
        for res in final:
            assert isinstance(res, module_results.ModuleResults)
    return final
Пример #29
0
def perform_subclusterblast(options, seq_record, clusters, proteinlocations,
                            proteinstrands, proteinannotations, proteintags):
    """Run subcluster BLAST searches for each gene cluster of a record.

    For every cluster feature of ``seq_record`` the query inputs are written,
    BLAST is run with ``searchtype="subclusters"``, the raw output is stored
    under ``options.full_outputfolder_path``, and the parsed and scored hits
    are handed to ``write_clusterblast_output``.

    When ``options.debug`` is set and a result file for a cluster already
    exists below ``options.dbgclusterblast``, that cluster is skipped
    entirely.

    Arguments:
        options: run configuration; ``debug``, ``dbgclusterblast`` and
                 ``full_outputfolder_path`` are read here
        seq_record: the record whose clusters are searched
        clusters: passed to the scoring step and stored with the results
        proteinlocations: stored with the results
        proteinstrands: stored with the results
        proteinannotations: stored with the results
        proteintags: stored with the results

    Returns:
        None
    """
    #Run BLAST on gene cluster proteins of each cluster and parse output
    logging.info("Running NCBI BLAST+ subcluster searches..")
    # thresholds handed to parse_blast; constant for every cluster
    minseqcoverage = 40
    minpercidentity = 45
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True):
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            if options.debug:
                # only touch options.dbgclusterblast in debug mode, exactly
                # as the short-circuiting condition did before
                cached_result = os.sep.join([
                    options.dbgclusterblast, "subclusterblast",
                    "cluster" + str(clusternumber) + ".txt"
                ])
                if os.path.exists(cached_result):
                    # the full path is the format argument; previously only
                    # the directory was formatted in and the remainder was
                    # appended after the end of the sentence
                    logging.debug(
                        "Skipping SubClusterblast calculations, using results from %s instead",
                        cached_result)
                    continue

            logging.info("   Gene cluster %s", clusternumber)
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            write_clusterblast_inputfiles(options, queryclusternames,
                                          queryclusterseqs)
            run_clusterblast_processes(options, searchtype="subclusters")
            blastoutput = read_clusterblast_output(options)
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput,
                                         searchtype="subclusters")
            logging.info("   Blast search finished. Parsing results...")
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # store all clusterblast related data in a utils.Storage object and serialize it
            subclusterblastStorage = utils.Storage()
            subclusterblastStorage.clusternumber = clusternumber
            subclusterblastStorage.queryclusterprots = queryclusterprots
            subclusterblastStorage.clusters = clusters
            subclusterblastStorage.hitclusterdata = hitclusterdata
            subclusterblastStorage.rankedclusters = rankedclusters
            subclusterblastStorage.rankedclustervalues = rankedclustervalues
            subclusterblastStorage.proteintags = proteintags
            subclusterblastStorage.proteinlocations = proteinlocations
            subclusterblastStorage.proteinannotations = proteinannotations
            subclusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options,
                                      seq_record,
                                      subclusterblastStorage,
                                      searchtype="subclusters")
Пример #30
0
def perform_knownclusterblast(options: ConfigType, record: Record,
                              reference_clusters: Dict[str, ReferenceCluster],
                              proteins: Dict[str, Protein]) -> GeneralResults:
    """ Search the proteins of every cluster in a record against the known
        cluster protein database with DIAMOND, then rank the reference
        clusters that were hit for each cluster.

        Only compares clusters to known clusters from the MIBiG database

        Arguments:
            options: antismash Config
            record: the Record to analyse
            reference_clusters: a dictionary mapping reference cluster name
                                to ReferenceCluster
            proteins: a dictionary mapping reference protein name to Protein

        Returns:
            a GeneralResults instance storing results for all clusters in the
            record
    """
    logging.debug("Running DIAMOND knowncluster searches..")
    search_type = "knownclusterblast"
    results = GeneralResults(record.id, search_type=search_type)

    # the search runs inside a throwaway working directory, so the bare
    # filenames below are relative to it
    query_file = "input.fasta"
    with TemporaryDirectory(change=True) as tempdir:
        write_fastas_with_all_genes(record.get_clusters(), query_file)
        database = _get_datafile_path('knownclusterprots')
        run_diamond(query_file, database, tempdir, options)
        with open("input.out", 'r') as handle:
            raw_hits = handle.read()
        # keep a copy of the raw output in the real output directory
        write_raw_clusterblastoutput(options.output_dir,
                                     raw_hits,
                                     prefix=search_type)

    hits_by_cluster, _unused = parse_all_clusters(raw_hits,
                                                  record,
                                                  min_seq_coverage=40,
                                                  min_perc_identity=45)
    core_ids = get_core_gene_ids(record)

    for cluster in record.get_clusters():
        # clusters without any hits are scored against an empty mapping
        queries_by_reference = hits_by_cluster.get(
            cluster.get_cluster_number(), {})
        ranking = score_clusterblast_output(reference_clusters, core_ids,
                                            queries_by_reference)
        # store results
        cluster_result = ClusterResult(cluster, ranking, proteins,
                                       search_type)
        results.add_cluster_result(cluster_result, reference_clusters,
                                   proteins)

        write_clusterblast_output(options,
                                  record,
                                  cluster_result,
                                  proteins,
                                  searchtype=search_type)

    results.mibig_entries = mibig_protein_homology(raw_hits, record,
                                                   reference_clusters)
    return results
Пример #31
0
 def test_single_file(self):
     """ Check that writing all genes of the regions to one fasta file
         returns that single filename and produces the expected contents
     """
     self.add_cdses_to_region([DummyCDS(1, i) for i in range(3, 6)])
     with TemporaryDirectory(change=True):
         files = core.write_fastas_with_all_genes(self.regions,
                                                  "test.fasta")
         assert files == ["test.fasta"]
         assert os.path.exists("test.fasta")
         expected = "".join(">L{0}\nS{0}\n".format(i)
                            for i in range(len(self.regions) * 3))
         # read via a context manager so the handle is closed before the
         # temporary directory is left
         with open("test.fasta") as handle:
             contents = handle.read()
         assert contents == expected
Пример #32
0
 def test_classifier(self):
     """ Check that running smcogs on the record writes the expected
         classification file and that the results survive a JSON round trip
     """
     # the reference output lives next to this test file
     with open(path.get_full_path(__file__, "data",
                                  "nisin.txt")) as handle:
         expected = handle.readlines()
     with TemporaryDirectory(change=True):
         results = smcogs.run_on_record(self.record, None, self.options)
         # close the handle explicitly before the temporary directory is left
         with open("smcogs/smcogs.txt") as handle:
             contents = handle.readlines()
         assert contents == expected
         # named 'serialised' rather than 'json' to avoid shadowing the
         # standard library module name
         serialised = results.to_json()
         assert smcogs.SMCOGResults.from_json(
             serialised, self.record).to_json() == serialised
Пример #33
0
 def run_antismash(self, filename, expected):
     """ Run and regenerate clusterblast results for the given input file,
         writing output into a temporary directory, and check them.

         Arguments:
             filename: the input file handed to the helper
             expected: the expected number of ranking entries for the single
                       region result, also passed to the SVG check

         Returns:
             the module results as extracted by self.get_results()
     """
     with TemporaryDirectory() as output_dir:
         update_config({"output_dir": output_dir})
         try:
             results = helpers.run_and_regenerate_results_for_module(filename, clusterblast, self.options)
         finally:
             # reset the config even when the run fails, otherwise the
             # temporary directory path stays in the config after the
             # directory itself is gone
             update_config({"output_dir": ""})
         results, global_results = self.get_results(results)
         assert len(results.region_results) == 1
         cluster = results.region_results[0]
         assert len(cluster.ranking) == expected  # will change if database does
         self.check_svgs(global_results, expected, output_dir)
     return results