    def test_check_prereqs_missing_executables(self):
        options = build_config(["--check-prereqs"], isolated=True, modules=get_all_modules())
        update_config({"executables": Namespace()})
        mock("antismash.config.get_config", returns=options)
        assert hasattr(get_config(), "executables")
        assert not get_config().executables.__dict__
        with self.assertRaisesRegex(RuntimeError, "failing prereq"):
            antismash.main.check_prerequisites(get_all_modules(), options)
Example No. 2
def run_diamond(subcommand: str,
                opts: Optional[List[str]] = None) -> RunResult:
    """ Run a diamond subcommand, possibly with further options.

        Arguments:
            subcommand: the diamond subcommand to run
            opts: a list of additional argument strings to pass to diamond

        Returns:
            RunResult of running diamond
    """
    config = get_config()
    with TemporaryDirectory() as temp_dir:
        params = [
            config.cb_diamond_executable,
            subcommand,
            "--threads", str(config.cpus),
            "--tmpdir", temp_dir,
        ]

        if opts:
            params.extend(opts)

        result = execute(params)
        if not result.successful():
            raise RuntimeError("diamond failed to run: %s -> %s" % (subcommand, result.stderr[-100:]))
    return result
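
A minimal usage sketch: the file names are hypothetical, and it assumes the run_diamond() above and a built antiSMASH config are in scope.

# hypothetical inputs; "--in" and "--db" are standard diamond makedb options
result = run_diamond("makedb", ["--in", "proteins.fasta", "--db", "proteins"])
print(result.stdout)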
Example No. 3
def store_percentage_identities(seq_record):
    clusters = utils.get_cluster_features(seq_record)
    cfg = config.get_config()
    for cluster in clusters:
        features = [
            feature
            for feature in utils.get_cluster_cds_features(cluster, seq_record)
            if 'sec_met' in feature.qualifiers
        ]
        cdhit_table, gene_to_cluster = utils.get_cdhit_table(
            features, float(cfg.cdh_display_cutoff))
        for cdhit_cluster in cdhit_table:
            if len(cdhit_cluster["genes"]) > 1:
                cl_features = [
                    feature for feature in features if utils.get_gene_id(
                        feature) in cdhit_cluster["genes"].keys()
                ]
                pct_table = utils.get_pct_identity_table(cl_features)
                for cds in cl_features:
                    result = ",".join([
                        "%s=%s" %
                        (othercds, pct_table[utils.get_gene_id(cds)][othercds])
                        for othercds in pct_table[utils.get_gene_id(
                            cds)].keys()
                    ])
                    # drop any existing annotation first; a bare 'del ann' would
                    # only unbind the loop variable, not remove the list entry
                    cds.qualifiers['sec_met'] = [
                        ann for ann in cds.qualifiers['sec_met']
                        if not ann.startswith("Percentage identity")
                    ]
                    cds.qualifiers['sec_met'].append(
                        "Percentage identity: %s" % (result))
Example No. 4
def create_rules_dict(enabled_clustertypes):
    "Create a cluster rules dictionary from the cluster rules file"
    rulesdict = {}
    cfg = config.get_config()
    for hmm_model in cfg.enabled_detection_models:
        first = True  # reset per rule file, as each starts with its own legend line
        dir_path = path.dirname(path.abspath(__file__))
        prefix = ""
        if hmm_model != "default":
            dir_path = path.join(dir_path, hmm_model)
            prefix = hmm_model + "/"
        # TODO: move all user-customizable files into the config subdirectory;
        #       the rule files are also duplicated in hmm_detection_dblookup
        with open(path.join(dir_path, "cluster_rules.txt"), "r") as handle:
            for line in handle:
                # skip the first line with the legend
                if first:
                    first = False
                    continue
                parts = line.split('\t')
                if len(parts) < 4:  # need cluster type, rules, cutoff and extension
                    continue
                key = prefix + parts.pop(0)
                if key not in enabled_clustertypes:
                    continue
                rules = parts.pop(0)
                cutoff = int(float(parts.pop(0)) * 1000.00 * cfg.cutoff_multiplier)
                extension = int(float(parts.pop(0)) * 1000.00 * cfg.cutoff_multiplier)
                rulesdict[key] = (rules, cutoff, extension)
    return rulesdict
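
To make the parsing concrete, here is a hedged sketch of the cluster_rules.txt layout this expects; the rule and the numbers below are illustrative, not taken from the real file.

# one legend line, then tab-separated columns of:
#   cluster type, detection rule, cutoff (kb), extension (kb)
example_line = "t1pks\tPKS_KS & PKS_AT\t20\t20"
# with cutoff_multiplier == 1.0, the resulting entry would be:
#   rulesdict["t1pks"] == ("PKS_KS & PKS_AT", 20000, 20000)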
Example No. 5
    def test_canonical_base_filename(self):
        options = build_parser(modules=self.all_modules).parse_args([])
        expected = os.path.join("out", "foo.1_example")
        res = main.canonical_base_filename("foo.1_example.gbk", "out", options)
        assert res == expected
        assert get_config().output_basename == os.path.basename(expected)

        res = main.canonical_base_filename(
            "/some/long/path/foo.1_example.gbff", "out", options)
        assert res == expected

        res = main.canonical_base_filename("foo.1_example.fa", "out", options)
        assert res == expected

        res = main.canonical_base_filename("foo.1_example.gbff.gz", "out",
                                           options)
        assert res == expected

        options = build_parser(modules=self.all_modules).parse_args(
            ["--output-basename", "foo.1"])
        expected = os.path.join("out", "foo.1")
        res = main.canonical_base_filename("foo.1_example.gbk", "out", options)
        assert res == expected

        res = main.canonical_base_filename("foo.1_example.gbff", "out",
                                           options)
        assert res == expected

        res = main.canonical_base_filename("foo.1_example.fa", "out", options)
        assert res == expected

        res = main.canonical_base_filename("foo.1_example.gbff.gz", "out",
                                           options)
        assert res == expected
Example No. 6
    def __init__(self, cluster_feature: secmet.Cluster, ranking: List[Tuple[ReferenceCluster, Score]],
                 reference_proteins: Dict[str, Protein], prefix: str) -> None:
        if ranking:
            assert reference_proteins
        self.prefix = prefix
        self.query_cluster = QueryCluster(cluster_feature)
        query_cluster_number = cluster_feature.get_cluster_number()
        cluster_limit = get_config().cb_nclusters
        self.colour_lookup = build_colour_groups(list(cluster_feature.cds_children), ranking[:cluster_limit])
        self.hits = []  # type: List[Cluster]
        record_prefix = cluster_feature.parent_record.id.split(".", 1)[0]
        num_added = 0
        queries = set()

        for cluster, score in ranking:
            if record_prefix == cluster.accession.split("_", 1)[0]:
                continue
            # determine overall strand direction of hits
            hit_genes = set()
            strand = determine_strand_of_cluster(cluster_feature, score.scored_pairings)
            for query, subject in score.scored_pairings:
                queries.add(query.id)
                hit_genes.add(subject.name)
            svg_cluster = Cluster.from_reference_cluster(cluster, query_cluster_number,
                                                         score, reference_proteins,
                                                         num_added + 1, len(hit_genes),
                                                         strand)
            self.hits.append(svg_cluster)
            num_added += 1
            # obey the cluster display limit from options
            if num_added >= cluster_limit:
                break

        self.max_length = self._size_of_largest_cluster()
        self._organise_strands()
Example No. 7
def prepare_output_directory(name: str, input_file: str) -> None:
    """ Ensure the ouptut directory exists and is usable

        Raises an exception if the directory is unusable,
        or if results not being reused and directory not empty

        Arguments:
            name: the path of the directory
            input_file: the path of the input file

        Returns:
            None
    """
    # if not supplied, set the output directory to be the sequence name
    input_prefix = os.path.basename(canonical_base_filename(input_file, "", get_config()))
    if not name:
        name = os.path.abspath(input_prefix)
        update_config({"output_dir": name})

    if os.path.exists(name):
        if not os.path.isdir(name):
            raise RuntimeError("Output directory %s exists and is not a directory" % name)
        # not empty (apart from a possible input dir), and not reusing its results
        if not input_file.endswith(".json") and \
                list(filter(_ignore_patterns, glob.glob(os.path.join(name, "*")))):
            raise RuntimeError("Output directory contains other files, aborting for safety")

        # --reuse
        logging.debug("Removing existing region genbank files")
        for genbank in glob.glob(os.path.join(name, "*.region???.gbk")):
            os.remove(genbank)
        logging.debug("Reusing output directory: %s", name)
    else:
        logging.debug("Creating output directory: %s", name)
        os.mkdir(name)
Example No. 8
def prepare_data(logging_only: bool = False) -> List[str]:
    """ Prepare the databases. """
    failure_messages = []
    # known
    failure_messages.extend(prepare_known_data(logging_only))

    # general
    clusterblastdir = os.path.join(get_config().database_dir, "clusterblast")
    if "mounted_at_runtime" in clusterblastdir:  # can't prepare these
        return failure_messages
    cluster_defs = os.path.join(clusterblastdir, 'clusters.txt')
    protein_seqs = os.path.join(clusterblastdir, "proteins.fasta")
    db_file = os.path.join(clusterblastdir, "proteins.dmnd")

    # check that the file contains DBv3 region info rather than single cluster numbers
    with open(protein_seqs) as handle:
        sample = handle.readline()
    if "-" not in sample.split("|", 3)[1]:
        failure_messages.append(
            "clusterblast database out of date, update with download-databases"
        )
        # and don't bother pressing them
        return failure_messages

    failure_messages.extend(
        check_clusterblast_files(cluster_defs,
                                 protein_seqs,
                                 db_file,
                                 logging_only=logging_only))

    return failure_messages
Example No. 9
    def setUp(self):
        options = build_config(
            ["--minimal", "--enable-tta", "--tta-threshold", "0"],
            isolated=True,
            modules=antismash.get_all_modules())
        self.old_config = get_config().__dict__
        self.options = update_config(options)
Example No. 10
def run_hmmpfam2(query_hmmfile: str,
                 target_sequence: str) -> List:  # TODO cleanup
    """ Run hmmpfam2 over the provided HMM file and fasta input

        Arguments:
            query_hmmfile: the HMM file to use
            target_sequence: a string in fasta format of the sequence to run

        Returns:
            a list of results as parsed by SearchIO
    """
    config = get_config()
    command = ["hmmpfam2", "--cpu", str(config.cpus), query_hmmfile, '-']

    # allow disabling multithreading for HMMer2 calls via the config  # TODO: fix options for this
    if config.get('hmmer2') and 'multithreading' in config.hmmer2 and \
            not config.hmmer2.multithreading:
        command = command[0:1] + command[3:]

    result = execute(command, stdin=target_sequence)
    if not result.successful():
        logging.debug('hmmpfam2 returned %d: %r while searching %r',
                      result.return_code, result.stderr, query_hmmfile)
        raise RuntimeError("hmmpfam2 problem while running %s", command)
    res_stream = StringIO(result.stdout)
    results = list(SearchIO.parse(res_stream, 'hmmer2-text'))
    return results
Example No. 11

def run_hmmpfam2(query_hmmfile: str, target_sequence: str, extra_args: Optional[List[str]] = None
                 ) -> List[SearchIO._model.query.QueryResult]:  # pylint: disable=protected-access
    """ Run hmmpfam2 over the provided HMM file and fasta input

        Arguments:
            query_hmmfile: the HMM file to use
            target_sequence: a string in fasta format of the sequence to run

        Returns:
            a list of results as parsed by SearchIO
    """
    config = get_config()
    command = ["hmmpfam2"]

    # only pass --cpu when the config explicitly enables multithreading for HMMer2  # TODO: fix options for this
    if config.get('hmmer2') and 'multithreading' in config.hmmer2 and \
            config.hmmer2.multithreading:
        command.extend(["--cpu", str(config.cpus)])
    if extra_args:
        command.extend(extra_args)
    command.extend([query_hmmfile, '-'])

    result = execute(command, stdin=target_sequence)
    if not result.successful():
        logging.debug('hmmpfam2 returned %d: %r while searching %r', result.return_code,
                      result.stderr, query_hmmfile)
        raise RuntimeError("hmmpfam2 problem while running %s: %s" % (command, result.stderr))
    res_stream = StringIO(result.stdout)
    return list(SearchIO.parse(res_stream, 'hmmer2-text'))
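
A usage sketch for the definition above; the HMM file name and sequence are hypothetical, and the hit attribute names are those exposed by Bio.SearchIO.

fasta = ">query\nMAGICHATMAGICHAT\n"
for query_result in run_hmmpfam2("smcogs.hmm", fasta):
    for hit in query_result.hits:
        print(hit.id, hit.evalue)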
Example No. 12

def run_blastp(target_blastp_database: str, query_sequence: str,
               opts: Optional[List[str]] = None, results_file: Optional[str] = None
               ) -> List[SearchIO._model.query.QueryResult]:
    """ Runs blastp over a single sequence against a database and returns the
        results as parsed by Bio.SearchIO.

        Arguments:
            target_blastp_database: the blastp database to compare to
            query_sequence: the sequence being compared
            opts: a list of extra arguments to pass to blastp, or None
            results_file: a path to keep a copy of blastp results in, if provided

        Returns:
            a list of QueryResults as parsed from blast output by SearchIO
    """
    if not query_sequence:
        raise ValueError("Cannot run blastp on empty sequence")

    config = get_config()
    command = ["blastp", "-num_threads", str(config.cpus), "-db", target_blastp_database]

    if opts is not None:
        command.extend(opts)

    result = execute(command, stdin=query_sequence)
    if not result.successful():
        raise RuntimeError('blastp returned %d: %r while scanning %r' % (
                           result.return_code, result.stderr.replace("\n", ""),
                           query_sequence[:100]))

    if results_file is not None:
        with open(results_file, 'w') as fh:
            fh.write(result.stdout)

    return list(SearchIO.parse(StringIO(result.stdout), 'blast-text'))
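
A hedged usage sketch: the database name and query sequence are hypothetical; "-evalue" is a standard blastp option.

results = run_blastp("clusterblast/proteins", ">query\nMKLVITAGPTREE\n",
                     opts=["-evalue", "1e-05"])
for query_result in results:
    print(query_result.id, len(query_result.hits))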
Example No. 13
def ensure_database_pressed(filepath: str, return_not_raise: bool = False) -> List[str]:
    """ Ensures that the given HMMer database exists and that the hmmpress
        generated files aren't out of date.

        Arguments:
            filepath: the path to the HMMer database
            return_not_raise: whether to catch errors and return their messages as strings

        Returns:
            a list of any error messages encountered; only ever populated when return_not_raise is True
    """
    components = ["{}{}".format(filepath, ext) for ext in ['.h3f', '.h3i', '.h3m', '.h3p']]

    if path.is_outdated(components, filepath):
        logging.info("%s components missing or obsolete, re-pressing database", filepath)
        if "hmmpress" not in get_config().executables:
            msg = "Failed to hmmpress {!r}: cannot find executable for hmmpress".format(filepath)
            if not return_not_raise:
                raise RuntimeError(msg)
            return [msg]

        result = subprocessing.run_hmmpress(filepath)
        if not result.successful():
            msg = "Failed to hmmpress {!r}: {}".format(filepath, result.stderr)
            if not return_not_raise:
                raise RuntimeError(msg)
            return [msg]
    return []
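
A usage sketch with a hypothetical database path, collecting problems as messages instead of raising.

errors = ensure_database_pressed("data/pfam/Pfam-A.hmm", return_not_raise=True)
for message in errors:
    print("database preparation failed:", message)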
Example No. 14
def ensure_cds_info(genefinding: Callable[[Record, Any], None],
                    sequence: Record) -> Record:
    """ Ensures the given record has CDS features with unique locus tags.
        CDS features are retrieved from GFF file or via genefinding, depending
        on antismash options.

        Records without CDS features will have their skip flag marked.

        Arguments:
            genefinding: the relevant run_on_record(record, options) function to
                         use for finding genes if no GFF file being used
            sequence: the Record instance to ensure CDS features for

        Returns:
            the Record instance provided
    """
    options = get_config()
    if sequence.skip:
        return sequence
    if not sequence.get_cds_features():
        if not options.genefinding_gff3 and options.genefinding_tool != "none":
            logging.info(
                "No CDS features found in record %r, running gene finding.",
                sequence.id)
            genefinding(sequence, options)
        if not sequence.get_cds_features():
            logging.info("No genes found, skipping record")
            sequence.skip = "No genes found"
            return sequence
    return sequence
Example No. 15
def check_prereqs() -> List[str]:
    "Check if all required applications are around"
    options = get_config()
    # tuples of (binary_name, optional)
    _required_binaries = [
        ('blastp', False),
        ('makeblastdb', False),
        ('diamond', False),
    ]

    _required_files = [
        ('geneclusterprots.dmnd', False),
        ('geneclusterprots.fasta', False),
        ('geneclusters.txt', False),
    ]

    clusterblastdir = os.path.join(options.database_dir, "clusterblast")

    failure_messages = []
    for binary_name, optional in _required_binaries:
        if path.locate_executable(binary_name) is None and not optional:
            failure_messages.append("Failed to locate file: %r" % binary_name)

    for file_name, optional in _required_files:
        if path.locate_file(os.path.join(clusterblastdir,
                                         file_name)) is None and not optional:
            failure_messages.append("Failed to locate file: %r" % file_name)

    failure_messages.extend(check_known_prereqs(options))
    failure_messages.extend(check_sub_prereqs(options))
    return failure_messages
Example No. 16
    def test_classification_with_colon(self):
        # since SMCOG id and description are stored in a string separated by :,
        # ensure that descriptions containing : are properly handled
        # test gene is AQF52_5530 from CP013129.1
        translation = (
            "MDTHQREEDPVAARRDRTHYLYLAVIGAVLLGIAVGFLAPGVAVELKPLGTGFVN"
            "LIKMMISPIIFCTIVLGVGSVRKAAKVGAVGGLALGYFLVMSTVALAIGLLVGNL"
            "LEPGSGLHLTKEIAEAGAKQAEGGGESTPDFLLGIIPTTFVSAFTEGEVLQTLLV"
            "ALLAGFALQAMGAAGEPVLRGIGHIQRLVFRILGMIMWVAPVGAFGAIAAVVGAT"
            "GAAALKSLAVIMIGFYLTCGLFVFVVLGAVLRLVAGINIWTLLRYLGREFLLILS"
            "TSSSESALPRLIAKMEHLGVSKPVVGITVPTGYSFNLDGTAIYLTMASLFVAEAM"
            "GDPLSIGEQISLLVFMIIASKGAAGVTGAGLATLAGGLQSHRPELVDGVGLIVGI"
            "DRFMSEARALTNFAGNAVATVLVGTWTKEIDKARVTEVLAGNIPFDEKTLVDDHA"
            "PVPVPDQRAEGGEEKARAGV")
        cds = helpers.DummyCDS(0, len(translation))
        cds.translation = translation
        results = smcogs.classify("test", [cds], get_config())
        assert results.best_hits[cds.get_name()].hit_id == \
            "SMCOG1212:sodium:dicarboxylate symporter"
        record = helpers.DummyRecord(seq=translation)
        record.add_cds_feature(cds)
        record.add_protocluster(helpers.DummyProtocluster(0, len(translation)))

        # if we don't handle multiple colons right, this line will crash
        results.add_to_record(record)
        gene_functions = cds.gene_functions.get_by_tool("smcogs")
        assert len(gene_functions) == 1
        assert str(gene_functions[0]).startswith(
            "transport (smcogs) SMCOG1212:sodium:dicarboxylate symporter"
            " (Score: 416; E-value: 2.3e-126)")
Example No. 17

def run_diamond(query_file: str, database_file: str, mode: str = "blastp",
                opts: Optional[List[str]] = None) -> str:
    """ Runs diamond, comparing the given query to the given database

        Arguments:
            query_file: the path of query sequence file
            database_file: the path of the database to compare to
            mode: the mode to use (defaults to blastp)
            opts: any extra options to pass to diamond

        Returns:
            the output from running diamond
    """
    with TemporaryDirectory() as temp_dir:
        command = [
            "diamond",
            mode,
            "--db", database_file,
            "--threads", str(get_config().cpus),
            "--query", query_file,
            "--tmpdir", temp_dir,
        ]
        if opts:
            command.extend(opts)
        result = execute(command)
        if not result.successful():
            raise RuntimeError("diamond failed to run: %s -> %s" % (command, result.stderr[-100:]))
    return result.stdout
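
A usage sketch under the same assumptions; the file names are hypothetical and both extra flags are standard diamond options.

output = run_diamond("query.fasta", "proteins.dmnd",
                     opts=["--outfmt", "6", "--max-target-seqs", "25"])
for line in output.splitlines():
    print(line.split("\t")[:2])  # query and subject ids in tabular output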
Example No. 18

def parallel_execute(commands: List[List[str]], cpus: Optional[int] = None,
                     timeout: Optional[int] = None, verbose: bool = True) -> List[int]:
    """ Runs the given commands in parallel child processes.

        Limited return values: only the return codes are collected.
    """
    if verbose:
        runner = verbose_child_process
    else:
        runner = child_process
    os.setpgid(0, 0)
    if not cpus:
        cpus = get_config().cpus
    assert isinstance(cpus, int)
    pool = multiprocessing.Pool(cpus)
    jobs = pool.map_async(runner, commands)

    try:
        errors = jobs.get(timeout=timeout)
    except multiprocessing.TimeoutError:
        pool.terminate()
        assert isinstance(timeout, int)
        raise RuntimeError("One of %d child processes timed out after %d seconds" % (
                cpus, timeout))

    except KeyboardInterrupt:
        logging.error("Interrupted by user")
        pool.terminate()
        raise

    pool.close()

    return errors
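
A hedged usage example, assuming hmmpress is installed; the file names are hypothetical.

commands = [["hmmpress", "-f", "first.hmm"], ["hmmpress", "-f", "second.hmm"]]
return_codes = parallel_execute(commands, cpus=2, timeout=600, verbose=False)
assert all(code == 0 for code in return_codes)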
Example No. 19
    def setUp(self):
        options = build_config(self.get_args(), isolated=True, modules=get_all_modules())
        self.old_config = get_config().__dict__
        self.options = update_config(options)

        assert clusterblast.check_prereqs(self.options) == []
        assert clusterblast.check_options(self.options) == []
        assert clusterblast.is_enabled(self.options)
Example No. 20
    def setUp(self):
        self.format0_file = path.get_full_path(__file__, "data", "format0.dmnd")
        self.format1_file = path.get_full_path(__file__, "data", "format1.dmnd")
        self.empty = path.get_full_path(__file__, "data", "empty.dmnd")

        options = build_config([], isolated=True, modules=get_all_modules())
        self.old_config = get_config().__dict__
        self.options = update_config(options)
Example No. 21
    def description_text(self) -> str:
        """ returns the Region description """
        description_text = 'Location: {:,d} - {:,d} nt. (total: {:,d} nt) '.format(
            self.location.start + 1, self.location.end, len(self.location))
        if get_config().cf_create_clusters and self.probabilities:
            description_text += 'ClusterFinder probabilities: %s. ' % self.probabilities

        return description_text
Example No. 22
    def description_text(self) -> str:
        """ returns the Region description """
        description_text = 'Location: %s - %s nt. ' % (self.location.start + 1,
                                                       self.location.end)
        if get_config().cf_create_clusters and self.probabilities:
            description_text += 'ClusterFinder probabilities: %s. ' % self.probabilities

        return description_text
Example No. 23
    def test_namespace_initialisation(self):
        # test initialisation from namespace
        namespace = Namespace()
        namespace.taxon = 'fungi'
        config = update_config(namespace)
        assert config.taxon == 'fungi'
        # a new constructor should keep the value
        assert get_config().taxon == 'fungi'
Example No. 24
def load_clusterblast_database(seq_record, searchtype="general"):
    options = config.get_config()
    accessiondict = {}
    for cds in utils.get_cds_features(seq_record):
        accessiondict[utils.get_gene_acc(cds)] = utils.get_gene_accession(cds)
    clusters = load_geneclusters(searchtype)
    proteinlocations, proteinstrands, proteinannotations, proteintags = load_geneclusterproteins(accessiondict, searchtype)
    return clusters, proteinlocations, proteinstrands, proteinannotations, proteintags
Example No. 25
def _ignore_patterns(entry: str) -> bool:
    """File name patterns that we want to ignore for the "outdir is empty" check."""
    config = get_config()
    if entry.endswith('/input') and os.path.isdir(entry):
        return False
    if os.path.abspath(entry) == os.path.abspath(config.logfile):
        return False

    return True
Example No. 26
    @staticmethod
    def from_json(json: Dict[str, Any], record: Record) -> Optional["TTAResults"]:
        """ Constructs a new TTAResults instance from a json format and the
            original record analysed.
        """
        if json["schema_version"] != TTAResults.schema_version:
            return None

        options = get_config()
        results = TTAResults(json["record_id"], json["gc_content"], options.tta_threshold)
        # if old results were excluding based on too low a GC content, rerun
        if json["threshold"] > results.gc_content and options.tta_threshold <= results.gc_content:
            return None
        # otherwise, if the threshold is now too high, skip all the codons
        if json["gc_content"] >= get_config().tta_threshold:
            for info in json["TTA codons"]:
                start = info["start"]
                strand = info["strand"]
                results.new_feature_from_basics(start, strand)
        return results
Example No. 27
    def _size_of_largest_cluster(self) -> int:
        query_length = len(self.query_cluster)
        length = query_length
        for cluster in self.hits:
            if len(cluster) > length:
                length = len(cluster)
        min_scale = get_config().cb_min_homology_scale
        # if this would shrink the query too much, use the minimum allowed
        if query_length / length < min_scale:
            length = int(query_length / min_scale)
        return length
Example No. 28
    def setUp(self):
        options = build_config(self.get_args(),
                               isolated=True,
                               modules=antismash.get_all_modules())
        self.old_config = get_config().__dict__
        self.options = update_config(options)

        self.record = self.build_record(
            helpers.get_path_to_nisin_with_detection())

        prepare_data()
Example No. 29
    def test_from_json_higher_bitscore(self):
        json = self.create_results().to_json()
        assert get_config().rre_cutoff == 25.
        new = 35.
        assert self.hits[0].score > new
        assert self.hits[1].score < new
        update_config({"rre_cutoff": new})
        result = RREFinderResults.from_json(json, self.record)
        assert len(result.hits_by_cds) == 1
        assert result.hits_by_cds[self.hits[0].locus_tag] == [self.hits[0]]
        assert len(result.hits_by_protocluster) == 1
        assert result.hits_by_protocluster[1] == [self.hits[0].locus_tag]
Example No. 30
    def test_from_json_higher_min_length(self):
        json = self.create_results().to_json()
        assert get_config().rre_min_length == 50
        new = 80
        assert len(self.hits[0]) < new
        assert len(self.hits[1]) > new
        update_config({"rre_min_length": new})
        results = RREFinderResults.from_json(json, self.record)
        assert len(results.hits_by_cds) == 1
        assert results.hits_by_cds[self.hits[1].locus_tag] == [self.hits[1]]
        assert len(results.hits_by_protocluster) == 1
        assert results.hits_by_protocluster[2] == [self.hits[1].locus_tag]