예제 #1
0
 def test_find_attc_no_infernal(self):
     # Pass a bogus cmsearch executable name ('foo') and expect find_attc to
     # raise a RuntimeError whose message carries the "No such file" error.
     cmsearch_bin = 'foo'
     replicon_name = 'acba.007.p01.13'
     replicon_path = os.path.join(self._data_dir, 'Replicons', replicon_name + '.fst')
     with self.assertRaises(RuntimeError) as ctx:
         infernal.find_attc(replicon_path, self.replicon_name, cmsearch_bin, self.tmp_dir, self.model_attc)
     # Raw string: "\[" and "\]" are invalid escape sequences in a regular
     # string literal; the regex itself is unchanged.
     self.assertTrue(re.search(r"failed : \[Errno 2\] No such file or directory: 'foo'", str(ctx.exception)),
                     msg=str(ctx.exception))
예제 #2
0
def find_integron_in_one_replicon(replicon, config):
    """
    Scan one replicon for integrons and write the result files.

    The search looks for:

      * presence of integrase (skipped when ``config.no_proteins`` is set)
      * presence of attC sites
      * presence of promoters and attI sites (when ``config.promoter_attI`` is set)

    depending on the configuration

      * perform functional annotation

    produce a file containing presence of putative integrons

    depending on configuration

      * produce genbank file with replicon and annotations with integrons
      * produce schema of replicon with integrons (in pdf)

    Intermediate files are written in ``config.tmp_dir(replicon.id)``; this
    directory is removed at the end unless ``config.keep_tmp`` is set.

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons)
              and the summary file (<replicon_id>.summary).
              If there is no integron the summary file is None.
              If the replicon is skipped because of an
              :class:`integron_finder.EmptyFileError`, both values are
              empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: if functional annotation is requested but neither
                           'bank_hmm' nor ``config.func_annot_path`` exists.
    :raises OSError: if the temporary result directory cannot be created.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)
    try:
        os.mkdir(result_tmp_dir)
    except OSError:
        # An already existing directory is fine (intermediate results found
        # in it are reused below). Any other failure would only resurface,
        # less clearly, on the first write into the directory, so report it now.
        if not os.path.isdir(result_tmp_dir):
            raise
    tmp_replicon_path = os.path.join(result_tmp_dir, replicon.id + '.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provided on the command line
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        # a 'bank_hmm' directory in the current working directory takes
        # precedence over the canonical location
        if os.path.exists('bank_hmm'):
            fa_hmm = scan_hmm_bank('bank_hmm')
        elif os.path.exists(config.func_annot_path):
            fa_hmm = scan_hmm_bank(config.func_annot_path)
        else:
            raise IntegronError("the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                                "profile with --path-func-annot option".format(config.func_annot_path))
        is_func_annot = True

    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
        is_func_annot = True
    else:
        # fa_hmm is deliberately left unbound here: every later use is
        # guarded by is_func_annot
        is_func_annot = False

    if is_func_annot and not fa_hmm:
        _log.warning("No hmm profiles for functional annotation detected, skip functional annotation step.")

    if config.gembase_path:
        protein_db = GembaseDB(replicon, config, gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = os.path.join(result_tmp_dir, replicon.id + "_intI.res")
    phageI_file = os.path.join(result_tmp_dir, replicon.id + "_phage_int.res")
    attC_default_file = os.path.join(result_tmp_dir, replicon.id + "_attc_table.res")

    try:
        if not config.no_proteins:
            # search integrases only if a result file is missing
            if not os.path.isfile(intI_file) or not os.path.isfile(phageI_file):
                find_integrase(replicon.id, protein_db.protfile, result_tmp_dir, config)
        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            find_attc(tmp_replicon_path, replicon.name, config.cmsearch, result_tmp_dir, config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)

        _log.info("Default search done... : ")
        integrons = find_integron(replicon, protein_db, attC_default_file, intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            integron_max_pickle = os.path.join(result_tmp_dir, "integron_max.pickle")
            if not os.path.isfile(integron_max_pickle):
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(integrons, replicon, config.distance_threshold,
                                             config.model_attc_path,
                                             max_attc_size=config.max_attc_size,
                                             min_attc_size=config.min_attc_size,
                                             circular=circular, out_dir=result_tmp_dir,
                                             cpu=config.cpu,
                                             evalue_attc=config.evalue_attc)
                integron_max.to_pickle(integron_max_pickle)
                _log.info("Search with local_max done... :")

            else:
                # NOTE(review): unpickling runs arbitrary code if the file has
                # been tampered with; only reuse a temporary directory you trust.
                integron_max = pd.read_pickle(integron_max_pickle)
                # re-apply the current evalue / size thresholds to the cached hits
                attc_size = abs(integron_max.pos_end - integron_max.pos_beg)
                integron_max = integron_max[(integron_max.evalue < config.evalue_attc) &
                                            (attc_size < config.max_attc_size) &
                                            (config.min_attc_size < attc_size)]
                _log.info("Search with local_max was already done, continue... :")

            integrons = find_integron(replicon, protein_db, integron_max, intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            if integron_type != "In0":  # complete & CALIN
                if not config.no_proteins:
                    _log.info("Adding proteins ... :")
                    integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()
        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config, result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # only complete integrons are drawn; j numbers all integrons from 1
            for j, integron in enumerate(integrons, 1):
                if integron.type() == "complete":
                    integron.draw_integron(file=os.path.join(config.result_dir, "{}_{}.pdf".format(replicon.id, j)))

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if integrons:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file, sep="\t", index=False, na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary.to_csv(summary_file, sep="\t", na_rep="NA", index=False,
                           columns=['ID_replicon', 'ID_integron', 'complete', 'In0', 'CALIN'])
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db, config.distance_threshold)
                SeqIO.write(replicon, os.path.join(config.result_dir, replicon.id + ".gbk"), "genbank")
        else:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
    except integron_finder.EmptyFileError:
        # the replicon is skipped: report empty paths instead of failing
        _log.warning('############ Skip replicon {} ############'.format(replicon.name))
        integron_file = ''
        summary_file = ''
    #########################
    # clean temporary files #
    #########################

    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # cleanup is best effort: a failure is logged, not raised
            _log.warning("Cannot remove temporary results : '{} : {}'".format(result_tmp_dir, str(err)))

    return integron_file, summary_file
예제 #3
0
def find_integron_in_one_replicon(replicon, config):
    """
    Scan one replicon for integrons and write the result files.

    The search looks for:

      * presence of integrase (skipped when ``config.no_proteins`` is set)
      * presence of attC sites
      * presence of promoters and attI sites (when ``config.promoter_attI`` is set)

    depending on the configuration

      * perform functional annotation

    produce a file containing presence of putative integrons

    depending on configuration

      * produce genbank file with replicon and annotations with integrons
      * produce schema of replicon with integrons (in pdf)

    Intermediate files are written in ``config.tmp_dir(replicon.id)``; this
    directory is removed at the end unless ``config.keep_tmp`` is set.

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons)
              and the summary file (<replicon_id>.summary).
              If there is no integron the summary file is None.
              If the replicon is skipped because of an
              :class:`integron_finder.EmptyFileError`, both values are
              empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: if functional annotation is requested but neither
                           'bank_hmm' nor ``config.func_annot_path`` exists.
    :raises OSError: if the temporary result directory cannot be created.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)
    try:
        os.mkdir(result_tmp_dir)
    except OSError:
        # An already existing directory is fine (intermediate results found
        # in it are reused below). Any other failure would only resurface,
        # less clearly, on the first write into the directory, so report it now.
        if not os.path.isdir(result_tmp_dir):
            raise
    tmp_replicon_path = os.path.join(result_tmp_dir, replicon.id + '.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provided on the command line
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        # a 'bank_hmm' directory in the current working directory takes
        # precedence over the canonical location
        if os.path.exists('bank_hmm'):
            fa_hmm = scan_hmm_bank('bank_hmm')
        elif os.path.exists(config.func_annot_path):
            fa_hmm = scan_hmm_bank(config.func_annot_path)
        else:
            raise IntegronError(
                "the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                "profile with --path-func-annot option".format(
                    config.func_annot_path))
        is_func_annot = True

    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
        is_func_annot = True
    else:
        # fa_hmm is deliberately left unbound here: every later use is
        # guarded by is_func_annot
        is_func_annot = False

    if is_func_annot and not fa_hmm:
        _log.warning(
            "No hmm profiles for functional annotation detected, skip functional annotation step."
        )

    if config.gembase_path:
        protein_db = GembaseDB(replicon,
                               config,
                               gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = os.path.join(result_tmp_dir, replicon.id + "_intI.res")
    phageI_file = os.path.join(result_tmp_dir, replicon.id + "_phage_int.res")
    attC_default_file = os.path.join(result_tmp_dir,
                                     replicon.id + "_attc_table.res")

    try:
        if not config.no_proteins:
            # search integrases only if a result file is missing
            if not os.path.isfile(intI_file) or not os.path.isfile(
                    phageI_file):
                find_integrase(replicon.id, protein_db.protfile,
                               result_tmp_dir, config)
        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            find_attc(tmp_replicon_path,
                      replicon.name,
                      config.cmsearch,
                      result_tmp_dir,
                      config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)

        _log.info("Default search done... : ")
        integrons = find_integron(replicon, protein_db, attC_default_file,
                                  intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            integron_max_pickle = os.path.join(result_tmp_dir,
                                               "integron_max.pickle")
            if not os.path.isfile(integron_max_pickle):
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(
                    integrons,
                    replicon,
                    config.distance_threshold,
                    config.model_attc_path,
                    max_attc_size=config.max_attc_size,
                    min_attc_size=config.min_attc_size,
                    circular=circular,
                    out_dir=result_tmp_dir,
                    cpu=config.cpu,
                    evalue_attc=config.evalue_attc)
                integron_max.to_pickle(integron_max_pickle)
                _log.info("Search with local_max done... :")

            else:
                # NOTE(review): unpickling runs arbitrary code if the file has
                # been tampered with; only reuse a temporary directory you trust.
                integron_max = pd.read_pickle(integron_max_pickle)
                # re-apply the current evalue / size thresholds to the cached hits
                attc_size = abs(integron_max.pos_end - integron_max.pos_beg)
                integron_max = integron_max[
                    (integron_max.evalue < config.evalue_attc)
                    & (attc_size < config.max_attc_size)
                    & (config.min_attc_size < attc_size)]
                _log.info(
                    "Search with local_max was already done, continue... :")

            integrons = find_integron(replicon, protein_db, integron_max,
                                      intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            if integron_type != "In0":  # complete & CALIN
                if not config.no_proteins:
                    _log.info("Adding proteins ... :")
                    integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()
        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config,
                       result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # only complete integrons are drawn; j numbers all integrons from 1
            for j, integron in enumerate(integrons, 1):
                if integron.type() == "complete":
                    integron.draw_integron(file=os.path.join(
                        config.result_dir, "{}_{}.pdf".format(replicon.id, j)))

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if integrons:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file,
                                    sep="\t",
                                    index=False,
                                    na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary.to_csv(summary_file,
                           sep="\t",
                           na_rep="NA",
                           index=False,
                           columns=[
                               'ID_replicon', 'ID_integron', 'complete', 'In0',
                               'CALIN'
                           ])
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db,
                            config.distance_threshold)
                SeqIO.write(
                    replicon,
                    os.path.join(config.result_dir, replicon.id + ".gbk"),
                    "genbank")
        else:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
    except integron_finder.EmptyFileError:
        # the replicon is skipped: report empty paths instead of failing
        _log.warning('############ Skip replicon {} ############'.format(
            replicon.name))
        integron_file = ''
        summary_file = ''
    #########################
    # clean temporary files #
    #########################

    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # cleanup is best effort: a failure is logged, not raised
            _log.warning("Cannot remove temporary results : '{} : {}'".format(
                result_tmp_dir, str(err)))

    return integron_file, summary_file
예제 #4
0
 def test_find_attc_no_model(self):
     # Pass a bogus attC model ('foo') and expect find_attc to raise a
     # RuntimeError whose message ends with 'failed returncode = 1'.
     bogus_model = 'foo'
     with self.assertRaises(RuntimeError) as raised:
         infernal.find_attc(self.replicon_path,
                            self.replicon_name,
                            self.cmsearch_path,
                            self.tmp_dir,
                            bogus_model)
     error_text = str(raised.exception)
     self.assertTrue(error_text.endswith('failed returncode = 1'))
예제 #5
0
 def test_find_attc(self):
     # Run find_attc, then check that both result files
     # (<replicon_name>_attc.res and <replicon_name>_attc_table.res)
     # exist in the temporary directory.
     infernal.find_attc(self.replicon_path,
                        self.replicon_name,
                        self.cmsearch_path,
                        self.tmp_dir,
                        self.model_attc)
     expected_outputs = [os.path.join(self.tmp_dir, self.replicon_name + ext)
                         for ext in ('_attc.res', '_attc_table.res')]
     for out_path in expected_outputs:
         self.assertTrue(os.path.exists(out_path))