def test_integron_1elem_int(self):
        """
        Check add_feature on a report made of a single integrase row.

        Two features are expected on the record afterwards (the integron and
        its integrase); the record sequence and id must be left unchanged.
        """
        row = {
            "ID_replicon": self.replicon_id,
            "ID_integron": "integron_01",
            "element": "ACBA.007.P01_13_1",
            "pos_beg": 55,
            "pos_end": 1014,
            "strand": 1,
            "evalue": 1.9e-25,
            "type_elt": "protein",
            "annotation": "intI",
            "model": "intersection_tyr_intI",
            "type": "complete",
            "default": "Yes",
            "distance_2attC": np.nan,
        }
        report = pd.DataFrame(row, index=[0])

        # Snapshot what add_feature is not allowed to touch
        seq_before = self.seq.seq
        id_before = self.seq.id

        add_feature(self.seq, report, self.prot_db, self.dist_threshold)

        # Expected translation: protein ACBA.007.P01_13_1 in
        # tests/data/Results_Integron_Finder_acba.007.p01.13/acba.007.p01.13.prt
        expected_translation = (
            "MKTATAPLPPLRSVKVLDQLRERIRYLHYSLRTEQAYVNWVRAFIRFHGVRHPATLGSSE"
            "VEAFLSWLANERKVSVSTHRQALAALLFFYGKVLCTDLPWLQEIGRPRPSRRLPVVLTPD"
            "EVVRILGFLEGEHRLFAQLLYGTGMRISEGLQLRVKDLDFDHGTIIVREGKGSKDRALML"
            "PESLAPSLREQLSRARAWWLKDQAEGRSGVALPDALERKYPRAGHSWPWFWVFAQHTHST"
            "DPRSGVVRRHHMYDQTFQRAFKRAVEGTVAKLAMRQPFVLFKGLTFQKLCLPGAFRPGDH"
            "HNKMLRPGLCVVHASPQYL*"
        )

        features = self.seq.features
        # Exactly 2 features are expected (integron and protein)
        self.assertEqual(len(features), 2)
        # The initial sequence and id must not be modified
        self.assertEqual(self.seq.seq, seq_before)
        self.assertEqual(self.seq.id, id_before)

        # First feature: the integron (its start is expected at pos_beg - 1)
        integron = features[0]
        self.assertEqual(integron.location.start, row["pos_beg"] - 1)
        self.assertEqual(integron.location.end, row["pos_end"])
        self.assertEqual(integron.strand, 0)
        self.assertEqual(integron.type, "integron")
        self.assertEqual(integron.qualifiers["integron_id"], row["ID_integron"])
        self.assertEqual(integron.qualifiers["integron_type"], row["type"])

        # Second feature: the protein, expected to be typed as an integrase
        integrase = features[1]
        self.assertEqual(integrase.location.start, row["pos_beg"] - 1)
        self.assertEqual(integrase.location.end, row["pos_end"])
        self.assertEqual(integrase.strand, row["strand"])
        self.assertEqual(integrase.type, "integrase")
        self.assertEqual(integrase.qualifiers["protein_id"], row["element"])
        self.assertEqual(integrase.qualifiers["gene"], row["annotation"])
        self.assertEqual(integrase.qualifiers["model"], row["model"])
        self.assertEqual(str(integrase.qualifiers["translation"]), expected_translation)
예제 #2
0
    def test_integron_1elem_prom(self):
        """
        Check add_feature on a report made of a single promoter row.

        Two features are expected on the record afterwards (the integron and
        its promoter); the record sequence and id must be left unchanged.
        """
        row = {"ID_replicon": self.replicon_id,
               "ID_integron": "integron_01",
               "element": "Pc_int1",
               "pos_beg": 25,
               "pos_end": 51,
               "strand": -1,
               "evalue": np.nan,
               "type_elt": "Promoter",
               "annotation": "Pc_1",
               "model": "NA",
               "type": "complete",
               "default": "Yes",
               "distance_2attC": np.nan}
        report = pd.DataFrame(row, index=[0])

        # Snapshot what add_feature is not allowed to touch
        seq_before = self.seq.seq
        id_before = self.seq.id

        # NOTE(review): the other tests in this file pass self.prot_db as third
        # argument; confirm that self.prot_file is intended here.
        add_feature(self.seq, report, self.prot_file, self.dist_threshold)

        features = self.seq.features
        # Exactly 2 features are expected (integron and promoter)
        self.assertEqual(len(features), 2)
        # The initial sequence and id must not be modified
        self.assertEqual(self.seq.seq, seq_before)
        self.assertEqual(self.seq.id, id_before)

        # First feature: the integron (its start is expected at pos_beg - 1)
        integron = features[0]
        self.assertEqual(integron.location.start, row["pos_beg"] - 1)
        self.assertEqual(integron.location.end, row["pos_end"])
        self.assertEqual(integron.strand, 0)
        self.assertEqual(integron.type, "integron")
        self.assertEqual(integron.qualifiers["integron_id"], row["ID_integron"])
        self.assertEqual(integron.qualifiers["integron_type"], row["type"])

        # Second feature: the promoter
        promoter = features[1]
        self.assertEqual(promoter.location.start, row["pos_beg"] - 1)
        self.assertEqual(promoter.location.end, row["pos_end"])
        self.assertEqual(promoter.strand, row["strand"])
        self.assertEqual(promoter.type, row["type_elt"])
        self.assertEqual(promoter.qualifiers["Promoter"], row["element"])
        self.assertEqual(promoter.qualifiers["model"], row["model"])
    def test_integron_1elem_prot(self):
        """
        Check add_feature on a report made of a single (non integrase) protein row.

        Two features are expected on the record afterwards (the integron and
        a CDS); the record sequence and id must be left unchanged.
        """
        row = {
            "ID_replicon": self.replicon_id,
            "ID_integron": "integron_01",
            "element": "ACBA.007.P01_13_20",
            "pos_beg": 17375,
            "pos_end": 17375,
            "strand": -1,
            "evalue": np.nan,
            "type_elt": "protein",
            "annotation": "protein",
            "model": "NA",
            "type": "complete",
            "default": "Yes",
            "distance_2attC": np.nan,
        }
        report = pd.DataFrame(row, index=[0])

        # Snapshot what add_feature is not allowed to touch
        seq_before = self.seq.seq
        id_before = self.seq.id

        add_feature(self.seq, report, self.prot_db, self.dist_threshold)

        # Expected translation: protein ACBA.007.P01_13_20 in
        # tests/data/Results_Integron_Finder_acba.007.p01.13/acba.007.p01.13.prt
        expected_translation = (
            "MKGWLFLVIAIVGEVIATSALKSSEGFTKLAPSAVVIIGYGIAFYFLSLVLKSIPVGVAY"
            "AVWSGLGVVIITAIAWLLHGQKLDAWGFVGMGLIIAAFLLARSPSWKSLRRPTPW*"
        )

        features = self.seq.features
        # Exactly 2 features are expected (integron and protein)
        self.assertEqual(len(features), 2)
        # The initial sequence and id must not be modified
        self.assertEqual(self.seq.seq, seq_before)
        self.assertEqual(self.seq.id, id_before)

        # First feature: the integron (its start is expected at pos_beg - 1)
        integron = features[0]
        self.assertEqual(integron.location.start, row["pos_beg"] - 1)
        self.assertEqual(integron.location.end, row["pos_end"])
        self.assertEqual(integron.strand, 0)
        self.assertEqual(integron.type, "integron")
        self.assertEqual(integron.qualifiers["integron_id"], row["ID_integron"])
        self.assertEqual(integron.qualifiers["integron_type"], row["type"])

        # Second feature: the protein, expected to be typed as a CDS
        cds = features[1]
        self.assertEqual(cds.location.start, row["pos_beg"] - 1)
        self.assertEqual(cds.location.end, row["pos_end"])
        self.assertEqual(cds.strand, row["strand"])
        self.assertEqual(cds.type, "CDS")
        self.assertEqual(cds.qualifiers["protein_id"], row["element"])
        self.assertEqual(cds.qualifiers["gene"], row["annotation"])
        self.assertEqual(cds.qualifiers["model"], row["model"])
        self.assertEqual(str(cds.qualifiers["translation"]), expected_translation)
    def test_integron_1elem_prom(self):
        """
        Check add_feature on a report made of a single promoter row.

        Two features are expected on the record afterwards (the integron and
        its promoter); the record sequence and id must be left unchanged.
        """
        row = {
            "ID_replicon": self.replicon_id,
            "ID_integron": "integron_01",
            "element": "Pc_int1",
            "pos_beg": 25,
            "pos_end": 51,
            "strand": -1,
            "evalue": np.nan,
            "type_elt": "Promoter",
            "annotation": "Pc_1",
            "model": "NA",
            "type": "complete",
            "default": "Yes",
            "distance_2attC": np.nan,
        }
        report = pd.DataFrame(row, index=[0])

        # Snapshot what add_feature is not allowed to touch
        seq_before = self.seq.seq
        id_before = self.seq.id

        # NOTE(review): the other tests in this file pass self.prot_db as third
        # argument; confirm that self.prot_file is intended here.
        add_feature(self.seq, report, self.prot_file, self.dist_threshold)

        features = self.seq.features
        # Exactly 2 features are expected (integron and promoter)
        self.assertEqual(len(features), 2)
        # The initial sequence and id must not be modified
        self.assertEqual(self.seq.seq, seq_before)
        self.assertEqual(self.seq.id, id_before)

        # First feature: the integron (its start is expected at pos_beg - 1)
        integron = features[0]
        self.assertEqual(integron.location.start, row["pos_beg"] - 1)
        self.assertEqual(integron.location.end, row["pos_end"])
        self.assertEqual(integron.strand, 0)
        self.assertEqual(integron.type, "integron")
        self.assertEqual(integron.qualifiers["integron_id"], row["ID_integron"])
        self.assertEqual(integron.qualifiers["integron_type"], row["type"])

        # Second feature: the promoter
        promoter = features[1]
        self.assertEqual(promoter.location.start, row["pos_beg"] - 1)
        self.assertEqual(promoter.location.end, row["pos_end"])
        self.assertEqual(promoter.strand, row["strand"])
        self.assertEqual(promoter.type, row["type_elt"])
        self.assertEqual(promoter.qualifiers["Promoter"], row["element"])
        self.assertEqual(promoter.qualifiers["model"], row["model"])
예제 #5
0
    def test_integron_long_seqname(self):
        """
        Check that add_feature shortens the record name when it is lengthened.

        The record name is prefixed with "abcdefgh" before the call; afterwards
        only the last character of that prefix ("h") is expected to remain in
        front of the original name. The report holds a single protein row.
        """
        row = {"ID_replicon": self.replicon_id,
               "ID_integron": "integron_01",
               "element": "ACBA.007.P01_13_20",
               "pos_beg": 17375,
               "pos_end": 17375,
               "strand": -1,
               "evalue": np.nan,
               "type_elt": "protein",
               "annotation": "protein",
               "model": "NA",
               "type": "complete",
               "default": "Yes",
               "distance_2attC": np.nan}
        report = pd.DataFrame(row, index=[0])

        # Snapshot what add_feature is not allowed to touch
        seq_before = self.seq.seq
        id_before = self.seq.id
        # Lengthen the record name by 8 characters
        original_name = self.seq.name
        self.seq.name = "abcdefgh" + original_name

        add_feature(self.seq, report, self.prot_db, self.dist_threshold)

        # Exactly 2 features are expected (integron and protein)
        self.assertEqual(len(self.seq.features), 2)
        # The initial sequence and id must not be modified
        self.assertEqual(self.seq.seq, seq_before)
        self.assertEqual(self.seq.id, id_before)
        # The sequence name must have been shortened
        self.assertEqual(self.seq.name, "h" + original_name)
    def test_integron_long_seqname(self):
        """
        Check that add_feature shortens the record name when it is lengthened.

        The record name is prefixed with "abcdefgh" before the call; afterwards
        only the last character of that prefix ("h") is expected to remain in
        front of the original name. The report holds a single protein row.
        """
        row = {
            "ID_replicon": self.replicon_id,
            "ID_integron": "integron_01",
            "element": "ACBA.007.P01_13_20",
            "pos_beg": 17375,
            "pos_end": 17375,
            "strand": -1,
            "evalue": np.nan,
            "type_elt": "protein",
            "annotation": "protein",
            "model": "NA",
            "type": "complete",
            "default": "Yes",
            "distance_2attC": np.nan,
        }
        report = pd.DataFrame(row, index=[0])

        # Snapshot what add_feature is not allowed to touch
        seq_before = self.seq.seq
        id_before = self.seq.id
        # Lengthen the record name by 8 characters
        original_name = self.seq.name
        self.seq.name = "abcdefgh" + original_name

        add_feature(self.seq, report, self.prot_db, self.dist_threshold)

        # Exactly 2 features are expected (integron and protein)
        self.assertEqual(len(self.seq.features), 2)
        # The initial sequence and id must not be modified
        self.assertEqual(self.seq.seq, seq_before)
        self.assertEqual(self.seq.id, id_before)
        # The sequence name must have been shortened
        self.assertEqual(self.seq.name, "h" + original_name)
예제 #7
0
def find_integron_in_one_replicon(replicon, config):
    """
    scan replicon for integron.

      * presence of integrase
      * presence of attC sites
      * presence of promoters and attI sites

    depending on the configuration

     * perform functional annotation

    produce a file containing presence of putative integrons

    depending on configuration

        * produce genbank file with replicon and annotations with integrons
        * produce schema of replicon with integrons (in pdf)

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons)
              and the summary file (<replicon_id>.summary).
              If there is no integron the summary file is None.
              If the replicon is skipped because an
              :class:`integron_finder.EmptyFileError` was raised,
              both values are empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: if functional annotation is requested but neither 'bank_hmm'
                           nor ``config.func_annot_path`` exists.
    :raises OSError: if the temporary result directory cannot be created and does not already exist.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)
    try:
        os.mkdir(result_tmp_dir)
    except OSError:
        # An already existing directory is fine (intermediate results found in it are reused below);
        # any other failure is reported right away instead of being silently ignored.
        if not os.path.isdir(result_tmp_dir):
            raise
    tmp_replicon_path = os.path.join(result_tmp_dir, replicon.id + '.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provide on the command line
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        if os.path.exists('bank_hmm'):
            fa_hmm = scan_hmm_bank('bank_hmm')
        elif os.path.exists(config.func_annot_path):
            fa_hmm = scan_hmm_bank(config.func_annot_path)
        else:
            raise IntegronError("the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                                "profile with --path-func-annot option".format(config.func_annot_path))
        is_func_annot = True

    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
        is_func_annot = True
    else:
        # fa_hmm is deliberately left undefined here: it is only read when is_func_annot is True
        is_func_annot = False

    if is_func_annot and not fa_hmm:
        _log.warning("No hmm profiles for functional annotation detected, skip functional annotation step.")

    if config.gembase_path:
        protein_db = GembaseDB(replicon, config, gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = os.path.join(result_tmp_dir, replicon.id + "_intI.res")
    phageI_file = os.path.join(result_tmp_dir, replicon.id + "_phage_int.res")
    attC_default_file = os.path.join(result_tmp_dir, replicon.id + "_attc_table.res")

    try:
        if not config.no_proteins:
            # skip the integrase search when both result files are already present
            if not os.path.isfile(intI_file) or not os.path.isfile(phageI_file):
                find_integrase(replicon.id, protein_db.protfile, result_tmp_dir, config)
        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            find_attc(tmp_replicon_path, replicon.name, config.cmsearch, result_tmp_dir, config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)

        _log.info("Default search done... : ")
        integrons = find_integron(replicon, protein_db, attC_default_file, intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            integron_max_pickle = os.path.join(result_tmp_dir, "integron_max.pickle")
            if not os.path.isfile(integron_max_pickle):
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(integrons, replicon, config.distance_threshold,
                                             config.model_attc_path,
                                             max_attc_size=config.max_attc_size,
                                             min_attc_size=config.min_attc_size,
                                             circular=circular, out_dir=result_tmp_dir,
                                             cpu=config.cpu,
                                             evalue_attc=config.evalue_attc)
                integron_max.to_pickle(integron_max_pickle)
                _log.info("Search with local_max done... :")

            else:
                # NOTE(review): unpickling executes arbitrary code; this file is expected to be the cache
                # written by the branch above, so the temporary directory must not hold untrusted files.
                integron_max = pd.read_pickle(integron_max_pickle)
                # keep only the cached hits matching the current evalue and attC size thresholds
                integron_max = integron_max[(integron_max.evalue < config.evalue_attc) &
                                            (abs(integron_max.pos_end - integron_max.pos_beg) < config.max_attc_size) &
                                            (config.min_attc_size < abs(integron_max.pos_end - integron_max.pos_beg))]
                _log.info("Search with local_max was already done, continue... :")

            integrons = find_integron(replicon, protein_db, integron_max, intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            # proteins are added to every integron but In0 (complete & CALIN)
            if integron_type != "In0" and not config.no_proteins:
                _log.info("Adding proteins ... :")
                integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()
        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config, result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # one pdf per complete integron, numbered by its rank (from 1) among all integrons
            for j, integron in enumerate(integrons, 1):
                if integron.type() == "complete":
                    integron.draw_integron(file=os.path.join(config.result_dir, "{}_{}.pdf".format(replicon.id, j)))

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if integrons:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file, sep="\t", index=False, na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary.to_csv(summary_file, sep="\t", na_rep="NA", index=False,
                           columns=['ID_replicon', 'ID_integron', 'complete', 'In0', 'CALIN'])
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db, config.distance_threshold)
                SeqIO.write(replicon, os.path.join(config.result_dir, replicon.id + ".gbk"), "genbank")
        else:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
    except integron_finder.EmptyFileError:
        # the replicon is skipped: callers get empty strings for both paths
        _log.warning('############ Skip replicon {} ############'.format(replicon.name))
        integron_file = ''
        summary_file = ''
    #########################
    # clean temporary files #
    #########################

    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # best effort: failing to clean up must not hide the results
            _log.warning("Cannot remove temporary results : '{} : {}'".format(result_tmp_dir, str(err)))

    return integron_file, summary_file
예제 #8
0
    def test_integron_1elem_prot(self):
        """
        Check add_feature on a report made of a single (non integrase) protein row.

        Two features are expected on the record afterwards (the integron and
        a CDS); the record sequence and id must be left unchanged.
        """
        row = {"ID_replicon": self.replicon_id,
               "ID_integron": "integron_01",
               "element": "ACBA.007.P01_13_20",
               "pos_beg": 17375,
               "pos_end": 17375,
               "strand": -1,
               "evalue": np.nan,
               "type_elt": "protein",
               "annotation": "protein",
               "model": "NA",
               "type": "complete",
               "default": "Yes",
               "distance_2attC": np.nan}
        report = pd.DataFrame(row, index=[0])

        # Snapshot what add_feature is not allowed to touch
        seq_before = self.seq.seq
        id_before = self.seq.id

        add_feature(self.seq, report, self.prot_db, self.dist_threshold)

        # Expected translation: protein ACBA.007.P01_13_20 in
        # tests/data/Results_Integron_Finder_acba.007.p01.13/acba.007.p01.13.prt
        expected_translation = ("MKGWLFLVIAIVGEVIATSALKSSEGFTKLAPSAVVIIGYGIAFYFLSLVLKSIPVGVAY"
                                "AVWSGLGVVIITAIAWLLHGQKLDAWGFVGMGLIIAAFLLARSPSWKSLRRPTPW*")

        features = self.seq.features
        # Exactly 2 features are expected (integron and protein)
        self.assertEqual(len(features), 2)
        # The initial sequence and id must not be modified
        self.assertEqual(self.seq.seq, seq_before)
        self.assertEqual(self.seq.id, id_before)

        # First feature: the integron (its start is expected at pos_beg - 1)
        integron = features[0]
        self.assertEqual(integron.location.start, row["pos_beg"] - 1)
        self.assertEqual(integron.location.end, row["pos_end"])
        self.assertEqual(integron.strand, 0)
        self.assertEqual(integron.type, "integron")
        self.assertEqual(integron.qualifiers["integron_id"], row["ID_integron"])
        self.assertEqual(integron.qualifiers["integron_type"], row["type"])

        # Second feature: the protein, expected to be typed as a CDS
        cds = features[1]
        self.assertEqual(cds.location.start, row["pos_beg"] - 1)
        self.assertEqual(cds.location.end, row["pos_end"])
        self.assertEqual(cds.strand, row["strand"])
        self.assertEqual(cds.type, "CDS")
        self.assertEqual(cds.qualifiers["protein_id"], row["element"])
        self.assertEqual(cds.qualifiers["gene"], row["annotation"])
        self.assertEqual(cds.qualifiers["model"], row["model"])
        self.assertEqual(str(cds.qualifiers["translation"]), expected_translation)
예제 #9
0
    def test_integron_2int_nelem(self):
        """
        Check add_feature on a report describing 2 integrons:

            * integron 1 holds several elements: promoter, integrase, protein
            * integron 2 holds a single attC site

        Neither integron spans the edge of the sequence. Six features are
        expected afterwards: each integron followed by its own elements.
        """
        # integron 1
        int_id = "integron_01"
        int_type = "complete"
        prom_row = {"ID_replicon": self.replicon_id,
                    "ID_integron": int_id,
                    "element": "Pc_int1",
                    "pos_beg": 25,
                    "pos_end": 51,
                    "strand": -1,
                    "evalue": np.nan,
                    "type_elt": "Promoter",
                    "annotation": "Pc_1",
                    "model": "NA",
                    "type": int_type,
                    "default": "Yes",
                    "distance_2attC": np.nan}
        int_row = {"ID_replicon": self.replicon_id,
                   "ID_integron": int_id,
                   "element": "ACBA.007.P01_13_1",
                   "pos_beg": 55,
                   "pos_end": 1014,
                   "strand": 1,
                   "evalue": 1.9e-25,
                   "type_elt": "protein",
                   "annotation": "intI",
                   "model": "NA",
                   "type": int_type,
                   "default": "Yes",
                   "distance_2attC": np.nan}
        prot_row = {"ID_replicon": self.replicon_id,
                    "ID_integron": int_id,
                    "element": "ACBA.007.P01_13_20",
                    "pos_beg": 2000,
                    "pos_end": 2056,
                    "strand": -1,
                    "evalue": np.nan,
                    "type_elt": "protein",
                    "annotation": "protein",
                    "model": "intersection_tyr_intI",
                    "type": int_type,
                    "default": "Yes",
                    "distance_2attC": np.nan}
        # integron 2
        attc_row = {"ID_replicon": self.replicon_id,
                    "ID_integron": "integron_02",
                    "element": "attc_001",
                    "pos_beg": 17825,
                    "pos_end": 17884,
                    "strand": -1,
                    "evalue": 1e-9,
                    "type_elt": "attC",
                    "annotation": "attC",
                    "model": "attc_4",
                    "type": int_type,
                    "default": "Yes",
                    "distance_2attC": np.nan}

        # One single-row frame per element, stacked in report order
        prom_df = pd.DataFrame(prom_row, index=[0])
        int_df = pd.DataFrame(int_row, index=[0])
        prot_df = pd.DataFrame(prot_row, index=[0])
        attc_df = pd.DataFrame(attc_row, index=[0])
        report = pd.concat([prom_df, int_df, prot_df, attc_df])

        # Snapshot what add_feature is not allowed to touch
        seq_before = self.seq.seq
        id_before = self.seq.id

        expected_tr_int = ("MKTATAPLPPLRSVKVLDQLRERIRYLHYSLRTEQAYVNWVRAFIRFHGVRHPATLGSSE"
                           "VEAFLSWLANERKVSVSTHRQALAALLFFYGKVLCTDLPWLQEIGRPRPSRRLPVVLTPD"
                           "EVVRILGFLEGEHRLFAQLLYGTGMRISEGLQLRVKDLDFDHGTIIVREGKGSKDRALML"
                           "PESLAPSLREQLSRARAWWLKDQAEGRSGVALPDALERKYPRAGHSWPWFWVFAQHTHST"
                           "DPRSGVVRRHHMYDQTFQRAFKRAVEGTVAKLAMRQPFVLFKGLTFQKLCLPGAFRPGDH"
                           "HNKMLRPGLCVVHASPQYL*")
        expected_tr_prot = ("MKGWLFLVIAIVGEVIATSALKSSEGFTKLAPSAVVIIGYGIAFYFLSLVLKSIPVGVAY"
                            "AVWSGLGVVIITAIAWLLHGQKLDAWGFVGMGLIIAAFLLARSPSWKSLRRPTPW*")

        add_feature(self.seq, report, self.prot_db, self.dist_threshold)

        features = self.seq.features
        # 6 features are expected (integron1, promoter, integrase, protein,
        #                          integron2, attC)
        self.assertEqual(len(features), 6)
        # The initial sequence and id must not be modified
        self.assertEqual(self.seq.seq, seq_before)
        self.assertEqual(self.seq.id, id_before)

        # Feature 1: integron1, spanning from the promoter start to the protein end
        integron1 = features[0]
        self.assertEqual(integron1.location.start, prom_row["pos_beg"] - 1)
        self.assertEqual(integron1.location.end, prot_row["pos_end"])
        self.assertEqual(integron1.strand, 0)
        self.assertEqual(integron1.type, "integron")
        self.assertEqual(integron1.qualifiers["integron_id"], int_id)
        self.assertEqual(integron1.qualifiers["integron_type"], int_type)

        # Feature 2: promoter
        promoter = features[1]
        self.assertEqual(promoter.location.start, prom_row["pos_beg"] - 1)
        self.assertEqual(promoter.location.end, prom_row["pos_end"])
        self.assertEqual(promoter.strand, prom_row["strand"])
        self.assertEqual(promoter.type, "Promoter")
        self.assertEqual(promoter.qualifiers["Promoter"], prom_row["element"])
        self.assertEqual(promoter.qualifiers["model"], prom_row["model"])

        # Feature 3: integrase
        integrase = features[2]
        self.assertEqual(integrase.location.start, int_row["pos_beg"] - 1)
        self.assertEqual(integrase.location.end, int_row["pos_end"])
        self.assertEqual(integrase.strand, int_row["strand"])
        self.assertEqual(integrase.type, "integrase")
        self.assertEqual(integrase.qualifiers["protein_id"], int_row["element"])
        self.assertEqual(integrase.qualifiers["gene"], int_row["annotation"])
        self.assertEqual(integrase.qualifiers["model"], int_row["model"])
        self.assertEqual(str(integrase.qualifiers["translation"]), expected_tr_int)

        # Feature 4: protein
        cds = features[3]
        self.assertEqual(cds.location.start, prot_row["pos_beg"] - 1)
        self.assertEqual(cds.location.end, prot_row["pos_end"])
        self.assertEqual(cds.strand, prot_row["strand"])
        self.assertEqual(cds.type, "CDS")
        self.assertEqual(cds.qualifiers["protein_id"], prot_row["element"])
        self.assertEqual(cds.qualifiers["gene"], prot_row["annotation"])
        self.assertEqual(cds.qualifiers["model"], prot_row["model"])
        self.assertEqual(str(cds.qualifiers["translation"]), expected_tr_prot)

        # Feature 5: integron2, limited to its single attC site
        integron2 = features[4]
        self.assertEqual(integron2.location.start, attc_row["pos_beg"] - 1)
        self.assertEqual(integron2.location.end, attc_row["pos_end"])
        self.assertEqual(integron2.strand, 0)
        self.assertEqual(integron2.type, "integron")
        self.assertEqual(integron2.qualifiers["integron_id"], "integron_02")
        self.assertEqual(integron2.qualifiers["integron_type"], int_type)

        # Feature 6: attC
        attc = features[5]
        self.assertEqual(attc.location.start, attc_row["pos_beg"] - 1)
        self.assertEqual(attc.location.end, attc_row["pos_end"])
        self.assertEqual(attc.strand, attc_row["strand"])
        self.assertEqual(attc.type, "attC")
        self.assertEqual(attc.qualifiers["attC"], attc_row["element"])
        self.assertEqual(attc.qualifiers["model"], attc_row["model"])
예제 #10
0
    def test_integron_1elem_int(self):
        """
        Check add_feature on a report made of a single integrase row.

        Two features are expected on the record afterwards (the integron and
        its integrase); the record sequence and id must be left unchanged.
        """
        row = {"ID_replicon": self.replicon_id,
               "ID_integron": "integron_01",
               "element": "ACBA.007.P01_13_1",
               "pos_beg": 55,
               "pos_end": 1014,
               "strand": 1,
               "evalue": 1.9e-25,
               "type_elt": "protein",
               "annotation": "intI",
               "model": "intersection_tyr_intI",
               "type": "complete",
               "default": "Yes",
               "distance_2attC": np.nan}
        report = pd.DataFrame(row, index=[0])

        # Snapshot what add_feature is not allowed to touch
        seq_before = self.seq.seq
        id_before = self.seq.id

        add_feature(self.seq, report, self.prot_db, self.dist_threshold)

        # Expected translation: protein ACBA.007.P01_13_1 in
        # tests/data/Results_Integron_Finder_acba.007.p01.13/acba.007.p01.13.prt
        expected_translation = ("MKTATAPLPPLRSVKVLDQLRERIRYLHYSLRTEQAYVNWVRAFIRFHGVRHPATLGSSE"
                                "VEAFLSWLANERKVSVSTHRQALAALLFFYGKVLCTDLPWLQEIGRPRPSRRLPVVLTPD"
                                "EVVRILGFLEGEHRLFAQLLYGTGMRISEGLQLRVKDLDFDHGTIIVREGKGSKDRALML"
                                "PESLAPSLREQLSRARAWWLKDQAEGRSGVALPDALERKYPRAGHSWPWFWVFAQHTHST"
                                "DPRSGVVRRHHMYDQTFQRAFKRAVEGTVAKLAMRQPFVLFKGLTFQKLCLPGAFRPGDH"
                                "HNKMLRPGLCVVHASPQYL*")

        features = self.seq.features
        # Exactly 2 features are expected (integron and protein)
        self.assertEqual(len(features), 2)
        # The initial sequence and id must not be modified
        self.assertEqual(self.seq.seq, seq_before)
        self.assertEqual(self.seq.id, id_before)

        # First feature: the integron (its start is expected at pos_beg - 1)
        integron = features[0]
        self.assertEqual(integron.location.start, row["pos_beg"] - 1)
        self.assertEqual(integron.location.end, row["pos_end"])
        self.assertEqual(integron.strand, 0)
        self.assertEqual(integron.type, "integron")
        self.assertEqual(integron.qualifiers["integron_id"], row["ID_integron"])
        self.assertEqual(integron.qualifiers["integron_type"], row["type"])

        # Second feature: the protein, expected to be typed as an integrase
        integrase = features[1]
        self.assertEqual(integrase.location.start, row["pos_beg"] - 1)
        self.assertEqual(integrase.location.end, row["pos_end"])
        self.assertEqual(integrase.strand, row["strand"])
        self.assertEqual(integrase.type, "integrase")
        self.assertEqual(integrase.qualifiers["protein_id"], row["element"])
        self.assertEqual(integrase.qualifiers["gene"], row["annotation"])
        self.assertEqual(integrase.qualifiers["model"], row["model"])
        self.assertEqual(str(integrase.qualifiers["translation"]), expected_translation)
예제 #11
0
def find_integron_in_one_replicon(replicon, config):
    """
    Scan one replicon for integrons.

    The search looks for:

      * presence of integrase
      * presence of attC sites
      * presence of promoters and attI sites

    depending on the configuration

     * perform functional annotation

    produce a file containing presence of putative integrons

    depending on configuration

        * produce genbank file with replicon and annotations with integrons
        * produce schema of replicon with integrons (in pdf)

    Intermediate results are stored in a per-replicon temporary directory
    (``config.tmp_dir(replicon.id)``). Files already present there are reused
    instead of being recomputed. The directory is removed at the end unless
    ``config.keep_tmp`` is set.

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons)
              and the summary file (<replicon_id>.summary).
              If there is no integron the summary file is None.
              If the replicon is skipped because of an EmptyFileError,
              both values are empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: if functional annotation is requested but neither
                           'bank_hmm' nor ``config.func_annot_path`` exists.
    :raises OSError: if the temporary directory cannot be created.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)
    try:
        os.mkdir(result_tmp_dir)
    except OSError:
        # An already existing directory is fine (its content is reused below);
        # any other failure (permissions, missing parent, ...) must not be
        # hidden, otherwise it only shows up later as an obscure write error.
        if not os.path.isdir(result_tmp_dir):
            raise
    tmp_replicon_path = os.path.join(result_tmp_dir, replicon.id + '.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provide on the command line
    fa_hmm = None  # always bound, even when functional annotation is off
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        if os.path.exists('bank_hmm'):
            fa_hmm = scan_hmm_bank('bank_hmm')
        elif os.path.exists(config.func_annot_path):
            fa_hmm = scan_hmm_bank(config.func_annot_path)
        else:
            raise IntegronError(
                "the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                "profile with --path-func-annot option".format(
                    config.func_annot_path))
        is_func_annot = True

    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
        is_func_annot = True
    else:
        is_func_annot = False

    if is_func_annot and not fa_hmm:
        _log.warning(
            "No hmm profiles for functional annotation detected, skip functional annotation step."
        )

    # Choose the protein source: explicit gembase path, default gembase
    # layout, or proteins predicted through ProdigalDB.
    if config.gembase_path:
        protein_db = GembaseDB(replicon,
                               config,
                               gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = os.path.join(result_tmp_dir, replicon.id + "_intI.res")
    phageI_file = os.path.join(result_tmp_dir, replicon.id + "_phage_int.res")
    attC_default_file = os.path.join(result_tmp_dir,
                                     replicon.id + "_attc_table.res")

    try:
        if not config.no_proteins:
            # Run the integrase search only if one of its outputs is missing.
            if not os.path.isfile(intI_file) or not os.path.isfile(
                    phageI_file):
                find_integrase(replicon.id, protein_db.protfile,
                               result_tmp_dir, config)
        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            find_attc(tmp_replicon_path,
                      replicon.name,
                      config.cmsearch,
                      result_tmp_dir,
                      config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)

        _log.info("Default search done... : ")
        integrons = find_integron(replicon, protein_db, attC_default_file,
                                  intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            integron_max_pickle = os.path.join(result_tmp_dir,
                                               "integron_max.pickle")
            if not os.path.isfile(integron_max_pickle):
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(
                    integrons,
                    replicon,
                    config.distance_threshold,
                    config.model_attc_path,
                    max_attc_size=config.max_attc_size,
                    min_attc_size=config.min_attc_size,
                    circular=circular,
                    out_dir=result_tmp_dir,
                    cpu=config.cpu,
                    evalue_attc=config.evalue_attc)
                integron_max.to_pickle(integron_max_pickle)
                _log.info("Search with local_max done... :")

            else:
                # NOTE(review): the pickle is expected to be the one written
                # by the branch above during a previous run; unpickling is
                # only safe if the temporary directory is trusted.
                integron_max = pd.read_pickle(integron_max_pickle)
                # Re-apply the current evalue and size thresholds to the
                # cached table.
                attc_size = abs(integron_max.pos_end - integron_max.pos_beg)
                integron_max = integron_max[
                    (integron_max.evalue < config.evalue_attc)
                    & (attc_size < config.max_attc_size)
                    & (config.min_attc_size < attc_size)]
                _log.info(
                    "Search with local_max was already done, continue... :")

            integrons = find_integron(replicon, protein_db, integron_max,
                                      intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            if integron_type != "In0":  # complete & CALIN
                if not config.no_proteins:
                    _log.info("Adding proteins ... :")
                    integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()
        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config,
                       result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # Only complete integrons are drawn; j is the 1-based rank of the
            # integron in the list, so file numbers may have gaps.
            for j, integron in enumerate(integrons, 1):
                if integron.type() == "complete":
                    integron.draw_integron(file=os.path.join(
                        config.result_dir, "{}_{}.pdf".format(replicon.id, j)))

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if integrons:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file,
                                    sep="\t",
                                    index=False,
                                    na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary.to_csv(summary_file,
                           sep="\t",
                           na_rep="NA",
                           index=False,
                           columns=[
                               'ID_replicon', 'ID_integron', 'complete', 'In0',
                               'CALIN'
                           ])
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db,
                            config.distance_threshold)
                SeqIO.write(
                    replicon,
                    os.path.join(config.result_dir, replicon.id + ".gbk"),
                    "genbank")
        else:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
    except integron_finder.EmptyFileError:
        # The replicon is skipped: report it and return empty paths.
        _log.warning('############ Skip replicon {} ############'.format(
            replicon.name))
        integron_file = ''
        summary_file = ''
    #########################
    # clean temporary files #
    #########################

    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # Best effort: a failed cleanup must not invalidate the results.
            _log.warning("Cannot remove temporary results : '{} : {}'".format(
                result_tmp_dir, str(err)))

    return integron_file, summary_file
    def test_integron_2int_nelem(self):
        """
        Check add_feature when the report describes 2 integrons:
            integron 1 made of several elements: promoter, integrase, protein
            integron 2 made of a single attC site
        None of the integrons spans the edge of the sequence.
        """
        int_id = "integron_01"
        int_type = "complete"

        def describe(id_integron, element, pos_beg, pos_end, strand, evalue,
                     type_elt, annotation, model):
            # One report row; key order matters as it fixes the column order
            # of the resulting dataframe.
            return {"ID_replicon": self.replicon_id,
                    "ID_integron": id_integron,
                    "element": element,
                    "pos_beg": pos_beg,
                    "pos_end": pos_end,
                    "strand": strand,
                    "evalue": evalue,
                    "type_elt": type_elt,
                    "annotation": annotation,
                    "model": model,
                    "type": int_type,
                    "default": "Yes",
                    "distance_2attC": np.nan
                    }

        # integron 1
        infos_prom = describe(int_id, "Pc_int1", 25, 51, -1, np.nan,
                              "Promoter", "Pc_1", "NA")
        infos_int = describe(int_id, "ACBA.007.P01_13_1", 55, 1014, 1, 1.9e-25,
                             "protein", "intI", "NA")
        infos_prot = describe(int_id, "ACBA.007.P01_13_20", 2000, 2056, -1, np.nan,
                              "protein", "protein", "intersection_tyr_intI")
        # integron 2
        infos_attC = describe("integron_02", "attc_001", 17825, 17884, -1, 1e-9,
                              "attC", "attC", "attc_4")

        df = pd.concat([pd.DataFrame(row, index=[0])
                        for row in (infos_prom, infos_int, infos_prot, infos_attC)])

        start_seq = self.seq.seq
        start_id = self.seq.id

        # Expected translations of the integrase and of the other protein
        tr_int = ("MKTATAPLPPLRSVKVLDQLRERIRYLHYSLRTEQAYVNWVRAFIRFHGVRHPATLGSSE"
                  "VEAFLSWLANERKVSVSTHRQALAALLFFYGKVLCTDLPWLQEIGRPRPSRRLPVVLTPD"
                  "EVVRILGFLEGEHRLFAQLLYGTGMRISEGLQLRVKDLDFDHGTIIVREGKGSKDRALML"
                  "PESLAPSLREQLSRARAWWLKDQAEGRSGVALPDALERKYPRAGHSWPWFWVFAQHTHST"
                  "DPRSGVVRRHHMYDQTFQRAFKRAVEGTVAKLAMRQPFVLFKGLTFQKLCLPGAFRPGDH"
                  "HNKMLRPGLCVVHASPQYL*")
        tr_prot = ("MKGWLFLVIAIVGEVIATSALKSSEGFTKLAPSAVVIIGYGIAFYFLSLVLKSIPVGVAY"
                   "AVWSGLGVVIITAIAWLLHGQKLDAWGFVGMGLIIAAFLLARSPSWKSLRRPTPW*")

        add_feature(self.seq, df, self.prot_db, self.dist_threshold)

        features = self.seq.features
        # 6 features are expected: integron1, promoter, integrase, protein,
        #                          integron2, attC
        self.assertEqual(len(features), 6)
        # The sequence itself and its id must be left untouched
        self.assertEqual(self.seq.seq, start_seq)
        self.assertEqual(self.seq.id, start_id)

        integron1, promoter, integrase, protein, integron2, attc = features

        # Feature 1: integron1, from the promoter start to the protein end
        self.assertEqual(integron1.location.start, infos_prom["pos_beg"] - 1)
        self.assertEqual(integron1.location.end, infos_prot["pos_end"])
        self.assertEqual(integron1.strand, 0)
        self.assertEqual(integron1.type, "integron")
        self.assertEqual(integron1.qualifiers["integron_id"], int_id)
        self.assertEqual(integron1.qualifiers["integron_type"], int_type)

        # Feature 2: promoter
        self.assertEqual(promoter.location.start, infos_prom["pos_beg"] - 1)
        self.assertEqual(promoter.location.end, infos_prom["pos_end"])
        self.assertEqual(promoter.strand, infos_prom["strand"])
        self.assertEqual(promoter.type, "Promoter")
        self.assertEqual(promoter.qualifiers["Promoter"], infos_prom["element"])
        self.assertEqual(promoter.qualifiers["model"], infos_prom["model"])

        # Feature 3: integrase
        self.assertEqual(integrase.location.start, infos_int["pos_beg"] - 1)
        self.assertEqual(integrase.location.end, infos_int["pos_end"])
        self.assertEqual(integrase.strand, infos_int["strand"])
        self.assertEqual(integrase.type, "integrase")
        self.assertEqual(integrase.qualifiers["protein_id"], infos_int["element"])
        self.assertEqual(integrase.qualifiers["gene"], infos_int["annotation"])
        self.assertEqual(integrase.qualifiers["model"], infos_int["model"])
        self.assertEqual(str(integrase.qualifiers["translation"]), tr_int)

        # Feature 4: protein
        self.assertEqual(protein.location.start, infos_prot["pos_beg"] - 1)
        self.assertEqual(protein.location.end, infos_prot["pos_end"])
        self.assertEqual(protein.strand, infos_prot["strand"])
        self.assertEqual(protein.type, "CDS")
        self.assertEqual(protein.qualifiers["protein_id"], infos_prot["element"])
        self.assertEqual(protein.qualifiers["gene"], infos_prot["annotation"])
        self.assertEqual(protein.qualifiers["model"], infos_prot["model"])
        self.assertEqual(str(protein.qualifiers["translation"]), tr_prot)

        # Feature 5: integron2, same boundaries as its only attC site
        self.assertEqual(integron2.location.start, infos_attC["pos_beg"] - 1)
        self.assertEqual(integron2.location.end, infos_attC["pos_end"])
        self.assertEqual(integron2.strand, 0)
        self.assertEqual(integron2.type, "integron")
        self.assertEqual(integron2.qualifiers["integron_id"], "integron_02")
        self.assertEqual(integron2.qualifiers["integron_type"], int_type)

        # Feature 6: attC
        self.assertEqual(attc.location.start, infos_attC["pos_beg"] - 1)
        self.assertEqual(attc.location.end, infos_attC["pos_end"])
        self.assertEqual(attc.strand, infos_attC["strand"])
        self.assertEqual(attc.type, "attC")
        self.assertEqual(attc.qualifiers["attC"], infos_attC["element"])
        self.assertEqual(attc.qualifiers["model"], infos_attC["model"])