Example #1
    def setUp(self):
        asv_table_str = """variant_id	marker_name	run_name	sequence_length	read_count	sample1	sample2	sample3	chimera_borderline	sequence
3	MFZR	prerun	176	9713	9712	1	0	FALSE	TCTATATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCTCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
33	MFZR	prerun	174	9713	9703	10	0	FALSE	CTATATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCTCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
333	ZFZR	prerun	157	10000	9900	10	0	FALSE	TGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCTCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
836	MFZR	prerun	176	11588	123	56	0	FALSE	TCTATATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGGACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCCCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
8368	ZFZR	prerun	157	545	500	0	45	FALSE	TGCTTGGGCAGGTATGGTAGGGACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCCCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
83683	MFZR	prerun	175	484	0	28	456	FALSE	TCTAAATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGGACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCCCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
"""
        asv_table_df = pandas.read_csv(io.StringIO(asv_table_str),
                                       sep="\t",
                                       header=0)
        self.asv_table_df = asv_table_df
        # Create this_tempdir
        this_tempdir = os.path.join(PathManager.instance().get_tempdir(),
                                    os.path.basename(__file__))
        pathlib.Path(this_tempdir).mkdir(exist_ok=True)
        # Define fasta_path
        fasta_path = os.path.join(PathManager.instance().get_tempdir(),
                                  os.path.basename(__file__), 'variants.fa')
        # Create variant dataframe
        variant_df = asv_table_df[['variant_id', 'sequence', 'read_count'
                                   ]].drop_duplicates(inplace=False)
        variant_df.columns = ['id', 'sequence', 'size']
        # Create fasta_path file from asv_table_df
        variant_df_utils = DataframeVariant(variant_df)
        variant_df_utils.to_fasta(fasta_path, add_column='size')
        # Define vsearch centroid output path
        vsearch_output_path = os.path.join(
            PathManager.instance().get_tempdir(), os.path.basename(__file__),
            'centroid_out.fa')
        # Define vsearch cluster output path
        vsearch_cluster_output_path = os.path.join(
            PathManager.instance().get_tempdir(), os.path.basename(__file__),
            'cluster.fa')
        #
        # Create RunnerVSearch object and run vsearch
        os.environ["VTAM_THREADS"] = "1"
        vsearch_parameters = {
            '--cluster_size': fasta_path,
            '--clusters': vsearch_cluster_output_path,
            '--id': 1,
            '--sizein': None,
            '--centroids': vsearch_output_path,
            "--threads": int(os.getenv('VTAM_THREADS')),
        }
        vsearch_cluster = RunnerVSearch(parameters=vsearch_parameters)
        vsearch_cluster.run()
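For reference, a parameter dict like vsearch_parameters above maps onto a single vsearch call. A minimal sketch of how such a {flag: value} dict could be flattened into an argument list (the actual assembly inside RunnerVSearch is assumed, not shown here):

def vsearch_params_to_args(parameters):
    # Flags whose value is None (e.g. '--sizein') are emitted without a value
    args = ['vsearch']
    for flag, value in parameters.items():
        args.append(flag)
        if value is not None:
            args.append(str(value))
    return args

# vsearch_params_to_args(vsearch_parameters) ->
# ['vsearch', '--cluster_size', '<variants.fa>', '--clusters', '<cluster.fa>',
#  '--id', '1', '--sizein', '--centroids', '<centroid_out.fa>', '--threads', '1']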
Example #2
    def setUp(self):
        """>parent1;size=650
TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATAATTAGTTGG
>parent2;size=700
AACTATGTACACAAATTTTAGTATATTGGCAGGGATAGTAGGAACTTTACTATCGTTAGTTATCAGAATGGAATTATCAA
CAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG
>Chimera1;size=50
TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG
>Chimera2;size=300
TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTG
CAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG
>Chimera3;size=50
TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG"""
        """(vtam_appli) gonzalez@milan:~/tmp/vsearch_uchime$ vsearch --uchime_denovo i.fa --borderline borderline.fa --nonchimeras nonchimeras.fa --chimeras chimeras.fa
vsearch v2.7.0_linux_x86_64, 15.5GB RAM, 8 cores
https://github.com/torognes/vsearch

Reading file i.fa 100%
1500 nt in 5 seqs, min 300, max 300, avg 300
Masking 100%
Sorting by abundance 100%
Counting k-mers 100%
Detecting chimeras 100%
Found 2 (40.0%) chimeras, 2 (40.0%) non-chimeras,
and 1 (20.0%) borderline sequences in 5 unique sequences.
Taking abundance information into account, this corresponds to
350 (20.0%) chimeras, 1350 (77.1%) non-chimeras,
and 50 (2.9%) borderline sequences in 1750 total sequences"""
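        # Abundance decomposition of the vsearch report above: non-chimeras =
        # parent1 (650) + parent2 (700) = 1350 reads (77.1% of 1750);
        # chimeras = Chimera2 (300) plus one of the two size-50 sequences =
        # 350 (20.0%); the remaining size-50 sequence is the borderline case
        # (2.9%). Which size-50 sequence is borderline is not determined by
        # the report.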

        # Input from min_replicate_number
        # Variants 1 and 2 are expected parents; 3-5 are chimeras or borderline
        self.variant_df = pandas.DataFrame(
            data={
                'sequence': [
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATAATTAGTTGG',
                    'AACTATGTACACAAATTTTAGTATATTGGCAGGGATAGTAGGAACTTTACTATCGTTAGTTATCAGAATGGAATTATCAACAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                ],
            },
            index=list(range(1, 6)),
        )
        #
        self.variant_read_count_df = pandas.DataFrame({
            'run_id': [1] * 5,
            'marker_id': [1] * 5,
            'sample_id': [1] * 5,
            'replicate': [1] * 5,
            'variant_id':
            list(range(1, 6)),
            'read_count': [650, 700, 50, 350, 50],
        })
        self.this_tempdir = os.path.join(PathManager.instance().get_tempdir(),
                                         os.path.basename(__file__))
        pathlib.Path(self.this_tempdir).mkdir(parents=True, exist_ok=True)
        os.environ['VTAM_THREADS'] = str(multiprocessing.cpu_count())
Example #3
def pip_install_vtam_for_tests():
    """This function is used in the tests when the vtam command is run"""

    cmd = '{} -m pip install . -q --upgrade'.format(sys.executable)
    if sys.platform.startswith("win"):
        args = cmd
    else:
        args = shlex.split(cmd)
    subprocess.run(args=args,
                   check=True,
                   cwd=PathManager.instance().get_project_path())
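A sketch of typical usage in a test module (the test class here is hypothetical):

import unittest

class TestVtamCommand(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Install the working tree once so that the 'vtam' console command
        # exercised by the tests matches the current sources
        pip_install_vtam_for_tests()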
Example #4
    def __init__(self, variant_fasta, blast_db_dir, blast_db_name, num_threads,
                 qcov_hsp_perc):

        self.variant_fasta = variant_fasta
        self.blast_db_dir = blast_db_dir
        self.blast_db_name = blast_db_name
        # self.ltg_rule_threshold = ltg_rule_threshold
        # self.include_prop = include_prop
        # self.min_number_of_taxa = min_number_of_taxa
        self.num_threads = num_threads
        self.qcov_hsp_perc = qcov_hsp_perc

        self.this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                          os.path.basename(__file__))
        pathlib.Path(self.this_temp_dir).mkdir(exist_ok=True, parents=True)
Example #5
    def __init__(self, variant_expected_df, variant_unexpected_df,
                 variant_read_count_df):
        """
        Initializes the object for the PCR error filter

        :param variant_expected_df: DataFrame (id, sequence) with expected variants
        :param variant_unexpected_df: DataFrame (id, sequence) with unexpected variants
        :param variant_read_count_df: DataFrame (run_id, marker_id, sample_id, replicate, variant_id, read_count)
        """
        self.__variant_expected_df = variant_expected_df
        self.__variant_unexpected_df = variant_unexpected_df
        self.__variant_read_count_df = variant_read_count_df
        self.__tmp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                      self.__class__.__name__)
        pathlib.Path(self.__tmp_dir).mkdir(parents=True, exist_ok=True)
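A minimal instantiation sketch following the docstring above; the class name RunnerFilterPCRerror is taken from its use in Example #12, and all values are illustrative:

import pandas

variant_expected_df = pandas.DataFrame({'id': [1], 'sequence': ['ACGTACGT']})
variant_unexpected_df = pandas.DataFrame({'id': [2], 'sequence': ['ACGTACGA']})
variant_read_count_df = pandas.DataFrame({
    'run_id': [1, 1], 'marker_id': [1, 1], 'sample_id': [1, 1],
    'replicate': [1, 1], 'variant_id': [1, 2], 'read_count': [1000, 5]})
runner = RunnerFilterPCRerror(variant_expected_df=variant_expected_df,
                              variant_unexpected_df=variant_unexpected_df,
                              variant_read_count_df=variant_read_count_df)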
Example #6
    def compute_clusters(self):

        tempcluster_dir = PathManager.instance().get_tempdir()

        i_fas = os.path.join(tempcluster_dir, 'cluster_input.fas')
        with open(i_fas, 'w') as fout:
            for idx, row in self.variant_info_df.iterrows():
                valdict = {}
                valdict['variant_id'] = row.variant_id
                valdict['read_count'] = row.read_count
                valdict['sequence'] = row.sequence
                fout.write(
                    ">{variant_id};size={read_count}\n{sequence}\n".format(
                        **valdict))
        cmd = "vsearch --cluster_size cluster_input.fas --id {} --otutabout otutabout.txt --clusters test".format(
            self.cluster_identity)
        if sys.platform.startswith("win"):
            args = cmd
        else:
            args = shlex.split(cmd)
        subprocess.run(args=args, cwd=tempcluster_dir, check=True)

        otutabout_path = os.path.join(tempcluster_dir, "otutabout.txt")
        otutabout_df = pandas.read_csv(otutabout_path, sep="\t")
        otutabout_df.rename({'#OTU ID': 'centroid'}, axis=1, inplace=True)

        otutabout_long_df = pandas.melt(otutabout_df,
                                        id_vars=['centroid'],
                                        var_name='variant_id',
                                        value_name='read_count')
        otutabout_long_df.rename({'centroid': 'clusterid'},
                                 axis=1,
                                 inplace=True)
        otutabout_long_df = otutabout_long_df.loc[
            otutabout_long_df.read_count > 0]
        otutabout_long_df.variant_id = otutabout_long_df.variant_id.astype(
            'int')

        cluster_count_df = otutabout_long_df[['clusterid', 'variant_id'
                                              ]].groupby('clusterid').count()
        cluster_count_df.rename({'variant_id': 'clustersize'},
                                axis=1,
                                inplace=True)
        cluster_count_df = otutabout_long_df[['clusterid', 'variant_id'
                                              ]].merge(cluster_count_df,
                                                       on='clusterid')

        return cluster_count_df
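For orientation, a toy illustration of the melt step above; the wide column layout (one row per centroid, one column per variant_id) is assumed from how the code consumes the --otutabout file:

import pandas

toy_wide_df = pandas.DataFrame({'centroid': ['c1', 'c2'],
                                '1': [10, 0], '2': [0, 3], '3': [5, 0]})
toy_long_df = pandas.melt(toy_wide_df, id_vars=['centroid'],
                          var_name='variant_id', value_name='read_count')
toy_long_df = toy_long_df.loc[toy_long_df.read_count > 0]
# -> rows (c1, '1', 10), (c2, '2', 3), (c1, '3', 5): one row per variant with
#    its positive count (variant_id is still str here; the real code casts to int)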
Example #7
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                     os.path.basename(__file__))
        pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

        ############################################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        ############################################################################################

        # Input file paths
        known_occurrences_tsv = self.input_file(
            OptimizePCRerror.__input_file_known_occurrences)
        fasta_info_tsv = self.input_file(
            OptimizePCRerror.__input_file_sortedinfo)
        #
        # Output file paths
        output_optimize_path = self.output_file(
            OptimizePCRerror.__output_file_optimize_pcr_error)

        ############################################################################################
        #
        # Get nijk_df, known_occurrences_df
        #
        ############################################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            VariantReadCount, engine=engine)

        known_occurrences_df = FileKnownOccurrences(
            known_occurrences_tsv).to_identifier_df(engine)

        ############################################################################################
        #
        # Run optimizer and write output
        #
        ############################################################################################

        optimize_pcr_error_runner = RunnerOptimizePCRerror(
            variant_read_count_df=variant_read_count_df,
            known_occurrences_df=known_occurrences_df)
        optimize_pcr_error_runner.to_tsv(optimize_path=output_optimize_path,
                                         engine=engine)
Example #8
    def __init__(self, asv_table_df, readcounts, run_marker_df=None):
        """
        Constructor of the CommandPoolRunMarkers class

        Parameters
        ----------
        asv_table_df : pandas dataframe
            ASV table.
        readcounts : bool
            Default False.
            If False, a boolean 0/1 marks the presence or absence of each variant in the pooled table.
            If True, an integer gives the sum of reads over the pooled runs or markers.
        run_marker_df : pandas dataframe
            Restricts pooling to the run_name/marker_name pairs it contains.
            Default None: pool all runs and markers.
        """

        header = {
            'run_name', 'marker_name', 'variant_id', 'sequence_length',
            'read_count'
        }
        # The ASV table must contain at least the required header columns
        if not set(asv_table_df.columns) >= header:
            Logger.instance().error(
                VTAMexception(
                    "The ASV table structure is wrong. It is expected to contain these columns: "
                    "run_name, marker_name, variant_id, sequence_length, read_count"
                ))
            sys.exit(1)

        self.sample_names = asv_table_df.columns.tolist()[5:-2]

        if run_marker_df is None:  # Default: pool all runs and markers
            self.asv_table_df = asv_table_df
        else:  # Pool only the runs/markers listed in run_marker_df
            self.asv_table_df = asv_table_df.merge(
                run_marker_df, on=['run_name', 'marker_name'])

        self.tmp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                    os.path.basename(__file__))
        pathlib.Path(self.tmp_dir).mkdir(exist_ok=True)

        self.cluster_path = None  # returned by run_vsearch_to_cluster_sequences

        self.cluster_df = None  # returned by get_vsearch_clusters_to_df
        self.readcounts = readcounts
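The [5:-2] slice above relies on the fixed ASV-table column layout seen in Example #1: five leading metadata columns, then the sample columns, then chimera_borderline and sequence. A quick check:

cols = ['variant_id', 'marker_name', 'run_name', 'sequence_length', 'read_count',
        'sample1', 'sample2', 'sample3', 'chimera_borderline', 'sequence']
assert cols[5:-2] == ['sample1', 'sample2', 'sample3']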
Example #9
    def __init__(self, command, cli_args_dic):
        """

        :param command: takes one of two values: filter or optimize
        :param cli_args_dic: dictionary (CLIargumentDict.instance()) with the command arguments
        """

        self.command = command
        self.cli_args_and_numerical_params = {}
        self.cli_args_and_numerical_params.update(cli_args_dic)

        # Add user params_lfn_variant.yml parameters

        params_dic = FileParams(cli_args_dic['params']).get_params_dic()
        self.cli_args_and_numerical_params.update(params_dic)

        self.wopfile_path = None
        self.tempdir = PathManager.instance().get_tempdir()
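Note the update order above: the params.yml values are applied after the CLI dict, so on a key collision the parameter file wins. A tiny illustration (the key name is only an example):

merged = {'lfn_variant_cutoff': 0.001}        # from the CLI dict
merged.update({'lfn_variant_cutoff': 0.005})  # from params.yml
assert merged['lfn_variant_cutoff'] == 0.005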
Example #10
    def __init__(self, taxonomy_tsv=None):
        """

        :param taxonomy_tsv: Path to the taxonomy_tsv. Default None
        :type taxonomy_tsv: str

        :rtype: None
        """

        if taxonomy_tsv is None:  # If None, download to current wdir
            self.taxonomy_tsv_path = os.path.join(os.getcwd(), "taxonomy.tsv")
        else:  # Download to the given tsv_path
            self.taxonomy_tsv_path = taxonomy_tsv

        # Use the resolved path here: os.path.dirname(taxonomy_tsv) would
        # raise TypeError when taxonomy_tsv is None
        pathlib.Path(os.path.dirname(self.taxonomy_tsv_path)).mkdir(
            parents=True, exist_ok=True)

        self.tempdir = PathManager.instance().get_tempdir()

        package_path = PathManager.get_package_path()
        self.taxonomy_tsv_gz_path = os.path.join(package_path, "..", "data",
                                                 "taxonomy.tsv.gz")
Example #11
    def setUp(self):
        os.environ['VTAM_THREADS'] = str(multiprocessing.cpu_count())

        # Input from min_replicate_number
        self.variant_df = pandas.DataFrame(
            {
                'sequence': [
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                    'TGTTCTTTATTTATTATTTGATGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTCGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                    'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTCGCTGTAACTTTATCATTATCAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                ],
            },
            index=list(range(1, 5)))
        #
        self.variant_read_count_df = pandas.DataFrame({
            'run_id': [1] * 8,
            'marker_id': [1] * 8,
            'sample_id': [1] * 8,
            'replicate': [1, 2] * 4,
            'variant_id': [1] * 2 + [2] * 2 + [3] * 2 + [4] * 2,
            'read_count': [350, 300, 300, 220, 60, 0, 2, 0],
        })
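        # Summing over the two replicates gives N_i = 650, 520, 60, 2 for
        # variants 1-4; each of variants 2-4 differs from another variant by
        # a single base, which is presumably the PCR-error situation this
        # test targets.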

        self.this_tempdir = os.path.join(PathManager.instance().get_tempdir(),
                                         os.path.basename(__file__))
        pathlib.Path(self.this_tempdir).mkdir(parents=True, exist_ok=True)
Example #12
    def run(self):
        session = self.session
        engine = session._session().get_bind()

        this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                     os.path.basename(__file__))
        pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

        ############################################################################################
        #
        # Wrapper inputs, outputs and parameters
        #
        ############################################################################################
        #
        # Input file output
        fasta_info_tsv = self.input_file(
            FilterPCRerror.__input_file_sortedinfo)
        #
        # Input table models
        input_filter_min_replicate_model = self.input_table(
            FilterPCRerror.__input_table_filter_min_replicate_number)
        #
        # Options
        pcr_error_var_prop = self.option("pcr_error_var_prop")
        #
        # Output table models
        output_filter_pcr_error_model = self.output_table(
            FilterPCRerror.__output_table_filter_pcr_error)

        ############################################################################################
        #
        # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
        # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
        # 3. Get nijk_df input
        #
        ############################################################################################

        sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)

        sample_info_tsv_obj.delete_from_db(
            engine=engine,
            variant_read_count_like_model=output_filter_pcr_error_model)

        variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
            variant_read_count_like_model=input_filter_min_replicate_model,
            engine=engine,
            filter_id=None)

        ############################################################################################
        #
        # Run per sample_id
        #
        ############################################################################################

        variant_df = sample_info_tsv_obj.get_variant_df(
            variant_read_count_like_model=input_filter_min_replicate_model,
            engine=engine)

        record_list = []

        run_marker_sample_df = variant_read_count_df[[
            'run_id', 'marker_id', 'sample_id'
        ]].drop_duplicates()
        for row in run_marker_sample_df.itertuples():
            run_id = row.run_id
            marker_id = row.marker_id
            sample_id = row.sample_id

            # Get variant read for the current run-marker-sample
            variant_read_count_per_sample_df = variant_read_count_df.loc[
                (variant_read_count_df.run_id == run_id)
                & (variant_read_count_df.marker_id == marker_id) &
                (variant_read_count_df.sample_id == sample_id)]

            variant_per_sample_df = variant_df.loc[variant_df.index.isin(
                variant_read_count_per_sample_df.variant_id.unique().tolist())]
            this_step_tmp_per_sample_dir = os.path.join(
                this_temp_dir,
                "run_{}_marker_{}_sample{}".format(run_id, marker_id,
                                                   sample_id))
            pathlib.Path(this_step_tmp_per_sample_dir).mkdir(exist_ok=True)

            ########################################################################################
            #
            # Run vsearch and get the alignment dataframe
            #
            ########################################################################################

            filter_pcr_error_runner = RunnerFilterPCRerror(
                variant_expected_df=variant_per_sample_df,
                variant_unexpected_df=variant_per_sample_df,
                variant_read_count_df=variant_read_count_per_sample_df)
            filter_output_per_sample_df = filter_pcr_error_runner.get_variant_read_count_delete_df(
                pcr_error_var_prop)

            ########################################################################################
            #
            # Per sample add to record list
            #
            ########################################################################################

            record_per_sample_list = ModelVariantReadCountLike.filter_delete_df_to_dict(
                filter_output_per_sample_df)
            record_list = record_list + record_per_sample_list

        variant_read_count_delete_df = pandas.DataFrame.from_records(
            data=record_list)

        ############################################################################################
        #
        # 5. Write to DB
        # 6. Touch output tables, to update modification date
        # 7. Exit vtam if all variants are deleted
        #
        ############################################################################################

        DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
            engine=engine,
            variant_read_count_like_model=output_filter_pcr_error_model)

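        # "Touching" each output table below: rewriting the last row's id
        # with its own value is a no-op UPDATE whose intended effect (per
        # comment 6 above) is to bump the table's modification date so
        # downstream wrappers rerun.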
        for output_table_i in self.specify_output_table():
            declarative_meta_i = self.output_table(output_table_i)
            obj = session.query(declarative_meta_i).order_by(
                declarative_meta_i.id.desc()).first()
            session.query(declarative_meta_i).filter_by(id=obj.id).update(
                {'id': obj.id})
            session.commit()

        if variant_read_count_delete_df.filter_delete.sum(
        ) == variant_read_count_delete_df.shape[0]:
            Logger.instance().warning(
                VTAMexception("This filter has deleted all the variants: {}. "
                              "The analysis will stop here.".format(
                                  self.__class__.__name__)))
            sys.exit(0)
Example #13
    def main(fastainfo,
             fastadir,
             sorteddir,
             params=None,
             num_threads=multiprocessing.cpu_count()):

        if sys.platform.startswith('win'):
            num_threads = 1

        ############################################################################################
        #
        # params.yml parameters
        #
        ############################################################################################

        params_dic = FileParams(params).get_params_dic()

        cutadapt_error_rate = params_dic['cutadapt_error_rate']
        cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
        cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

        ############################################################################################
        #
        # Loop over tag and primer pairs to demultiplex and trim reads
        #
        ############################################################################################

        merged_fastainfo_df = FileSampleInformation(
            fastainfo).read_tsv_into_df()

        pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
        tempdir = PathManager.instance().get_tempdir()

        sorted_read_info_df = pandas.DataFrame()

        for i in range(0, merged_fastainfo_df.shape[0]):
            fasta_info_series = merged_fastainfo_df.iloc[i]

            tag_fwd = fasta_info_series.tagfwd
            tag_rev = fasta_info_series.tagrev
            primer_fwd = fasta_info_series.primerfwd
            primer_rev = fasta_info_series.primerrev
            in_fasta_basename = fasta_info_series.mergedfasta

            Logger.instance().debug(
                "Analysing FASTA file: {}".format(in_fasta_basename))

            fasta_info_df_i = fasta_info_series.to_frame().T
            in_raw_fasta_path = os.path.join(fastadir, in_fasta_basename)

            ########################################################################################
            #
            # Trim tags from the forward reads with cutadapt
            # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
            # --front 'tcgatcacgatgt;min_overlap=13...gctgtagatcgaca;min_overlap=14'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
            # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
            #
            ########################################################################################

            if generic_dna:  # Biopython <1.78
                tag_rev_rc = str(
                    Seq(tag_rev, generic_dna).reverse_complement())
            else:  # Biopython >=1.78
                tag_rev_rc = str(Seq(tag_rev).reverse_complement())
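            # NB: the generic_dna branches here and below rely on an import
            # guard along these lines (the exact form used in vtam is assumed):
            #     try:
            #         from Bio.Alphabet import generic_dna  # Biopython <1.78
            #     except ImportError:
            #         generic_dna = None  # Bio.Alphabet was removed in 1.78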

            out_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
                '.fasta', '_sorted_%03d.fasta' % i)
            out_fasta_path = os.path.join(tempdir, out_fasta_basename)

            cmd_cutadapt_tag_dic = {
                'tag_fwd': tag_fwd,
                'tag_fwd_len': len(tag_fwd),
                'tag_rev_rc': tag_rev_rc,
                'tag_rev_rc_len': len(tag_rev_rc),
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_fasta_path,
                'num_threads': num_threads,
            }

            cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
                '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

            Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_tag_str
            else:
                args = shlex.split(cmd_cutadapt_tag_str)
            run_result = subprocess.run(args=args,
                                        capture_output=True,
                                        check=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ########################################################################################
            #
            # Trim primers from output
            # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only
            # --front 'TCCACTAATCACAARGATATTGGTAC;min_overlap=26...GGAGGATTTGGWAATTGATTAGTW;min_overlap=24'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_trimmed_000.fasta
            # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
            #
            ########################################################################################

            if generic_dna:  # Biopython <1.78
                primer_rev_rc = str(
                    Seq(primer_rev, generic_dna).reverse_complement())
            else:  # Biopython >=1.78
                primer_rev_rc = str(Seq(primer_rev).reverse_complement())

            in_fasta_path = out_fasta_path
            out_fasta_basename = os.path.basename(in_fasta_path).replace(
                '_sorted_%03d.fasta' % i, '_sorted_trimmed_%03d.fasta' % i)
            out_fasta_path = os.path.join(tempdir, out_fasta_basename)

            cmd_cutadapt_primer_dic = {
                'primer_fwd': primer_fwd,
                'primer_fwd_len': len(primer_fwd),
                'primer_rev_rc': primer_rev_rc,
                'primer_rev_rc_len': len(primer_rev_rc),
                'in_fasta_path': in_fasta_path,
                'out_fasta': out_fasta_path,
                'error_rate': cutadapt_error_rate,
                'read_min_length': cutadapt_minimum_length,
                'read_max_length': cutadapt_maximum_length,
                'num_threads': num_threads,
            }

            cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                                      '--minimum-length {read_min_length} ' \
                                      '--maximum-length {read_max_length} --trimmed-only  ' \
                                      '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" '  \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

            Logger.instance().debug(
                "Running: {}".format(cmd_cutadapt_primer_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_primer_str
            else:
                args = shlex.split(cmd_cutadapt_primer_str)
            run_result = subprocess.run(args=args,
                                        capture_output=True,
                                        check=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ########################################################################################
            #
            # Trim tags from the reverse-complement reads with cutadapt
            # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
            # --front 'tgtcgatctacagc;min_overlap=14...acatcgtgatcga;min_overlap=13'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
            # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
            #
            ########################################################################################

            if generic_dna:  # Biopython <1.78
                tag_fwd_rc = str(
                    Seq(tag_fwd, generic_dna).reverse_complement())
            else:  # Biopython >=1.78
                tag_fwd_rc = str(Seq(tag_fwd).reverse_complement())

            out_rc_fasta_basename = os.path.basename(
                in_raw_fasta_path).replace('.fasta',
                                           '_rc_sorted_%03d.fasta' % i)
            out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

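            # The template keys are reused for the reverse-complement pass:
            # the 'fwd' slots now hold the reverse-tag values and vice versa.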
            cmd_cutadapt_tag_dic = {
                'tag_fwd': tag_rev,
                'tag_fwd_len': len(tag_rev),
                'tag_rev_rc': tag_fwd_rc,
                'tag_rev_rc_len': len(tag_fwd_rc),
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_rc_fasta_path,
                'num_threads': num_threads,
            }

            cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
                '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

            Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_tag_str
            else:
                args = shlex.split(cmd_cutadapt_tag_str)
            run_result = subprocess.run(args=args,
                                        capture_output=True,
                                        check=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ###################################################################
            #
            # Trim primers from output
            # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only
            # --front 'WACTAATCAATTWCCAAATCCTCC;min_overlap=24...GTACCAATATCYTTGTGATTAGTGGA;min_overlap=26'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_trimmed_000.fasta
            # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
            #
            ###################################################################

            if generic_dna:  # Biopython <1.78
                primer_fwd_rc = str(
                    Seq(primer_fwd, generic_dna).reverse_complement())
            else:  # Biopython >=1.78
                primer_fwd_rc = str(Seq(primer_fwd).reverse_complement())

            in_fasta_path = out_rc_fasta_path
            out_rc_fasta_basename = os.path.basename(in_fasta_path).replace(
                '_rc_sorted_%03d.fasta' % i,
                '_rc_sorted_trimmed_%03d.fasta' % i)
            out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

            cmd_cutadapt_primer_dic = {
                'primer_fwd': primer_rev,
                'primer_fwd_len': len(primer_rev),
                'primer_rev_rc': primer_fwd_rc,
                'primer_rev_rc_len': len(primer_fwd_rc),
                'in_fasta_path': in_fasta_path,
                'out_fasta': out_rc_fasta_path,
                'error_rate': cutadapt_error_rate,
                'read_min_length': cutadapt_minimum_length,
                'read_max_length': cutadapt_maximum_length,
                'num_threads': num_threads,
            }
            cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                '--minimum-length {read_min_length} ' \
                '--maximum-length {read_max_length} --trimmed-only  ' \
                '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

            Logger.instance().debug(
                "Running: {}".format(cmd_cutadapt_primer_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_primer_str
            else:
                args = shlex.split(cmd_cutadapt_primer_str)
            run_result = subprocess.run(args=args,
                                        capture_output=True,
                                        check=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ###################################################################
            #
            # Reverse complement back rc fasta and pool
            #
            ###################################################################

            out_final_fasta_basename = os.path.basename(
                in_raw_fasta_path).replace('.fasta', '_%03d.fasta' % i)
            out_final_fasta_path = os.path.join(sorteddir,
                                                out_final_fasta_basename)
            shutil.copy(out_fasta_path, out_final_fasta_path)

            Logger.instance().debug("Pooling fwd and rc reads...")
            with open(out_final_fasta_path, 'a') as fout:
                with open(out_rc_fasta_path, 'r') as fin:
                    for line in fin:
                        if not line.startswith('>'):

                            if generic_dna:  # Biopython <1.78
                                fout.write("%s\n" % str(
                                    Seq(line.strip(),
                                        generic_dna).reverse_complement()))
                            else:  # Biopython >=1.78
                                fout.write("%s\n" % str(
                                    Seq(line.strip()).reverse_complement()))

                        else:
                            fout.write(line)

            fasta_info_df_i = fasta_info_df_i[[
                'run', 'marker', 'sample', 'replicate'
            ]]
            fasta_info_df_i['sortedfasta'] = out_final_fasta_basename
            sorted_read_info_df = pandas.concat(
                [sorted_read_info_df, fasta_info_df_i], axis=0)

        fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
        sorted_read_info_df.to_csv(fasta_trimmed_info_tsv,
                                   sep="\t",
                                   header=True,
                                   index=False)
Example #14
    def setUp(self):

        self.tempdir = PathManager.instance().get_tempdir()
        pathlib.Path(self.tempdir).mkdir(parents=True, exist_ok=True)
Example #15
    def __init__(self, sequence_list, taxonomy, blast_db_dir, blast_db_name,
                 num_threads, params):
        """

        Parameters
        ----------
        sequence_list : list
            List of se
        param2 : str
            The second parameter.

        """

        self.old_tax_id_df = taxonomy.old_tax_df
        self.taxonomy_df = taxonomy.df
        self.blast_db_dir = blast_db_dir
        self.this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                          os.path.basename(__file__))
        pathlib.Path(self.this_temp_dir).mkdir(exist_ok=True)

        self.num_threads = num_threads

        #######################################################################
        #
        # Parameters
        #
        #######################################################################

        params_dic = FileParams(params).get_params_dic()
        qcov_hsp_perc = params_dic['qcov_hsp_perc']

        #######################################################################
        #
        # 2 Create FASTA file with Variants
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Create SortedReadFile from Variants".format(
                __file__,
                inspect.currentframe().f_lineno))
        variant_fasta = os.path.join(self.this_temp_dir, 'variant.fasta')
        with open(variant_fasta, 'w') as fout:
            for seq in sequence_list:
                fout.write(">{}\n{}\n".format(seq, seq))
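                # The FASTA header is the sequence itself, presumably so that
                # BLAST hit query ids map straight back to variant sequences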

        #######################################################################
        #
        # 3 Run local blast
        #
        #######################################################################

        runner_blast = RunnerBlast(variant_fasta, blast_db_dir, blast_db_name,
                                   num_threads, qcov_hsp_perc)
        # run blast
        blast_output_tsv = runner_blast.run_local_blast()
        # process blast results
        blast_output_df = RunnerBlast.process_blast_result(blast_output_tsv)

        #######################################################################
        #
        # Compute tax lineages for Blast target tax ids
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Open taxonomy.tsv DB".format(
                __file__,
                inspect.currentframe().f_lineno))
        blast_output_df.target_tax_id = pandas.to_numeric(
            blast_output_df.target_tax_id)
        #
        Logger.instance().debug(
            "file: {}; line: {}; Annotate each target_tax_id with its lineage as columns in wide format"
            .format(__file__,
                    inspect.currentframe().f_lineno))
        tax_id_list = blast_output_df.target_tax_id.unique().tolist()
        tax_id_to_lineage_df = taxonomy.get_several_tax_id_lineages(
            tax_id_list)

        #######################################################################
        #
        # Merge tax lineages and the blast result
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Merge blast result including tax_id with their lineages"
            .format(__file__,
                    inspect.currentframe().f_lineno))
        # Merge local blast output with tax_id_to_lineage_df
        # variant_identity_lineage_df = blast_output_df.merge(
        #     tax_id_to_lineage_df, left_on='target_tax_id', right_on='tax_id')
        variantid_identity_lineage_df = blast_output_df.merge(
            tax_id_to_lineage_df, left_on='target_tax_id', right_index=True)
        # variant_identity_lineage_df.drop('tax_id', axis=1, inplace=True)
        """(Pdb) variant_identity_lineage_df.columns  
Index(['variant_id', 'target_id', 'identity', 'evalue', 'coverage',
       'target_tax_id', 'no rank', 'species', 'genus', 'family', 'order',
       'class', 'subphylum', 'phylum', 'subkingdom', 'kingdom', 'superkingdom',
       'superfamily', 'infraorder', 'suborder', 'infraclass', 'subclass',
       'tribe', 'subfamily', 'cohort', 'subgenus', 'subspecies', 'parvorder',
       'superorder', 'subcohort', 'superclass', 'species group', 'subtribe',
       'section', 'varietas', 'species subgroup'],
      dtype='object')"""

        #######################################################################
        #
        #  several_variants_to_ltg
        # this function returns a data frame containing the Ltg rank and Ltg Tax_id for each variant
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Main loop over variant and identity to "
            "compute the whole set of ltg_tax_id and ltg_rank for each variant_id "
            "to a dataframe".format(__file__,
                                    inspect.currentframe().f_lineno))
        runner_ltg_selection = RunnerLTGselection(
            variant_identity_lineage_df=variantid_identity_lineage_df,
            taxonomy_df=self.taxonomy_df,
            params=params)
        self.ltg_df = runner_ltg_selection.several_variants_to_ltg()
Example #16
File: __init__.py Project: ulysse06/vtam
    def __init__(self, sys_argv):

        ############################################################################################
        #
        # Parse arguments
        #
        ############################################################################################

        self.sys_argv = sys_argv
        # Do not use abspath for the moment; it could later be exposed as an
        # option
        parser = ArgParser.get_main_arg_parser()
        self.args = parser.parse_args(sys_argv)

        arg_parser_dic = vars(self.args)

        ############################################################################################
        #
        # If non-specified, initiate params.yml
        #
        ############################################################################################

        if 'params' in arg_parser_dic and arg_parser_dic['params'] is None:
            params_yml = os.path.join(PathManager.instance().get_configdir(), "params.yml")
            if not os.path.isfile(params_yml):
                pathlib.Path(params_yml).touch(exist_ok=False)
            arg_parser_dic['params'] = params_yml

        ############################################################################################
        #
        # Parse log arguments
        #
        ############################################################################################

        if 'log_verbosity' in arg_parser_dic:
            (LoggerArguments.instance()).update({'log_verbosity': arg_parser_dic['log_verbosity']})
            os.environ['VTAM_LOG_VERBOSITY'] = str(
                arg_parser_dic['log_verbosity'])

        if 'log' in arg_parser_dic:
            (LoggerArguments.instance()).update({'log': arg_parser_dic['log']})
            os.environ['VTAM_LOG_FILE'] = str(arg_parser_dic['log'])

        #######################################################################
        #
        # Set arguments, logger
        #
        #######################################################################

        # Some arguments will be passed through environment variables
        if 'threads' in arg_parser_dic:
            os.environ['VTAM_THREADS'] = str(arg_parser_dic['threads'])

        ############################################################################################
        #
        # Subcommands: wopfile-dependent, filter, optimize
        #
        ############################################################################################

        if arg_parser_dic['command'] in ['filter', 'optimize']:

            if arg_parser_dic['command'] in ['filter']:

                ####################################################################################
                #
                # Verify coherence of --lfn_variant_replicate and params arguments
                #
                ####################################################################################

                with open(arg_parser_dic['params']) as fin:
                    # The SafeLoader parameter handles the conversion from
                    # YAML scalar values to the Python dictionary format
                    params_dic = yaml.load(fin, Loader=yaml.SafeLoader) or {}

                    if arg_parser_dic['lfn_variant_replicate']:
                        if 'lfn_variant_cutoff' in params_dic:
                            Logger.instance().error(VTAMexception(
                                'The parameter "lfn_variant_cutoff" in the parameter file "{}" is incompatible with'
                                ' the --lfn_variant_replicate argument.'.format(arg_parser_dic['params'])))
                            sys.exit(1)

                    else:
                        if 'lfn_variant_replicate_cutoff' in params_dic:
                            Logger.instance().error(VTAMexception(
                                'The parameter "lfn_variant_replicate_cutoff" in the parameter file "{}" needs'
                                ' the --lfn_variant_replicate argument.'.format(arg_parser_dic['params'])))
                            sys.exit(1)

                ####################################################################################
                #
                # Verify coherence of --lfn_variant_replicate and cutoff_specific argument
                #
                ####################################################################################

                if not (arg_parser_dic['cutoff_specific'] is None):  # cutoff specific argument

                    if arg_parser_dic['lfn_variant_replicate']:  # lfn_variant_replicate

                        # cutoff_specific for lfn_variant
                        if not FileCutoffSpecific(arg_parser_dic['cutoff_specific']).is_compatible_lfn_variant_replicate():
                            Logger.instance().error('The --lfn_variant_replicate argument is incompatible with the cutoff_specific file {}.'.format(
                                    arg_parser_dic['cutoff_specific']))
                            sys.exit(1)

                    else: # lfn_variant

                        # cutoff_specific for lfn_variant_replicate
                        if FileCutoffSpecific(arg_parser_dic['cutoff_specific']).is_compatible_lfn_variant_replicate():

                            Logger.instance().error('The cutoff_specific file {} requires the --lfn_variant_replicate argument.'.format(
                                    arg_parser_dic['cutoff_specific']))
                            sys.exit(1)
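
                # Coherence rules enforced above:
                #   --lfn_variant_replicate given -> params must not set
                #     lfn_variant_cutoff, and cutoff_specific must be
                #     replicate-compatible;
                #   flag absent -> params must not set
                #     lfn_variant_replicate_cutoff, and cutoff_specific must
                #     not be replicate-specific.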

                ############################################################################################
                #
                # If non-specified, initiate cutoff specific
                #
                ############################################################################################

                if arg_parser_dic['cutoff_specific'] is None:
                    cutoff_specific_tsv = os.path.join(PathManager.instance().get_configdir(),
                                                       "cutoff_specific.tsv")
                    if not os.path.isfile(cutoff_specific_tsv):
                        pathlib.Path(cutoff_specific_tsv).touch(exist_ok=False)
                    arg_parser_dic['cutoff_specific'] = cutoff_specific_tsv

            CommandFilterOptimize.main(arg_parser_dic=arg_parser_dic)

        ############################################################################################
        #
        # Subcommand: example
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'example':
            outdir = arg_parser_dic['outdir']
            CommandExample.main(outdir=outdir)

        ############################################################################################
        #
        # Subcommand: merge
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'merge':
            fastqinfo = arg_parser_dic['fastqinfo']
            fastqdir = arg_parser_dic['fastqdir']
            fastainfo = arg_parser_dic['fastainfo']
            fastadir = arg_parser_dic['fastadir']
            num_threads = arg_parser_dic['threads']
            params = arg_parser_dic['params']
            CommandMerge.main(fastqinfo=fastqinfo, fastqdir=fastqdir, fastainfo=fastainfo,
                              fastadir=fastadir, params=params, num_threads=num_threads)

        ############################################################################################
        #
        # Subcommand: sortreads
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'sortreads':
            fastadir = arg_parser_dic['fastadir']
            fastainfo = arg_parser_dic['fastainfo']
            sorteddir = arg_parser_dic['sorteddir']
            num_threads = arg_parser_dic['threads']
            params = arg_parser_dic['params']
            CommandSortReads.main(fastainfo=fastainfo, fastadir=fastadir, params=params,
                                  num_threads=num_threads, sorteddir=sorteddir)

        ############################################################################################
        #
        # Subcommand: taxassign
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'taxassign':
            db = arg_parser_dic['db']
            asvtable_tsv = arg_parser_dic['asvtable']
            output = arg_parser_dic['output']
            mode = arg_parser_dic['mode']
            taxonomy_tsv = arg_parser_dic['taxonomy']
            blastdb_dir_path = arg_parser_dic['blastdbdir']
            blastdbname_str = arg_parser_dic['blastdbname']
            num_threads = arg_parser_dic['threads']
            params = arg_parser_dic['params']
            CommandTaxAssign.main(db=db, mode=mode, asvtable_tsv=asvtable_tsv, output=output,
                                  taxonomy_tsv=taxonomy_tsv, blastdb_dir_path=blastdb_dir_path,
                                  blastdbname_str=blastdbname_str, params=params, num_threads=num_threads)

        ############################################################################################
        #
        # Subcommand: pool
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'pool':
            db = arg_parser_dic['db']
            readcounts = arg_parser_dic['readcounts']
            run_marker_tsv = arg_parser_dic['runmarker']
            pooled_marker_tsv = arg_parser_dic['asvtable']
            params = arg_parser_dic['params']
            CommandPoolRunMarkers.main(db=db, pooled_marker_tsv=pooled_marker_tsv,
                run_marker_tsv=run_marker_tsv, params=params, readcounts=readcounts)

        ############################################################################################
        #
        # Subcommand: taxonomy
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'taxonomy':
            taxonomy_tsv = arg_parser_dic['output']
            precomputed = arg_parser_dic['precomputed']
            taxonomy = CommandTaxonomy(taxonomy_tsv=taxonomy_tsv)
            taxonomy.main(precomputed=precomputed)

        ############################################################################################
        #
        # Subcommand: coi blast
        #
        ############################################################################################

        elif arg_parser_dic['command'] == 'coi_blast_db':
            blastdbdir = arg_parser_dic['blastdbdir']
            blastdbname = arg_parser_dic['blastdbname']
            coi_blast_db = CommandBlastCOI(blastdbname=blastdbname)
            coi_blast_db.download(blastdbdir=blastdbdir)

        ############################################################################################
        #
        # Else: print usage/help message
        #
        ############################################################################################

        else:
            self.args = parser.parse_args(['--help'])  # if command unknown print help
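
The if/elif chain above dispatches each subcommand by hand. For comparison, a minimal, hypothetical sketch of the same dispatch using argparse sub-parsers with set_defaults(func=...); the handler names and flags here are illustrative, not VTAM's actual API:

import argparse

def handle_taxassign(args):
    # Hypothetical handler; VTAM would call CommandTaxAssign.main() here
    print("taxassign on", args.asvtable)

def handle_pool(args):
    # Hypothetical handler; VTAM would call CommandPoolRunMarkers.main() here
    print("pool from", args.runmarker)

parser = argparse.ArgumentParser(prog="vtam-like")
subparsers = parser.add_subparsers(dest="command")

parser_tax = subparsers.add_parser("taxassign")
parser_tax.add_argument("--asvtable", required=True)
parser_tax.set_defaults(func=handle_taxassign)

parser_pool = subparsers.add_parser("pool")
parser_pool.add_argument("--runmarker", required=True)
parser_pool.set_defaults(func=handle_pool)

args = parser.parse_args(["taxassign", "--asvtable", "asvtable.tsv"])
args.func(args)  # dispatches without an if/elif chain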
Example #17
    def get_variant_read_count_delete_df(self, variant_df,
                                         uchime3_denovo_abskew):

        temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                os.path.basename(__file__))
        pathlib.Path(temp_dir).mkdir(exist_ok=True)

        filter_output_chimera_df = self.variant_read_count_df.copy()
        filter_output_chimera_df['filter_delete'] = False
        #
        filter_output_borderline_df = self.variant_read_count_df.copy()
        filter_output_borderline_df['filter_delete'] = False

        run_marker_sample_df = self.variant_read_count_df[[
            'run_id', 'marker_id', 'sample_id'
        ]].drop_duplicates(inplace=False)
        for row in run_marker_sample_df.itertuples():
            run_id = row.run_id
            marker_id = row.marker_id
            sample_id = row.sample_id

            variant_read_count_df = self.variant_read_count_df.loc[
                (self.variant_read_count_df.run_id == run_id)
                & (self.variant_read_count_df.marker_id == marker_id) &
                (self.variant_read_count_df.sample_id == sample_id)]

            variant_read_count_df_obj = DataframeVariantReadCountLike(
                variant_read_count_df=variant_read_count_df)
            N_i_df = variant_read_count_df_obj.get_N_i_df()
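            # N_i (per-variant read count within this run/marker/sample,
            # presumably summed over replicates) becomes the uchime 'size'
            # abundance annotation written to the fasta below.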

            variant_size_df = variant_df.merge(N_i_df,
                                               left_index=True,
                                               right_on='variant_id')
            variant_size_df = variant_size_df[[
                'variant_id', 'sequence', 'N_i'
            ]]
            variant_size_df.rename(columns={'N_i': 'size'}, inplace=True)
            variant_size_df.set_index('variant_id', inplace=True)

            ###################################################################
            #
            # Sort variants by abundance and write to fasta_path
            #
            ###################################################################

            variant_size_df.sort_values(by='size',
                                        ascending=False,
                                        inplace=True)

            variant_df_utils_obj = DataframeVariant(variant_size_df)

            uchime_fasta_path = os.path.join(
                temp_dir, 'run_{}_marker_{}_sample_{}.fasta'.format(
                    run_id, marker_id, sample_id))
            variant_df_utils_obj.to_fasta(fasta_path=uchime_fasta_path,
                                          add_column="size")

            ###################################################################
            #
            # Run uchime_denovo
            #
            ###################################################################

            uchime_borderline_fasta_path = os.path.join(
                temp_dir, 'run_{}_marker_{}_sample_{}_borderline.fasta'.format(
                    run_id, marker_id, sample_id))
            uchime_nonchimeras_fasta_path = os.path.join(
                temp_dir,
                'run_{}_marker_{}_sample_{}_nonchimeras.fasta'.format(
                    run_id, marker_id, sample_id))
            uchime_chimeras_fasta_path = os.path.join(
                temp_dir, 'run_{}_marker_{}_sample_{}_chimeras.fasta'.format(
                    run_id, marker_id, sample_id))

            #
            # Create RunnerVSearch object and run vsearch uchime3_denovo
            vsearch_parameters = {
                'uchime3_denovo': uchime_fasta_path,
                'borderline': uchime_borderline_fasta_path,
                'nonchimeras': uchime_nonchimeras_fasta_path,
                'chimeras': uchime_chimeras_fasta_path,
                'abskew': uchime3_denovo_abskew,
            }
            vsearch_cluster = RunnerVSearch(parameters=vsearch_parameters)
            vsearch_cluster.run()

            ###################################################################
            #
            # Delete variants from a replicate/sample if flagged as chimeras
            #
            ###################################################################

            Logger.instance().debug(
                "Vsearch uchime chimera fasta path: {}".format(
                    uchime_chimeras_fasta_path))
            with open(uchime_chimeras_fasta_path, "r") as handle:
                for chimera_seqrecord in SeqIO.parse(handle, "fasta"):
                    variant_id = int(chimera_seqrecord.id.split(';')[0])
                    filter_output_chimera_df.loc[
                        (filter_output_chimera_df['run_id'] == run_id)
                        & (filter_output_chimera_df['marker_id'] == marker_id)
                        & (filter_output_chimera_df['sample_id'] == sample_id)
                        &
                        (filter_output_chimera_df['variant_id'] == variant_id),
                        'filter_delete'] = True

            Logger.instance().debug(
                "Vsearch uchime chimera borderline fasta path: {}".format(
                    uchime_borderline_fasta_path))
            with open(uchime_borderline_fasta_path, "r") as handle:
                for chimera_seqrecord in SeqIO.parse(handle, "fasta"):
                    variant_id = int(chimera_seqrecord.id.split(';')[0])
                    filter_output_borderline_df.loc[
                        (filter_output_borderline_df['run_id'] == run_id)
                        &
                        (filter_output_borderline_df['marker_id'] == marker_id)
                        &
                        (filter_output_borderline_df['sample_id'] == sample_id)
                        & (filter_output_borderline_df['variant_id'] ==
                           variant_id), 'filter_delete'] = True

        return filter_output_chimera_df, filter_output_borderline_df
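
The two parsers above read variant ids out of vsearch's ";size=N" FASTA abundance annotations. A minimal, self-contained sketch of that header convention (the records are made up for illustration):

import io
from Bio import SeqIO

fasta_text = """>836;size=123
TCTATATTTCATTTTTGGTGCT
>8368;size=500
TGCTTGGGCAGGTATGGTAGGG
"""

for record in SeqIO.parse(io.StringIO(fasta_text), "fasta"):
    variant_id = int(record.id.split(';')[0])          # id before the first ';'
    size = int(record.id.split(';')[1].split('=')[1])  # vsearch abundance
    print(variant_id, size, len(record.seq))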
Example #18
    def main(cls,
             db,
             mode,
             asvtable_tsv,
             output,
             taxonomy_tsv,
             blastdb_dir_path,
             blastdbname_str,
             num_threads=multiprocessing.cpu_count(),
             params=None):
        """

        Parameters
        ----------
        db: str
            Path to SQLITE database with Variant and Taxassign tables
        mode
        asvtable_tsv
        output
        taxonomy_tsv
        blastdb_dir_path
        blastdbname_str
        num_threads
        params

        Returns
        -------

        """

        this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                     os.path.basename(__file__))
        pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

        #######################################################################
        #
        # Parameters
        #
        #######################################################################

        # params_dic = constants.get_params_default_dic()
        # params_dic = FileParams(params).get_params_dic()

        # ltg_rule_threshold = params_dic['ltg_rule_threshold']
        # include_prop = params_dic['include_prop']
        # min_number_of_taxa = params_dic['min_number_of_taxa']
        # qcov_hsp_perc = params_dic['qcov_hsp_perc']

        #######################################################################
        #
        # Load db and tables as classes and delete taxassign in reset mode
        #
        #######################################################################

        engine = sqlalchemy.create_engine('sqlite:///{}'.format(db),
                                          echo=False)

        variant_declarative_table = Variant.__table__
        variant_declarative_table.create(bind=engine, checkfirst=True)
        tax_assign_declarative_table = TaxAssign.__table__
        tax_assign_declarative_table.create(bind=engine, checkfirst=True)

        if mode == 'reset':
            with engine.connect() as conn:
                conn.execute(tax_assign_declarative_table.delete())

        #######################################################################
        #
        # Use variants that are not already assigned in TaxAssign
        #
        #######################################################################

        variant_input_df = pandas.read_csv(asvtable_tsv, sep="\t", header=0)
        # get list of variant sequences
        variant_sequence_list = variant_input_df.sequence.tolist()

        # Add variant to DB if not already there
        for variant_sequence in variant_sequence_list:
            with engine.connect() as conn:
                row_variant = conn.execute(
                    sqlalchemy.select([
                        variant_declarative_table.c.id
                    ]).where(variant_declarative_table.c.sequence ==
                             variant_sequence)).first()
                if row_variant is None:  # variant_sequence IS NOT in the database, so INSERT it
                    conn.execute(variant_declarative_table.insert().values(
                        sequence=variant_sequence))

        #######################################################################
        #
        # Get already tax-assigned variants with all information, including sequence
        #
        #######################################################################

        stmt_variant_tax_assign = sqlalchemy.select([
            tax_assign_declarative_table.c.variant_id,
            tax_assign_declarative_table.c.identity,
            tax_assign_declarative_table.c.ltg_rank,
            tax_assign_declarative_table.c.ltg_tax_id,
            tax_assign_declarative_table.c.ltg_tax_name,
            tax_assign_declarative_table.c.blast_db,
            variant_declarative_table.c.sequence,
        ])\
            .where(tax_assign_declarative_table.c.ltg_tax_id.isnot(None))\
            .where(tax_assign_declarative_table.c.variant_id == variant_declarative_table.c.id)\
            .where(variant_declarative_table.c.sequence.in_(variant_sequence_list))\
            .distinct()

        # These are the variants that are already in taxassign and do not
        # need to be recalculated
        ltg_from_db_list = []
        with engine.connect() as conn:
            for row in conn.execute(stmt_variant_tax_assign).fetchall():
                ltg_from_db_list.append(dict(zip(row.keys(), row.values())))
        """(Pdb) pandas.DataFrame.from_records(ltg_from_db_list)
   identity ltg_rank  ltg_tax_id              ltg_tax_name                                           sequence  variant_id
0       100  species     2028017  Orthocladiinae sp. BAP34  AGCATGATCTGGAATAGTAGGTACTTCCCTTAGTATCTTAATTCGA...         325
1        99  species     2028029   Rheocricotopus sp. DH90  GGCTTGATCCGGAATAGTAGGAACTTCTTTAAGAATTCTAATTCGA...        1203
2       100  species     1592914            Caenis pusilla  GGCTTGATCCGGAATGCTGGGCACCTCTCTAAGCCTTCTAATTCGT...        1443
3       100  species     2028029   Rheocricotopus sp. DH90  TGCTTGATCAGGAATAGTAGGAACTTCTTTAAGAATTCTAATTCGA...        2298
4        90   family        7149              Chironomidae  TGCTTGATCAGGGATAGTGGGAACTTCTTTAAGAATTCTTATTCGA...        2498
5       100  species      189839            Baetis rhodani  TGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGA...        2610"""
        ltg_db_df = pandas.DataFrame.from_records(ltg_from_db_list)
        ltg_db_df = ltg_db_df.reindex(sorted(ltg_db_df.columns),
                                      axis=1)  # sort columns

        #######################################################################
        #
        # Get list of variants (id and sequence) that need BLAST for taxonomic assignment
        #
        #######################################################################

        stmt_variant = sqlalchemy.select([variant_declarative_table.c.id, variant_declarative_table.c.sequence]) \
            .where(variant_declarative_table.c.sequence.in_(variant_sequence_list))

        if ltg_db_df.shape[0] > 0:
            stmt_variant = stmt_variant.where(
                variant_declarative_table.c.id.notin_(
                    ltg_db_df.variant_id.tolist()))
        stmt_variant = stmt_variant.distinct().order_by("id")

        variant_not_tax_assigned = []
        with engine.connect() as conn:
            for row in conn.execute(stmt_variant).fetchall():
                variant_not_tax_assigned.append(
                    dict(zip(row.keys(), row.values())))

        #######################################################################
        #
        # Run RunnerTaxAssign for variant_not_tax_assigned
        #
        #######################################################################

        blast_variant_df = pandas.DataFrame()
        ltg_blast_df = pandas.DataFrame()

        if len(variant_not_tax_assigned) > 0:  # run BLAST for variants that need assignment

            blast_variant_df = pandas.DataFrame.from_records(
                variant_not_tax_assigned, index='id')
            taxonomy = Taxonomy(tsv=taxonomy_tsv)
            sequence_list = blast_variant_df.sequence.tolist()
            tax_assign_runner = RunnerTaxAssign(sequence_list=sequence_list,
                                                taxonomy=taxonomy,
                                                blast_db_dir=blastdb_dir_path,
                                                blast_db_name=blastdbname_str,
                                                num_threads=num_threads,
                                                params=None)
            ltg_blast_df = tax_assign_runner.ltg_df

            ######################################################
            # Uncomment to debug because blast is slow
            # pandas.to_pickle(ltg_df, "ltg_df.pkl")
            # ltg_df = pandas.read_pickle("ltg_df.pkl")
            # import pdb; pdb.set_trace()
            ######################################################

            ltg_blast_df.rename({'variant_id': 'sequence'},
                                inplace=True,
                                axis=1)

            ltg_blast_df = blast_variant_df.merge(ltg_blast_df,
                                                  on='sequence',
                                                  how='outer')

            ltg_blast_df['blast_db'] = blastdbname_str

            ltg_blast_df = ltg_blast_df.reindex(sorted(ltg_blast_df.columns),
                                                axis=1)  # sort columns
        del blast_variant_df

        #######################################################################
        #
        # Concatenate tax-assigned variants from DB and from Blast
        # Merge variant_df and ltg_df and write to DB
        #
        #######################################################################

        if ltg_db_df.shape[0] > 0 and ltg_blast_df.shape[0] > 0:
            ltg_df = pandas.concat([
                ltg_db_df[[
                    "blast_db", "identity", "ltg_rank", "ltg_tax_id",
                    "ltg_tax_name", "sequence"
                ]], ltg_blast_df
            ],
                                   axis=0)
        elif ltg_db_df.shape[0] > 0:
            ltg_df = ltg_db_df.copy()
        elif ltg_blast_df.shape[0] > 0:
            ltg_df = ltg_blast_df.copy()
        else:  # neither source has rows: keep ltg_df defined for the loop below
            ltg_df = pandas.DataFrame()
        del ltg_blast_df

        #######################################################################
        #
        # Insert or update variant and taxassign tables
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Insert variant_id, ltg_tax_id, ltg_rank to DB"
            .format(__file__,
                    inspect.currentframe().f_lineno))

        for ltg_row in ltg_df.itertuples():
            variant_sequence = ltg_row.sequence
            with engine.connect() as conn:
                variant_id = conn.execute(
                    sqlalchemy.select([
                        variant_declarative_table.c.id
                    ]).where(variant_declarative_table.c.sequence ==
                             variant_sequence)).first()[0]
                select_row = conn.execute(
                    sqlalchemy.select([
                        TaxAssign
                    ]).where(tax_assign_declarative_table.c.variant_id ==
                             variant_id)).first()
                if select_row is None:  # variant_id IS NOT in the database, so INSERT it
                    ltg_row_dic = ltg_row._asdict()
                    ltg_row_dic['variant_id'] = variant_id
                    conn.execute(tax_assign_declarative_table.insert(),
                                 dict(ltg_row_dic))
                else:  # variant_id IS in the database, so UPDATE its row
                    update_values = {k: v for k, v in ltg_row._asdict().items()
                                     if k in tax_assign_declarative_table.c.keys()}
                    conn.execute(tax_assign_declarative_table.update().where(
                        tax_assign_declarative_table.c.variant_id ==
                        variant_id).values(update_values))

        #######################################################################
        #
        # Update LTGs for variant output file
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Update LTGs for variant output file".format(
                __file__,
                inspect.currentframe().f_lineno))

        variant_output_df = variant_input_df.copy()
        del variant_input_df
        # Add ltg columns to variant_output_df if they do not exist
        for ltg_df_col in [
                'ltg_tax_id', 'ltg_tax_name', 'ltg_rank', 'identity',
                'blast_db'
        ]:
            if ltg_df_col not in variant_output_df.columns:
                variant_output_df[ltg_df_col] = None
        # Move sequence column to end
        variant_df_columns = variant_output_df.columns.tolist()
        variant_df_columns.append(
            variant_df_columns.pop(variant_df_columns.index('sequence')))
        variant_output_df = variant_output_df[variant_df_columns]

        for variant_row in variant_output_df.itertuples():
            # variant_id = variant_row.variant_id
            variant_sequence = variant_row.sequence
            with engine.connect() as conn:
                variant_id = conn.execute(
                    sqlalchemy.select([
                        variant_declarative_table.c.id
                    ]).where(variant_declarative_table.c.sequence ==
                             variant_sequence)).first()[0]
                select_row = conn.execute(
                    sqlalchemy.select([
                        TaxAssign.ltg_tax_id,
                        TaxAssign.ltg_tax_name,
                        TaxAssign.ltg_rank,
                        TaxAssign.identity,
                        TaxAssign.blast_db,
                    ]).where(tax_assign_declarative_table.c.variant_id ==
                             variant_id)).first()
            tax_assign_dict = dict(
                zip([
                    'ltg_tax_id', 'ltg_tax_name', 'ltg_rank', 'identity',
                    'blast_db'
                ], select_row))
            for k in tax_assign_dict:
                variant_output_df.loc[variant_output_df.sequence ==
                                      variant_sequence, k] = tax_assign_dict[k]
        # Do not move: required because ltg_tax_id is sometimes None
        variant_output_df = variant_output_df.astype({'ltg_tax_id': 'object'})

        #######################################################################
        #
        # Update tax lineages for variant output file
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Update tax lineages for variant output file".
            format(__file__,
                   inspect.currentframe().f_lineno))

        tax_id_list = variant_output_df.ltg_tax_id.unique().tolist()  # unique list of tax ids
        tax_lineage = TaxLineage(taxonomic_tsv_path=taxonomy_tsv)
        tax_lineage_df = tax_lineage.create_lineage_from_tax_id_list(
            tax_id_list=tax_id_list, tax_name=True)

        # Merge
        variant_output_df = variant_output_df.merge(tax_lineage_df,
                                                    left_on='ltg_tax_id',
                                                    right_on='tax_id',
                                                    how='left')
        variant_output_df.drop('tax_id', axis=1, inplace=True)

        Logger.instance().debug("file: {}; line: {}; Reorder columns".format(
            __file__,
            inspect.currentframe().f_lineno))
        # Move sequence column to end
        variant_df_columns = variant_output_df.columns.tolist()
        variant_df_columns.append(
            variant_df_columns.pop(variant_df_columns.index('sequence')))
        variant_output_df = variant_output_df[variant_df_columns]
        Logger.instance().debug("file: {}; line: {}; Write to TSV".format(
            __file__,
            inspect.currentframe().f_lineno))
        variant_output_df.to_csv(output, sep='\t', index=False, header=True)
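
The insert-if-absent loop and the assigned/unassigned split above both lean on the same SQLAlchemy Core idiom. A standalone sketch against an in-memory SQLite database (SQLAlchemy 1.4+ positional select() style; the table is reduced to what the idiom needs):

import sqlalchemy

metadata = sqlalchemy.MetaData()
variant = sqlalchemy.Table(
    'variant', metadata,
    sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
    sqlalchemy.Column('sequence', sqlalchemy.String, unique=True))

engine = sqlalchemy.create_engine('sqlite://')
metadata.create_all(engine)

sequences = ['ACGT', 'ACGT', 'TTGA']  # duplicate on purpose
with engine.begin() as conn:  # commits on exit
    for seq in sequences:
        row = conn.execute(sqlalchemy.select(variant.c.id)
                           .where(variant.c.sequence == seq)).first()
        if row is None:  # sequence not in the database yet, so INSERT it
            conn.execute(variant.insert().values(sequence=seq))

with engine.connect() as conn:
    print(conn.execute(sqlalchemy.select(variant)).fetchall())
    # [(1, 'ACGT'), (2, 'TTGA')] -- the duplicate was skipped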
Example #19
    def main(fastainfo, fastadir, sorteddir, params=None, num_threads=multiprocessing.cpu_count(), 
        no_reverse=False, tag_to_end=False, primer_to_end=False):
        
        Logger.instance().info(f"OPTIONS:\n no_reverse: {no_reverse}\n tag_to_end: {tag_to_end}\n primer_to_end: {primer_to_end}")

        if sys.platform.startswith('win'):
            num_threads = 1

        ############################################################################################
        #
        # params.yml parameters
        #
        ############################################################################################

        params_dic = FileParams(params).get_params_dic()

        cutadapt_error_rate = params_dic['cutadapt_error_rate']
        cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
        cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

        ############################################################################################
        #
        # Loop over tag and primer pairs to demultiplex and trim reads
        #
        ############################################################################################

        merged_fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df()
        
        pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
        tempdir = PathManager.instance().get_tempdir()

        merged_fasta_list = []
        results_list = []
        sample_info = {}

        # make sure every file is analysed once.
        for i in range(merged_fastainfo_df.shape[0]):
            if merged_fastainfo_df.iloc[i].mergedfasta not in merged_fasta_list:
                merged_fasta_list.append(merged_fastainfo_df.iloc[i].mergedfasta)
            
        for mergedfasta in merged_fasta_list:

            inputFiles = FilesInputCutadapt(fastainfo, mergedfasta, no_reverse, tag_to_end)
            
            tagFile_path = inputFiles.tags_file()
            info = inputFiles.get_df_info()

            for key in info.keys():
                if key in sample_info.keys():
                    sample_info[key] = sample_info[key] + info[key]
                else:
                    sample_info[key] = info[key]

            Logger.instance().debug("Analysing FASTA file: {}".format(mergedfasta))

            in_raw_fasta_path = os.path.join(fastadir, mergedfasta)

            ########################################################################################
            #
            #   cutadapt --cores=0 -e 0 --no-indels --trimmed-only -g file:$tagfile
            #   --overlap length -o "tagtrimmed.{name}.fasta" in_raw_fasta_path
            #
            ########################################################################################

            base = os.path.basename(in_raw_fasta_path)
            base, base_suffix = base.split('.', 1)
            
            out_fasta_path = os.path.join(tempdir, "sorted") 

            cmd_cutadapt_tag_dic = {
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_fasta_path,
                'num_threads': num_threads,
                'tagFile': tagFile_path,
                'base_suffix': base_suffix,
            }

            cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
                '-g file:{tagFile} --output {out_fasta}_{{name}}.{base_suffix} {in_fasta_path}' \
                .format(**cmd_cutadapt_tag_dic)
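            # -g file:FILE reads one 5' adapter (tag) per FASTA record and
            # demultiplexes: cutadapt expands {name} in --output to the name
            # of the matching record, writing one file per tag. --no-indels
            # with --error-rate 0 keeps tag matching exact; --trimmed-only
            # discards reads in which no tag was found.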

            Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_tag_str
            else:
                args = shlex.split(cmd_cutadapt_tag_str)
            run_result = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

            Logger.instance().info(run_result.stdout.decode())

            inputFiles.remove_tags_file()

            ########################################################################################
            #
            # Trim primers from output
            # cutadapt --quiet --cores=0 -e trim_error --no-indels --trimmed-only 
            # --minimum-length minimum_length --maximum-length maximum_length 
            # --output input_path + {name} + suffix outputfile
            #
            ########################################################################################
            
            primers = inputFiles.primers()
            try:
                tags_samples = inputFiles.get_sample_names()
            except Exception as e:
                Logger.instance().error(e)
                return 
            
            for primer in primers:
                
                marker, primerfwd, primerrev, lenprimerfwd, lenprimerrev = primer

                for tag_sample in tags_samples:

                    name, run, marker2, sample, replicate, _, _ = tag_sample
                    
                    if marker not in marker2:
                        continue

                    in_fasta_path = out_fasta_path + "_" + name + "." + base_suffix

                    baseMerge = mergedfasta.split(".")[0]
                                        
                    outname = run + "_" + marker + "_" + sample + "_" + replicate + "_" + baseMerge + "_trimmed"
                    if name.endswith("_reversed"):
                        outname = outname + "_reversed"
                    out_fasta_path_new = os.path.join(tempdir, outname + "." + base_suffix)

                    results_list.append(out_fasta_path_new)
                    
                    if not "_reversed" in name:
                        if generic_dna:  # Biopython <1.78
                            primerRev = str(Seq(primerrev, generic_dna).reverse_complement())
                        else:  # Biopython =>1.78
                            primerRev = str(Seq(primerrev).reverse_complement())
                        primerFwd = primerfwd
                        lenPrimerFwd = lenprimerfwd
                        lenPrimerRev = lenprimerrev
                    else:
                        if generic_dna:  # Biopython <1.78
                            primerRev = str(Seq(primerfwd, generic_dna).reverse_complement())
                        else:  # Biopython =>1.78
                            primerRev = str(Seq(primerfwd).reverse_complement())
                        primerFwd = primerrev
                        lenPrimerFwd = lenprimerrev
                        lenPrimerRev = lenprimerfwd


                    cmd_cutadapt_primer_dic = {
                        'in_fasta_path': in_fasta_path,
                        'out_fasta': out_fasta_path_new,
                        'error_rate': cutadapt_error_rate,
                        'num_threads': num_threads,
                        'primerFwd': primerFwd,
                        'primerRev': primerRev,
                        'lenPrimerFwd': lenPrimerFwd,
                        'lenPrimerRev': lenPrimerRev,
                        'read_min_length': cutadapt_minimum_length,
                        'read_max_length': cutadapt_maximum_length,
                    }

                    if not primer_to_end:  # anchored primers: both must match in full
                        cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                            '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                            '--trimmed-only -g "^{primerFwd}...{primerRev}$" --output {out_fasta} {in_fasta_path}'\
                            .format(**cmd_cutadapt_primer_dic)
                    else:
                        cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                            '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                            '--trimmed-only -g "{primerFwd};min_overlap={lenPrimerFwd}...{primerRev};min_overlap={lenPrimerRev}" '\
                            '--output {out_fasta} {in_fasta_path}'\
                            .format(**cmd_cutadapt_primer_dic)
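                    # Both forms are cutadapt "linked adapters" (FWD...REV).
                    # "^FWD...REV$" anchors the primers so each must match in
                    # full at its end of the read; the ";min_overlap=" form
                    # relaxes that and accepts partial primer matches of at
                    # least the given length.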

                    Logger.instance().debug("Running: {}".format(cmd_cutadapt_primer_str))

                    if sys.platform.startswith("win"):
                        args = cmd_cutadapt_primer_str
                    else:
                        args = shlex.split(cmd_cutadapt_primer_str)

                    run_result = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

                    Logger.instance().info(run_result.stdout.decode())

        ###################################################################
        #
        # Reverse complement back rc fasta and pool
        #
        ###################################################################   
     
        for file in results_list:
            if "_trimmed" in file:

                out_final_fasta_path = os.path.join(sorteddir, os.path.split(file)[-1])
                in_fasta_path = os.path.join(tempdir, file)

                if out_final_fasta_path.endswith(".gz"):      
                    _open = partial(gzip.open) 
                elif out_final_fasta_path.endswith(".bz2"):
                    _open = partial(bz2.open)
                else:
                    _open = open

                if in_fasta_path.endswith(".gz"):
                    _open2 = partial(gzip.open) 
                elif in_fasta_path.endswith(".bz2"):
                    _open2 = partial(bz2.open) 
                else: 
                    _open2 = open

                if "_reversed" in file:
                    Logger.instance().debug("Pooling fwd and rc reads...")

                    out_final_fasta_path = out_final_fasta_path.replace("_reversed", "")

                    with _open(out_final_fasta_path, 'at') as fout:
                        with _open2(in_fasta_path, 'rt') as fin:
                            for line in fin:
                                if not line.startswith('>'):
                                    if generic_dna:  # Biopython <1.78
                                        fout.write("%s\n" % str(
                                            Seq(line.strip(), generic_dna).reverse_complement()))
                                    else:  # Biopython =>1.78
                                        fout.write("%s\n" % str(
                                            Seq(line.strip()).reverse_complement()))

                                else:
                                    fout.write(line)
                else:
                    with _open(out_final_fasta_path, 'at') as fout:
                        with _open2(in_fasta_path, 'rt') as fin:
                            for line in fin:
                                fout.write(line)
        
        results_list = [os.path.split(result)[-1] for result in results_list if "_reversed" not in result]

        del sample_info['mergedfasta']
        del sample_info['primerrev']
        del sample_info['primerfwd']
        del sample_info['tagrev']
        del sample_info['tagfwd']

        sample_info['sortedfasta'] = results_list

        sample_info_df = pandas.DataFrame(sample_info)

        fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
        sample_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True, index=False)
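
The fwd/rc pooling step above rewrites reversed reads onto the forward strand before appending them. The same idea on plain in-memory lines, as a minimal sketch (Biopython >= 1.78, no compression handling):

from Bio.Seq import Seq

def pool_forward(lines):
    """Yield FASTA lines, reverse-complementing the sequence lines."""
    for line in lines:
        if line.startswith('>'):
            yield line
        else:
            yield str(Seq(line.strip()).reverse_complement()) + '\n'

fasta = ['>read1\n', 'ACGGTT\n', '>read2\n', 'TTTACG\n']
print(''.join(pool_forward(fasta)), end='')
# >read1 / AACCGT, >read2 / CGTAAA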