Example #1
    def main(cls, db, pooled_marker_tsv, run_marker_tsv, params, readcounts):
        """

        Parameters
        ----------
        db : str
            path to DB in sqlite format
        pooled_marker_tsv : str
            path to output pooled_marker.tsv file
        run_marker_tsv : str
            path to input run_marker.tsv file
        params : str
            path to the params.yml file with parameter values
        readcounts : bool
            Output absence/presence (False) or sum of read counts (True)

        Returns
        -------

        """

        #######################################################################
        #
        # Parameters
        #
        #######################################################################

        # params_dic = constants.get_params_default_dic()
        params_dic = FileParams(params).get_params_dic()

        cluster_identity = params_dic['cluster_identity']

        run_marker_file_obj = FileRunMarker(tsv_path=run_marker_tsv)

        # run_marker_tsv_reader = RunMarkerTSVreader(db=db, run_marker_tsv_path=run_marker_tsv)
        if run_marker_tsv is not None:
            run_marker_df = run_marker_file_obj.read_tsv_into_df()
        else:
            run_marker_df = None

        engine = sqlalchemy.create_engine('sqlite:///{}'.format(db),
                                          echo=False)
        Base = automap_base()
        Base.prepare(engine, reflect=True)

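        # Fetch the sample ids for this run-marker selection and convert them
        # to sample names used as columns of the ASV table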
        sample_list = run_marker_file_obj.get_sample_ids(engine)
        sample_list = NameIdConverter(id_name_or_sequence_list=sample_list,
                                      engine=engine).to_names(Sample)

        ############################################################################################
        #
        # Compute all variant_read_count_input_df required for ASV table
        #
        ############################################################################################

        variant_read_count_df = run_marker_file_obj.get_variant_read_count_df(
            engine=engine, variant_read_count_like_model=FilterCodonStop)

        asv_table_runner = RunnerAsvTable(
            variant_read_count_df=variant_read_count_df,
            engine=engine,
            sample_list=sample_list,
            cluster_identity=cluster_identity)
        asv_table_df = asv_table_runner.create_asvtable_df()
        asv_table_df.rename(
            {
                'run': 'run_name',
                'marker': 'marker_name',
                'variant': 'variant_id'
            },
            axis=1,
            inplace=True)

        ############################################################################################
        #
        # Prefix biosample columns with run name for same biosample name in different runs
        #
        ############################################################################################

        asv_table_2_df = asv_table_df.copy()

        for run_name_i, run_name in enumerate(asv_table_df.run_name.unique()):
            asv_table_runi_df = (
                asv_table_df.loc[asv_table_df.run_name == run_name]).copy()

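            # columns 5 to -4 are assumed to hold the per-sample read counts;
            # the first five and last four columns are metadata/annotation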
            for biosample in asv_table_runi_df.iloc[:, 5:-4].columns.tolist():
                asv_table_runi_df.rename(
                    {biosample: run_name + '-' + biosample},
                    axis=1,
                    inplace=True)

            if run_name_i == 0:
                asv_table_2_df = asv_table_runi_df
            else:

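                # add the current run's (prefixed) sample columns to the
                # accumulated table, fill them with 0 for the earlier rows,
                # then append the current run's rows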
                asv_table_2_df = pandas.concat([
                    asv_table_2_df,
                    pandas.DataFrame(columns=asv_table_runi_df.columns)
                ])
                asv_table_2_df = asv_table_2_df.fillna(0)
                asv_table_2_df = pandas.concat(
                    [asv_table_2_df, asv_table_runi_df], axis=0, join='outer')

            del asv_table_runi_df

        ############################################################################################
        #
        # Reorder columns
        #
        ############################################################################################

        column_list = asv_table_2_df.columns.tolist()
        column_list.remove("run_name")
        column_list.insert(0, "run_name")

        column_list.remove("clusterid")
        column_list.remove("clustersize")
        column_list.remove("chimera_borderline")
        column_list.remove("sequence")
        column_list = column_list + [
            'clusterid', 'clustersize', 'chimera_borderline', 'sequence'
        ]

        column_list.remove("sequence_length")
        column_list.remove("read_count")
        column_list.insert(3, "sequence_length")
        column_list.insert(4, "read_count")

        asv_table_2_df = asv_table_2_df[column_list]

        ############################################################################################
        #
        # Pool markers
        #
        ############################################################################################

        pool_marker_runner = CommandPoolRunMarkers(asv_table_df=asv_table_2_df,
                                                   run_marker_df=run_marker_df,
                                                   readcounts=readcounts)
        pooled_marker_df = pool_marker_runner.get_pooled_marker_df()

        #######################################################################
        #
        # Cluster sequences
        #
        #######################################################################

        # reset asvtable-based clusterid and clustersize
        pooled_marker_df.drop(['clusterid', 'clustersize'],
                              axis=1,
                              inplace=True)
        pooled_marker_df.rename({'variant': 'variant_id'},
                                axis=1,
                                inplace=True)  # prepare for clustering
        # total read count per variant (sum over the per-sample columns),
        # used by SequenceClusterer below
        pooled_marker_df['read_count'] = pooled_marker_df.iloc[:, 4:-2].sum(
            axis=1)

        seq_clusterer_obj = SequenceClusterer(
            pooled_marker_df, cluster_identity=cluster_identity)
        cluster_count_df = seq_clusterer_obj.compute_clusters()

        pooled_marker_df = pooled_marker_df.merge(cluster_count_df,
                                                  on='variant_id')
        pooled_marker_df.drop(['read_count'], axis=1, inplace=True)
        ############################################################################################
        #
        # Reorder columns
        #
        ############################################################################################

        column_list = pooled_marker_df.columns.tolist()
        column_list.remove("pooled_sequences")
        column_list.remove("sequence")
        column_list = column_list + ['pooled_sequences', 'sequence']
        pooled_marker_df = pooled_marker_df[column_list]

        # change dtypes
        for col in pooled_marker_df.columns[4:-4]:
            pooled_marker_df[col] = pooled_marker_df[col].astype(int)

        # verify here if the run-sample exists in the sampleinformation database
        # and drop if not
        run_biosample_cols = pooled_marker_df.columns[4:-4]
        # run_biosample_item = run_biosample_cols[0]
        from sqlalchemy.orm import sessionmaker
        Session = sessionmaker(bind=engine)
        session = Session()
        for run_biosample_item in run_biosample_cols:
            thisrun, thisbiosample = run_biosample_item.split('-')
            rowcount = session.query(
                SampleInformation, Sample,
                Run).filter(SampleInformation.sample_id == Sample.id).filter(
                    SampleInformation.run_id == Run.id).filter(
                        Run.name == thisrun).filter(
                            Sample.name == thisbiosample).count()
            if rowcount <= 0:
                pooled_marker_df.drop([run_biosample_item],
                                      axis=1,
                                      inplace=True)

        #######################################################################
        #
        # To tsv
        #
        #######################################################################

        pooled_marker_df.to_csv(pooled_marker_tsv, sep="\t", index=False)
Example #2
    def main(fastainfo,
             fastadir,
             sorteddir,
             params=None,
             num_threads=multiprocessing.cpu_count()):

        if sys.platform.startswith('win'):
            num_threads = 1

        ############################################################################################
        #
        # params.yml parameters
        #
        ############################################################################################

        params_dic = FileParams(params).get_params_dic()

        cutadapt_error_rate = params_dic['cutadapt_error_rate']
        cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
        cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

        ############################################################################################
        #
        # Loop over tag and primer pairs to demultiplex and trim reads
        #
        ############################################################################################

        merged_fastainfo_df = FileSampleInformation(
            fastainfo).read_tsv_into_df()

        pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
        tempdir = PathManager.instance().get_tempdir()

        sorted_read_info_df = pandas.DataFrame()

        for i in range(0, merged_fastainfo_df.shape[0]):
            fasta_info_series = merged_fastainfo_df.iloc[i]

            tag_fwd = fasta_info_series.tagfwd
            tag_rev = fasta_info_series.tagrev
            primer_fwd = fasta_info_series.primerfwd
            primer_rev = fasta_info_series.primerrev
            in_fasta_basename = fasta_info_series.mergedfasta

            Logger.instance().debug(
                "Analysing FASTA file: {}".format(in_fasta_basename))

            fasta_info_df_i = fasta_info_series.to_frame().T
            in_raw_fasta_path = os.path.join(fastadir, in_fasta_basename)

            ########################################################################################
            #
            # Cut adapt tag of forward reads
            # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
            # --front 'tcgatcacgatgt;min_overlap=13...gctgtagatcgaca;min_overlap=14'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
            # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
            #
            ########################################################################################

            if generic_dna:  # Biopython <1.78
                tag_rev_rc = str(
                    Seq(tag_rev, generic_dna).reverse_complement())
            else:  # Biopython =>1.78
                tag_rev_rc = str(Seq(tag_rev).reverse_complement())

            out_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
                '.fasta', '_sorted_%03d.fasta' % i)
            out_fasta_path = os.path.join(tempdir, out_fasta_basename)

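            # cutadapt linked adapter: forward tag at the 5' end linked to the
            # reverse-complemented reverse tag at the 3' end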
            cmd_cutadapt_tag_dic = {
                'tag_fwd': tag_fwd,
                'tag_fwd_len': len(tag_fwd),
                'tag_rev_rc': tag_rev_rc,
                'tag_rev_rc_len': len(tag_rev_rc),
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_fasta_path,
                'num_threads': num_threads,
            }

            cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
                '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

            Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_tag_str
            else:
                args = shlex.split(cmd_cutadapt_tag_str)
            run_result = subprocess.run(args=args,
                                        capture_output=True,
                                        check=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ########################################################################################
            #
            # Trim primers from output
            # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only
            # --front 'TCCACTAATCACAARGATATTGGTAC;min_overlap=26...GGAGGATTTGGWAATTGATTAGTW;min_overlap=24'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_trimmed_000.fasta
            # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
            #
            ########################################################################################

            if generic_dna:  # Biopython <1.78
                primer_rev_rc = str(
                    Seq(primer_rev, generic_dna).reverse_complement())
            else:  # Biopython =>1.78
                primer_rev_rc = str(Seq(primer_rev).reverse_complement())

            in_fasta_path = out_fasta_path
            out_fasta_basename = os.path.basename(in_fasta_path).replace(
                '_sorted_%03d.fasta' % i, '_sorted_trimmed_%03d.fasta' % i)
            out_fasta_path = os.path.join(tempdir, out_fasta_basename)

            cmd_cutadapt_primer_dic = {
                'primer_fwd': primer_fwd,
                'primer_fwd_len': len(primer_fwd),
                'primer_rev_rc': primer_rev_rc,
                'primer_rev_rc_len': len(primer_rev_rc),
                'in_fasta_path': in_fasta_path,
                'out_fasta': out_fasta_path,
                'error_rate': cutadapt_error_rate,
                'read_min_length': cutadapt_minimum_length,
                'read_max_length': cutadapt_maximum_length,
                'num_threads': num_threads,
            }

            cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                                      '--minimum-length {read_min_length} ' \
                                      '--maximum-length {read_max_length} --trimmed-only  ' \
                                      '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" '  \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

            Logger.instance().debug(
                "Running: {}".format(cmd_cutadapt_primer_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_primer_str
            else:
                args = shlex.split(cmd_cutadapt_primer_str)
            run_result = subprocess.run(args=args, capture_output=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ########################################################################################
            #
            # Cut adapt tag of reverse-complement reads
            # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
            # --front 'tgtcgatctacagc;min_overlap=14...acatcgtgatcga;min_overlap=13'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
            # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
            #
            ########################################################################################

            if generic_dna:  # Biopython <1.78
                tag_fwd_rc = str(
                    Seq(tag_fwd, generic_dna).reverse_complement())
            else:  # Biopython =>1.78
                tag_fwd_rc = str(Seq(tag_fwd).reverse_complement())

            out_rc_fasta_basename = os.path.basename(
                in_raw_fasta_path).replace('.fasta',
                                           '_rc_sorted_%03d.fasta' % i)
            out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

            cmd_cutadapt_tag_dic = {
                'tag_fwd': tag_rev,
                'tag_fwd_len': len(tag_rev),
                'tag_rev_rc': tag_fwd_rc,
                'tag_rev_rc_len': len(tag_fwd_rc),
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_rc_fasta_path,
                'num_threads': num_threads,
            }

            cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
                '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)

            Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_tag_str
            else:
                args = shlex.split(cmd_cutadapt_tag_str)
            run_result = subprocess.run(args=args, capture_output=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ###################################################################
            #
            # Trim primers from output
            # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only
            # --front 'WACTAATCAATTWCCAAATCCTCC;min_overlap=24...GTACCAATATCYTTGTGATTAGTGGA;min_overlap=26'
            # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_trimmed_000.fasta
            # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
            #
            ###################################################################

            if generic_dna:  # Biopython <1.78
                primer_fwd_rc = str(
                    Seq(primer_fwd, generic_dna).reverse_complement())
            else:  # Biopython =>1.78
                primer_fwd_rc = str(Seq(primer_fwd).reverse_complement())

            in_fasta_path = out_rc_fasta_path
            out_rc_fasta_basename = os.path.basename(in_fasta_path).replace(
                '_rc_sorted_%03d.fasta' % i,
                '_rc_sorted_trimmed_%03d.fasta' % i)
            out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

            cmd_cutadapt_primer_dic = {
                'primer_fwd': primer_rev,
                'primer_fwd_len': len(primer_rev),
                'primer_rev_rc': primer_fwd_rc,
                'primer_rev_rc_len': len(primer_fwd_rc),
                'in_fasta_path': in_fasta_path,
                'out_fasta': out_rc_fasta_path,
                'error_rate': cutadapt_error_rate,
                'read_min_length': cutadapt_minimum_length,
                'read_max_length': cutadapt_maximum_length,
                'num_threads': num_threads,
            }
            cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                '--minimum-length {read_min_length} ' \
                '--maximum-length {read_max_length} --trimmed-only  ' \
                '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \
                '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)

            Logger.instance().debug(
                "Running: {}".format(cmd_cutadapt_primer_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_primer_str
            else:
                args = shlex.split(cmd_cutadapt_primer_str)
            run_result = subprocess.run(args=args, capture_output=True)

            Logger.instance().info(run_result.stdout.decode())
            Logger.instance().info(run_result.stderr.decode())

            ###################################################################
            #
            # Reverse complement back rc fasta and pool
            #
            ###################################################################

            out_final_fasta_basename = os.path.basename(
                in_raw_fasta_path).replace('.fasta', '_%03d.fasta' % i)
            out_final_fasta_path = os.path.join(sorteddir,
                                                out_final_fasta_basename)
            shutil.copy(out_fasta_path, out_final_fasta_path)
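            # the final output starts from the forward-orientation trimmed
            # reads; the reverse-complemented rc reads are appended below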

            Logger.instance().debug("Pooling fwd and rc reads...")
            with open(out_final_fasta_path, 'a') as fout:
                with open(out_rc_fasta_path, 'r') as fin:
                    for line in fin:
                        if not line.startswith('>'):

                            if generic_dna:  # Biopython <1.78
                                fout.write("%s\n" % str(
                                    Seq(line.strip(),
                                        generic_dna).reverse_complement()))
                            else:  # Biopython =>1.78
                                fout.write("%s\n" % str(
                                    Seq(line.strip()).reverse_complement()))

                        else:
                            fout.write(line)

            fasta_info_df_i = fasta_info_df_i[[
                'run', 'marker', 'sample', 'replicate'
            ]]
            fasta_info_df_i['sortedfasta'] = out_final_fasta_basename
            sorted_read_info_df = pandas.concat(
                [sorted_read_info_df, fasta_info_df_i], axis=0)

        fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
        sorted_read_info_df.to_csv(fasta_trimmed_info_tsv,
                                   sep="\t",
                                   header=True,
                                   index=False)
Example #3
    def __init__(self, sequence_list, taxonomy, blast_db_dir, blast_db_name,
                 num_threads, params):
        """

        Parameters
        ----------
        sequence_list : list
            List of variant sequences to assign taxonomically
        taxonomy
            Taxonomy object exposing the taxonomy dataframe (df) and the
            old/merged tax id dataframe (old_tax_df)
        blast_db_dir : str
            path to the directory containing the BLAST database
        blast_db_name : str
            name of the BLAST database
        num_threads : int
            number of threads for the local BLAST run
        params : str
            path to the params.yml file with parameter values

        """

        self.old_tax_id_df = taxonomy.old_tax_df
        self.taxonomy_df = taxonomy.df
        self.blast_db_dir = blast_db_dir
        self.this_temp_dir = os.path.join(PathManager.instance().get_tempdir(),
                                          os.path.basename(__file__))
        pathlib.Path(self.this_temp_dir).mkdir(exist_ok=True)

        self.num_threads = num_threads

        #######################################################################
        #
        # Parameters
        #
        #######################################################################

        params_dic = FileParams(params).get_params_dic()
        qcov_hsp_perc = params_dic['qcov_hsp_perc']

        #######################################################################
        #
        # 2 Create FASTA file with Variants
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Create SortedReadFile from Variants".format(
                __file__,
                inspect.currentframe().f_lineno))
        variant_fasta = os.path.join(self.this_temp_dir, 'variant.fasta')
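        # write one FASTA record per variant, using the sequence itself as the
        # record identifier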
        with open(variant_fasta, 'w') as fout:
            for seq in sequence_list:
                fout.write(">{}\n{}\n".format(seq, seq))

        #######################################################################
        #
        # 3 Run local blast
        #
        #######################################################################

        runner_blast = RunnerBlast(variant_fasta, blast_db_dir, blast_db_name,
                                   num_threads, qcov_hsp_perc)
        # run blast
        blast_output_tsv = runner_blast.run_local_blast()
        # process blast results
        blast_output_df = RunnerBlast.process_blast_result(blast_output_tsv)

        #######################################################################
        #
        # Compute tax lineages for Blast target tax ids
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Open taxonomy.tsv DB".format(
                __file__,
                inspect.currentframe().f_lineno))
        blast_output_df.target_tax_id = pandas.to_numeric(
            blast_output_df.target_tax_id)
        #
        Logger.instance().debug(
            "file: {}; line: {}; Annotate each target_tax_id with its lineage as columns in wide format"
            .format(__file__,
                    inspect.currentframe().f_lineno))
        tax_id_list = blast_output_df.target_tax_id.unique().tolist()
        tax_id_to_lineage_df = taxonomy.get_several_tax_id_lineages(
            tax_id_list)
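        # the lineage dataframe appears to be indexed by tax_id, hence the
        # right_index merge below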

        #######################################################################
        #
        # Merge tax lineages and the blast result
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Merge blast result including tax_id with their lineages"
            .format(__file__,
                    inspect.currentframe().f_lineno))
        # Merge local blast output with tax_id_to_lineage_df
        # variant_identity_lineage_df = blast_output_df.merge(
        #     tax_id_to_lineage_df, left_on='target_tax_id', right_on='tax_id')
        variantid_identity_lineage_df = blast_output_df.merge(
            tax_id_to_lineage_df, left_on='target_tax_id', right_index=True)
        # variant_identity_lineage_df.drop('tax_id', axis=1, inplace=True)
        """(Pdb) variant_identity_lineage_df.columns  
Index(['variant_id', 'target_id', 'identity', 'evalue', 'coverage',
       'target_tax_id', 'no rank', 'species', 'genus', 'family', 'order',
       'class', 'subphylum', 'phylum', 'subkingdom', 'kingdom', 'superkingdom',
       'superfamily', 'infraorder', 'suborder', 'infraclass', 'subclass',
       'tribe', 'subfamily', 'cohort', 'subgenus', 'subspecies', 'parvorder',
       'superorder', 'subcohort', 'superclass', 'species group', 'subtribe',
       'section', 'varietas', 'species subgroup'],
      dtype='object')"""

        #######################################################################
        #
        #  several_variants_to_ltg
        # this function returns a data frame containing the Ltg rank and Ltg Tax_id for each variant
        #
        #######################################################################

        Logger.instance().debug(
            "file: {}; line: {}; Main loop over variant and identity to "
            "compute the whole set of ltg_tax_id and ltg_rank for each "
            "variant_id to a dataframe".format(
                __file__,
                inspect.currentframe().f_lineno))
        runner_ltg_selection = RunnerLTGselection(
            variant_identity_lineage_df=variantid_identity_lineage_df,
            taxonomy_df=self.taxonomy_df,
            params=params)
        self.ltg_df = runner_ltg_selection.several_variants_to_ltg()
Example #4
    def main(fastqinfo, fastqdir, fastainfo, fastadir, params=None, num_threads=multiprocessing.cpu_count()):
        ############################################################################################
        #
        # params.yml parameters
        #
        ############################################################################################

        params_dic = FileParams(params).get_params_dic()

        ############################################################################################
        #
        # Read fastq information into stats_df
        #
        ############################################################################################

        fastqinfo_df = FileSampleInformation(fastqinfo).read_tsv_into_df()

        pathlib.Path(
            os.path.dirname(fastainfo)).mkdir(
            parents=True,
            exist_ok=True)
        pathlib.Path(fastadir).mkdir(parents=True, exist_ok=True)

        fastainfo_df = pandas.DataFrame()

        ############################################################################################
        #
        # Loop over fastq pairs to merge
        #
        ############################################################################################

        # File with analysis stats data
        stats_df = pandas.DataFrame({
            'FastqFwd': [], 'FastqRev': [], 'NbReadsFwd': [],
            'NbReadsRev': [], 'FastaMerged': [], 'NbMergedReads': []})

        for fastqfwd, fastqrev in fastqinfo_df[[
                'fastqfwd', 'fastqrev']].drop_duplicates().values:

            fastq_info_df_i = fastqinfo_df.loc[(fastqinfo_df.fastqfwd == fastqfwd) & (
                fastqinfo_df.fastqrev == fastqrev)]

            fastq_fw_abspath = os.path.join(fastqdir, fastqfwd)
            with open(fastq_fw_abspath, 'rb') as fin:
                # four lines per FASTQ record
                fastq_fw_linecount = int(sum(1 for _ in fin) / 4)

            fastq_rv_abspath = os.path.join(fastqdir, fastqrev)
            with open(fastq_rv_abspath, 'rb') as fin:
                fastq_rv_linecount = int(sum(1 for _ in fin) / 4)

            Logger.instance().debug(
                "Analysing FASTQ files: {} and {}".format(fastqfwd, fastqrev))

            try:
                pathlib.Path(fastq_fw_abspath).resolve(strict=True)
            except FileNotFoundError:
                Logger.instance().error(
                    VTAMexception(
                        "VTAMexception: This FASTQ file was not found: {}.".format(fastq_fw_abspath)))
                sys.exit(1)
            try:
                pathlib.Path(fastq_rv_abspath).resolve(strict=True)
            except FileNotFoundError:
                Logger.instance().error(
                    VTAMexception(
                        "VTAMexception: This FASTQ file was not found: {}.".format(fastq_rv_abspath)))
                sys.exit(1)

            fasta_merged_basename = os.path.basename(
                fastq_fw_abspath).replace('.fastq', '.fasta')
            out_fasta_path = os.path.join(fastadir, fasta_merged_basename)

            ########################################################################################
            #
            # Run vsearch merge
            #
            ########################################################################################

            vsearch_args_dic = {}
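            # the keys set below presumably map one-to-one onto
            # 'vsearch --fastq_mergepairs' command-line options via the
            # RunnerVSearch wrapper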

            vsearch_args_dic['fastq_ascii'] = params_dic['fastq_ascii']
            vsearch_args_dic['fastq_maxee'] = params_dic['fastq_maxee']
            vsearch_args_dic['fastq_maxmergelen'] = params_dic['fastq_maxmergelen']
            vsearch_args_dic['fastq_maxns'] = params_dic['fastq_maxns']
            vsearch_args_dic['fastq_minlen'] = params_dic['fastq_minlen']
            vsearch_args_dic['fastq_minmergelen'] = params_dic['fastq_minmergelen']
            vsearch_args_dic['fastq_minovlen'] = params_dic['fastq_minovlen']
            vsearch_args_dic['fastq_truncqual'] = params_dic['fastq_truncqual']

            vsearch_args_dic['fastq_mergepairs'] = fastq_fw_abspath
            vsearch_args_dic['reverse'] = fastq_rv_abspath
            vsearch_args_dic['fastaout'] = out_fasta_path
            vsearch_args_dic['threads'] = num_threads

            vsearch_cluster = RunnerVSearch(parameters=vsearch_args_dic)
            vsearch_cluster.run()

            fastq_info_df_i = fastq_info_df_i[['run', 'marker', 'sample', 'replicate', 'tagfwd',
                                               'primerfwd', 'tagrev', 'primerrev']]
            fastq_info_df_i['mergedfasta'] = fasta_merged_basename
            fastainfo_df = pandas.concat(
                [fastainfo_df, fastq_info_df_i], axis=0)

            with open(out_fasta_path, 'rb') as fin:
                fasta_merged_linecount = int(sum(1 for _ in fin) / 4)

            ########################################################################################
            #
            # Summary file
            #
            ########################################################################################

            stats_df = pandas.concat([stats_df, pandas.DataFrame({
                'FastqFwd': [fastq_fw_abspath],
                'FastqRev': [fastq_rv_abspath],
                'NbReadsFwd': [fastq_fw_linecount],
                'NbReadsRev': [fastq_rv_linecount],
                'FastaMerged': [out_fasta_path],
                'NbMergedReads': [fasta_merged_linecount]})])
    
        for mergedfasta in fastainfo_df[['mergedfasta']].drop_duplicates().values:
            mergedfasta = mergedfasta[0]

            if mergedfasta.endswith('.bz2') or mergedfasta.endswith('.gz'):
                fasta_merged_abspath = os.path.join(fastadir, mergedfasta)
                mergedfasta_compressor = FileCompression(fasta_merged_abspath)
            
                if mergedfasta.endswith('.gz'):
                    mergedfasta_c = mergedfasta_compressor.pigz_compression()
                    if mergedfasta_c is None:
                        mergedfasta_c = mergedfasta_compressor.gzip_compression()

                    
                elif mergedfasta.endswith('.bz2'):
                    mergedfasta_c = mergedfasta_compressor.bz2_compression()
                    
                mergedfasta_compressor.delete_file()
                _, relPath = os.path.split(mergedfasta_c)
                fastainfo_df.loc[fastainfo_df['mergedfasta'] == mergedfasta, 'mergedfasta'] = relPath
                
            else: 
                fastq_info_df_i['mergedfasta'] = fasta_merged_basename

        
        fastainfo_df.to_csv(fastainfo, sep="\t", header=True, index=False)
Example #5
    def main(fastainfo, fastadir, sorteddir, params=None,
             num_threads=multiprocessing.cpu_count(),
             no_reverse=False, tag_to_end=False, primer_to_end=False):

        Logger.instance().info(
            f"OPTIONS:\n no_reverse: {not no_reverse}\n"
            f" tag_to_end: {not tag_to_end}\n"
            f" primer_to_end: {not primer_to_end}")

        if sys.platform.startswith('win'):
            num_threads = 1

        ############################################################################################
        #
        # params.yml parameters
        #
        ############################################################################################

        params_dic = FileParams(params).get_params_dic()

        cutadapt_error_rate = params_dic['cutadapt_error_rate']
        cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
        cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

        ############################################################################################
        #
        # Loop over tag and primer pairs to demultiplex and trim reads
        #
        ############################################################################################

        merged_fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df()
        
        pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
        tempdir = PathManager.instance().get_tempdir()

        merged_fasta_list = []
        results_list = []
        sample_info = {}

        # make sure every file is analysed once.
        for i in range(merged_fastainfo_df.shape[0]):
            if merged_fastainfo_df.iloc[i].mergedfasta not in merged_fasta_list:
                merged_fasta_list.append(merged_fastainfo_df.iloc[i].mergedfasta)
            
        for mergedfasta in merged_fasta_list:

            inputFiles = FilesInputCutadapt(fastainfo, mergedfasta, no_reverse, tag_to_end)
            
            tagFile_path = inputFiles.tags_file()
            info = inputFiles.get_df_info()

            for key in info.keys():
                if key in sample_info.keys():
                    sample_info[key] = sample_info[key] + info[key]
                else:
                    sample_info[key] = info[key]

            Logger.instance().debug("Analysing FASTA file: {}".format(mergedfasta))

            in_raw_fasta_path = os.path.join(fastadir, mergedfasta)

            ########################################################################################
            #
            #   cutadapt --cores=0 -e 0 --no-indels --trimmed-only -g tagFile:$tagfile 
            #   --overlap length -o "tagtrimmed.{name}.fasta" in_raw_fasta_path
            #
            ########################################################################################

            base = os.path.basename(in_raw_fasta_path)
            base, base_suffix = base.split('.', 1)
            
            out_fasta_path = os.path.join(tempdir, "sorted") 

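            # cutadapt demultiplexes with '-g file:<tagFile>' and expands
            # {name} in the output template with the matching tag-pair name,
            # one output file per sample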
            cmd_cutadapt_tag_dic = {
                'in_fasta_path': in_raw_fasta_path,
                'out_fasta': out_fasta_path,
                'num_threads': num_threads,
                'tagFile': tagFile_path,
                'base_suffix': base_suffix,
            }

            cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
                '-g file:{tagFile} --output {out_fasta}_{{name}}.{base_suffix} {in_fasta_path}' \
                .format(**cmd_cutadapt_tag_dic)

            Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

            if sys.platform.startswith("win"):
                args = cmd_cutadapt_tag_str
            else:
                args = shlex.split(cmd_cutadapt_tag_str)
            run_result = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

            Logger.instance().info(run_result.stdout.decode())

            inputFiles.remove_tags_file()

            ########################################################################################
            #
            # Trim primers from output
            # cutadapt --quiet --cores=0 -e trim_error --no-indels --trimmed-only 
            # --minimum-length minimum_length --maximum-length maximum_length 
            # --output input_path + {name} + suffix outputfile
            #
            ########################################################################################
            
            primers = inputFiles.primers()
            try:
                tags_samples = inputFiles.get_sample_names()
            except Exception as e:
                Logger.instance().error(e)
                return 
            
            for primer in primers:
                
                marker, primerfwd, primerrev, lenprimerfwd, lenprimerrev = primer

                for tag_sample in tags_samples:

                    name, run, marker2, sample, replicate, _, _ = tag_sample
                    
                    if marker not in marker2:
                        continue

                    in_fasta_path = out_fasta_path + "_" + name + "." + base_suffix

                    baseMerge = mergedfasta.split(".")[0]
                                        
                    outname = run + "_" + marker + "_" + sample + "_" + replicate + "_" + baseMerge + "_trimmed"
                    if name.endswith("_reversed"):
                        outname = outname + "_reversed"
                    out_fasta_path_new = os.path.join(tempdir, outname + "." + base_suffix)

                    results_list.append(out_fasta_path_new)
                    
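                    # reads demultiplexed in the reverse orientation get the
                    # primer pair swapped and reverse-complemented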
                    if not "_reversed" in name:
                        if generic_dna:  # Biopython <1.78
                            primerRev = str(Seq(primerrev, generic_dna).reverse_complement())
                        else:  # Biopython =>1.78
                            primerRev = str(Seq(primerrev).reverse_complement())
                        primerFwd = primerfwd
                        lenPrimerFwd = lenprimerfwd
                        lenPrimerRev = lenprimerrev
                    else:
                        if generic_dna:  # Biopython <1.78
                            primerRev = str(Seq(primerfwd, generic_dna).reverse_complement())
                        else:  # Biopython =>1.78
                            primerRev = str(Seq(primerfwd).reverse_complement())
                        primerFwd = primerrev
                        lenPrimerFwd = lenprimerrev
                        lenPrimerRev = lenprimerfwd


                    cmd_cutadapt_primer_dic = {
                        'in_fasta_path': in_fasta_path,
                        'out_fasta': out_fasta_path_new,
                        'error_rate': cutadapt_error_rate,
                        'num_threads': num_threads,
                        'primerFwd': primerFwd,
                        'primerRev': primerRev,
                        'lenPrimerFwd': lenPrimerFwd,
                        'lenPrimerRev': lenPrimerRev,
                        'read_min_length': cutadapt_minimum_length,
                        'read_max_length': cutadapt_maximum_length,
                    }

                    if not primer_to_end:  # anchor primers at the read ends ("^...$")
                        cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                            '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                            '--trimmed-only -g "^{primerFwd}...{primerRev}$" --output {out_fasta} {in_fasta_path}'\
                            .format(**cmd_cutadapt_primer_dic)
                    else:
                        cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                            '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                            '--trimmed-only -g "{primerFwd};min_overlap={lenPrimerFwd}...{primerRev};min_overlap={lenPrimerRev}" '\
                            '--output {out_fasta} {in_fasta_path}'\
                            .format(**cmd_cutadapt_primer_dic)

                    Logger.instance().debug("Running: {}".format(cmd_cutadapt_primer_str))

                    if sys.platform.startswith("win"):
                        args = cmd_cutadapt_primer_str
                    else:
                        args = shlex.split(cmd_cutadapt_primer_str)

                    run_result = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

                    Logger.instance().info(run_result.stdout.decode())

        ###################################################################
        #
        # Reverse complement back rc fasta and pool
        #
        ###################################################################   
     
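        # pick openers matching each file's compression suffix so plain, gzip
        # and bz2 FASTA files are all handled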
        for file in results_list:
            if "_trimmed" in file:

                out_final_fasta_path = os.path.join(sorteddir, os.path.split(file)[-1])
                in_fasta_path = os.path.join(tempdir, file)

                if out_final_fasta_path.endswith(".gz"):
                    _open = gzip.open
                elif out_final_fasta_path.endswith(".bz2"):
                    _open = bz2.open
                else:
                    _open = open

                if in_fasta_path.endswith(".gz"):
                    _open2 = gzip.open
                elif in_fasta_path.endswith(".bz2"):
                    _open2 = bz2.open
                else:
                    _open2 = open

                if "_reversed" in file:
                    Logger.instance().debug("Pooling fwd and rc reads...")

                    out_final_fasta_path = out_final_fasta_path.replace("_reversed", "")

                    with _open(out_final_fasta_path, 'at') as fout:
                        with _open2(in_fasta_path, 'rt') as fin:
                            for line in fin.readlines():
                                if not line.startswith('>'):
                                    if generic_dna:  # Biopython <1.78
                                        fout.write("%s\n" % str(
                                            Seq(line.strip(), generic_dna).reverse_complement()))
                                    else:  # Biopython =>1.78
                                        fout.write("%s\n" % str(
                                            Seq(line.strip()).reverse_complement()))

                                else:
                                    fout.write(line)
                else:
                    with _open(out_final_fasta_path, 'at') as fout:
                        with _open2(in_fasta_path, 'rt') as fin:
                            for line in fin.readlines():
                                fout.write(line)
        
        results_list = [os.path.split(result)[-1] for result in results_list if "_reversed" not in result]

        del sample_info['mergedfasta']
        del sample_info['primerrev']
        del sample_info['primerfwd']
        del sample_info['tagrev']
        del sample_info['tagfwd']

        sample_info['sortedfasta'] = results_list

        sample_info_df = pandas.DataFrame(sample_info)

        fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
        sample_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True, index=False)
Example #6
class ArgParser:

    ############################################################################################
    #
    # Specific parsers
    #
    ############################################################################################
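    # these single-option parsers are shared between sub-commands via the
    # 'parents' argument of add_parser()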

    parser_params = argparse.ArgumentParser(add_help=False)
    parser_params.add_argument('--params',
                               action='store',
                               default=None,
                               help="YML file with parameter values",
                               required=False,
                               type=lambda x: FileParams(params_path=x).
                               argparse_checker_params_file())

    parser_log = argparse.ArgumentParser(add_help=False)
    parser_log.add_argument('--log',
                            dest='log',
                            action='store',
                            help="write log to LOG file.",
                            required=False)

    parser_threads = argparse.ArgumentParser(add_help=False)
    parser_threads.add_argument('--threads',
                                dest='threads',
                                action='store',
                                help="number of threads",
                                required=False,
                                default=multiprocessing.cpu_count())

    parser_verbosity = argparse.ArgumentParser(add_help=False)
    parser_verbosity.add_argument('-v',
                                  dest='log_verbosity',
                                  action='count',
                                  default=0,
                                  required=False,
                                  help="set verbosity level -v or -vv")

    parser_wopmars_db = argparse.ArgumentParser(add_help=False)
    parser_wopmars_db.add_argument('--db',
                                   dest='db',
                                   action='store',
                                   default='db.sqlite',
                                   required=False,
                                   help="database file in SQLITE format")

    parser_wopmars_dryrun = argparse.ArgumentParser(add_help=False)
    parser_wopmars_dryrun.add_argument(
        '--dry-run',
        '-n',
        dest='dryrun',
        action='store_true',
        required=False,
        help="displays only command out without running it")

    parser_wopmars_forceall = argparse.ArgumentParser(add_help=False)
    parser_wopmars_forceall.add_argument('-F',
                                         '--forceall',
                                         dest='forceall',
                                         action='store_true',
                                         help="force rerun all rules",
                                         required=False)

    parser_vtam_main = None

    @classmethod
    def get_main_arg_parser(cls):
        """

        :return:
        """

        ############################################################################################
        #
        # Top-level parser
        #
        ############################################################################################

        # config = RawConfigParser()
        # config.read(os.path.join(PathManager.get_package_path(), 'setup.cfg'))
        # version = config.get('metadata', 'version')

        parser_vtam_main = argparse.ArgumentParser(
            prog='vtam',
            description=
            '%(prog)s {} - VTAM - Validation and Taxonomic Assignation of Metabarcoding Data'
            .format(vtam.__version__))
        parser_vtam_main.add_argument('--version',
                                      action='version',
                                      version='%(prog)s {}'.format(
                                          vtam.__version__))
        subparsers = parser_vtam_main.add_subparsers(title='VTAM sub-commands')

        ############################################################################################
        #
        # create the parsers
        #
        ############################################################################################

        cls.add_parser_example(subparsers=subparsers)

        cls.add_parser_merge(subparsers=subparsers)

        cls.add_parser_random_seq(subparsers=subparsers)

        cls.add_parser_sortreads(subparsers=subparsers)

        cls.add_parser_filter(subparsers=subparsers)

        cls.add_parser_optimize(subparsers=subparsers)

        cls.add_parser_makeKnownOccurrences(subparsers=subparsers)

        cls.add_parser_pool(subparsers=subparsers)

        cls.add_parser_taxassign(subparsers=subparsers)

        cls.add_parser_taxonomy(subparsers=subparsers)

        cls.add_parser_coiblastdb(subparsers=subparsers)

        return parser_vtam_main

    @classmethod
    def add_parser_example(cls, subparsers):
        parser_vtam_merge = subparsers.add_parser(
            'example',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity
            ],
            help="generates data for quick start")

        parser_vtam_merge.add_argument('--outdir',
                                       action='store',
                                       help="directory for quick start data",
                                       required=False,
                                       default='example',
                                       type=lambda x: pathlib.Path(x).mkdir(
                                           exist_ok=True, parents=True) or x)

        parser_vtam_merge.set_defaults(command='example')

    @classmethod
    def add_parser_merge(cls, subparsers):
        parser_vtam_merge = subparsers.add_parser(
            'merge',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity
            ],
            help="merges paired-end reads")

        parser_vtam_merge.add_argument(
            '--fastqinfo',
            action='store',
            help="input TSV file with paired FASTQ file information",
            required=True,
            type=lambda x: FileSampleInformation(x).check_args(
                header=header_paired_fastq))

        parser_vtam_merge.add_argument(
            '--fastainfo',
            action='store',
            help="output TSV file with merged FASTA file information",
            required=True)

        parser_vtam_merge.add_argument(
            '--fastqdir',
            action='store',
            help="input directory with paired FASTQ files",
            required=True,
            type=ArgParserChecker.check_dir_exists_and_is_nonempty)

        parser_vtam_merge.add_argument(
            '--fastadir',
            action='store',
            help="output directory with merged FASTA files",
            required=True)
        # This attribute will trigger the good command

        parser_vtam_merge.set_defaults(command='merge')

    @classmethod
    def add_parser_random_seq(cls, subparsers):

        parser_vtam_random_seq = subparsers.add_parser(
            'random_seq',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity
            ],
            help=
            "makes a folder with sample files containing '--samplesize' "
            "sequences randomly selected from the files in the input folder"
        )

        parser_vtam_random_seq.add_argument(
            '--fastadir',
            action='store',
            help="input directory with FASTA files",
            required=True,
            type=ArgParserChecker.check_dir_exists_and_is_nonempty)

        parser_vtam_random_seq.add_argument(
            '--random_seqdir',
            action='store',
            help=
            "output directory with randomly selected sequences in FASTA format",
            required=True)

        parser_vtam_random_seq.add_argument(
            '--fastainfo',
            action='store',
            help="input TSV file with FASTA file information",
            required=True,
            type=lambda x: FileSampleInformation(x).check_args(
                header=header_merged_fasta))

        parser_vtam_random_seq.add_argument(
            '--random_seqinfo',
            action='store',
            help="output TSV file with output FASTA file information",
            required=True)

        parser_vtam_random_seq.add_argument(
            '--samplesize',
            action='store',
            help="number of sequences to be selected from the input files",
            type=int,
            required=True)

        parser_vtam_random_seq.set_defaults(command='random_seq')

    @classmethod
    def add_parser_sortreads(cls, subparsers):
        parser_vtam_sortreads = subparsers.add_parser(
            'sortreads',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity
            ],
            help=
            "sorts (trims and demultiplexes) reads into biological samples and "
            "replicates according to the presence of sequence tags and primers"
        )

        parser_vtam_sortreads.add_argument(
            '--fastainfo',
            action='store',
            help="input TSV file with FASTA file information",
            required=True,
            type=lambda x: FileSampleInformation(x).check_args(
                header=header_merged_fasta))

        parser_vtam_sortreads.add_argument(
            '--fastadir',
            action='store',
            help="input directory with FASTA files",
            required=True,
            type=ArgParserChecker.check_dir_exists_and_is_nonempty)

        parser_vtam_sortreads.add_argument(
            '--sorteddir',
            action='store',
            help=
            "output directory with sorted reads (trimmed and demultiplexed) in "
            "FASTA files and a TSV file with the corresponding FASTA file "
            "information ('SORTEDDIR/sortedinfo.tsv')",
            default="out",
            required=True)
        # This attribute will trigger the good command

        parser_vtam_sortreads.add_argument(
            "--no_reverse",
            action="store_false",
            help="don't check reverse sequences",
            required=False)

        parser_vtam_sortreads.add_argument(
            "--tag_to_end",
            action="store_false",
            help="look for tags only at the edges of the sequence",
            required=False)

        parser_vtam_sortreads.add_argument(
            "--primer_to_end",
            action="store_false",
            help="look for primers only at the edges of the sequence",
            required=False)

        parser_vtam_sortreads.set_defaults(command='sortreads')

    @classmethod
    def add_parser_filter(cls, subparsers):
        parser_vtam_filter = subparsers.add_parser(
            'filter',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity, cls.parser_wopmars_db,
                cls.parser_wopmars_dryrun, cls.parser_wopmars_forceall
            ],
            help=
            "filters out sequence artifacts and creates an amplicon sequence variant (ASV) table."
        )

        parser_vtam_filter.add_argument(
            '--sortedinfo',
            action='store',
            help=
            "input TSV file with information about FASTA files containing sorted reads",
            required=True,
            type=lambda x: FileSampleInformation(x).check_args(
                header=header_sortedread_fasta))
        parser_vtam_filter.add_argument(
            '--sorteddir',
            action='store',
            help=
            "input directory with sorted (trimmed and demultiplexed) FASTA files",
            required=True,
            type=ArgParserChecker.check_dir_exists_and_is_nonempty)
        parser_vtam_filter.add_argument(
            '--asvtable',
            action='store',
            help=
            "output TSV file for the amplicon sequence variants (ASV) table",
            required=True)

        parser_vtam_filter.add_argument(
            '--cutoff_specific',
            dest='cutoff_specific',
            default=None,
            action='store',
            required=False,
            help=
            "TSV file with variant-specific (col1: variant; col2: cutoff) or variant-replicate-specific "
            "(col1: variant; col2: replicate; col3: cutoff) cutoffs",
            type=lambda x: FileCutoffSpecific(x).argparse_checker())

        parser_vtam_filter.add_argument(
            '--lfn_variant_replicate',
            action='store_true',
            help=
            "if set, VTAM runs the low frequency noise (LFN) algorithm over variants and replicates",
            required=False,
            default=False)

        parser_vtam_filter.add_argument(
            '--known_occurrences',
            action='store',
            help="TSV file with expected (keep) occurrences",
            required=False,
            type=lambda x: FileKnownOccurrences(
                x).argparse_checker_known_occurrences())

        parser_vtam_filter.add_argument(
            '-U',
            '--until',
            dest='until',
            action='store',
            default=None,
            help=
            """execute '%(prog)s' UNTIL one rule, where the rule order looks like:            
1. SampleInformation, 2. VariantReadCount, 3. FilterLFN, 4. FilterMinReplicateNumber, 5. FilterPCRerror, 6. FilterChimera, 7. FilterMinReplicateNumber2, 8. FilterRenkonen, 9. FilterMinReplicateNumber3, 10. FilterIndel, 11. FilterCodonStop, 12. ReadCountAverageOverReplicates, 13. MakeAsvTable""",
            required=False)

        parser_vtam_filter.add_argument(
            '-S',
            '--since',
            dest='since',
            action='store',
            default=None,
            help=
            """execute '%(prog)s' SINCE one rule, where the rule order looks like:
            1. SampleInformation, 2. VariantReadCount, 3. FilterLFN, 4. FilterMinReplicateNumber, 5. FilterPCRerror, 6. FilterChimera, 7. FilterMinReplicateNumber2, 8. FilterRenkonen, 9. FilterMinReplicateNumber3, 10. FilterIndel, 11. FilterCodonStop, 12. ReadCountAverageOverReplicates, 13. MakeAsvTable""",
            required=False)

        # This default selects the sub-command to run
        parser_vtam_filter.set_defaults(command='filter')
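        # Illustrative invocation (entry point and paths are hypothetical):
        #   vtam filter --sortedinfo sorted/sortedinfo.tsv --sorteddir sorted --asvtable asvtable.tsv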

    @classmethod
    def add_parser_taxassign(cls, subparsers):
        parser_vtam_taxassign = subparsers.add_parser(
            'taxassign',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity, cls.parser_wopmars_db
            ],
            help="assigns amplicon sequence variants (ASVs) to taxonomic groups"
        )

        parser_vtam_taxassign.add_argument(
            '--asvtable',
            action='store',
            help=
            "input TSV file with variant sequences and sequence header in the last column",
            required=True,
            type=lambda x: ArgParserChecker.check_taxassign_variants(x))
        parser_vtam_taxassign.add_argument(
            '--output',
            action='store',
            help="output TSV file where the assigned taxa have been added",
            required=True)
        parser_vtam_taxassign.add_argument(
            '--mode',
            dest='mode',
            default="unassigned",
            action='store',
            required=False,
            choices=['unassigned', 'reset'],
            help=
            "the default 'unassigned' mode will only assign 'unassigned' variants. "
            "The alternative 'reset' mode will erase the TaxAssign table and reassign all "
            "input variants")
        parser_vtam_taxassign.add_argument(
            '--blastdbdir',
            action='store',
            help=
            "input directory with Blast database files (full or custom)",
            required=True,
            type=ArgParserChecker.check_dir_exists_and_is_nonempty)
        parser_vtam_taxassign.add_argument(
            '--blastdbname',
            action='store',
            help=
            "input Blast database name, i.e. the common file name of the Blast database files without the suffix",
            required=True)
        parser_vtam_taxassign.add_argument(
            '--taxonomy',
            dest='taxonomy',
            action='store',
            help="""input TSV file with taxonomy information.
        This file is created with the 'taxonomy' sub-command. For instance
        'vtam taxonomy -o taxonomy.tsv' creates the 'taxonomy.tsv' file in the current directory""",
            required=True,
            type=ArgParserChecker.check_taxassign_taxonomy)

        # This default selects the sub-command to run
        parser_vtam_taxassign.set_defaults(command='taxassign')
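        # Illustrative invocation (entry point and paths are hypothetical; taxonomy.tsv
        # would come from 'vtam taxonomy' as noted in the help text above):
        #   vtam taxassign --asvtable asvtable.tsv --output asvtable_taxa.tsv \
        #       --blastdbdir blastdb --blastdbname coi_blast_db --taxonomy taxonomy.tsv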

    @classmethod
    def add_parser_optimize(cls, subparsers):
        parser_vtam_optimize = subparsers.add_parser(
            'optimize',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity, cls.parser_wopmars_db,
                cls.parser_wopmars_dryrun, cls.parser_wopmars_forceall
            ],
            help="finds out optimal parameters for filtering")

        parser_vtam_optimize.add_argument(
            '--sortedinfo',
            action='store',
            help=
            "input TSV file with information about FASTA files containing sorted (trimmed and demultiplexed) reads",
            required=True,
            type=lambda x: FileSampleInformation(x).check_args(
                header=header_sortedread_fasta))

        parser_vtam_optimize.add_argument(
            '--sorteddir',
            action='store',
            help=
            "input directory with sorted (trimmed and demultiplexed) FASTA files",
            required=True,
            type=ArgParserChecker.check_dir_exists_and_is_nonempty)

        parser_vtam_optimize.add_argument('-o',
                                          '--outdir',
                                          action='store',
                                          help="output directory",
                                          default="out",
                                          required=True)

        parser_vtam_optimize.add_argument(
            '--known_occurrences',
            action='store',
            help="TSV file with known variants",
            required=True,
            type=lambda x: FileKnownOccurrences(
                x).argparse_checker_known_occurrences())

        parser_vtam_optimize.add_argument(
            '--lfn_variant_replicate',
            action='store_true',
            help=
            "if set, VTAM runs the low frequency noise (LFN) algorithm over variants and replicates",
            required=False,
            default=False)

        parser_vtam_optimize.add_argument(
            '-U',
            '--until',
            dest='until',
            action='store',
            default=None,
            help=
            """executes '%(prog)s' UNTIL one rule, where the rules follow this order:
            1. SampleInformation, 2. VariantReadCount, 3. either OptimizeLFNsampleReplicate or OptimizePCRerror or OptimizeLFNreadCountAndLFNvariant""",
            required=False)
        parser_vtam_optimize.add_argument(
            '-S',
            '--since',
            dest='since',
            action='store',
            default=None,
            help=
            """executes '%(prog)s' SINCE one rule, where the rules follow this order: 
            1. SampleInformation, 2. VariantReadCount, 3. either OptimizeLFNsampleReplicate or OptimizePCRerror or OptimizeLFNreadCountAndLFNvariant""",
            required=False)

        # This default selects the sub-command to run
        parser_vtam_optimize.set_defaults(command='optimize')
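        # Illustrative invocation (entry point and paths are hypothetical):
        #   vtam optimize --sortedinfo sorted/sortedinfo.tsv --sorteddir sorted \
        #       --known_occurrences known_occurrences.tsv -o out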

    @classmethod
    def add_parser_makeKnownOccurrences(cls, subparsers):
        parser_vtam_makeKnownOccurrences = subparsers.add_parser(
            'make_known_occurrences',
            add_help=True,
            parents=[cls.parser_threads, cls.parser_verbosity],
            help="create a file with know occurrences")

        parser_vtam_makeKnownOccurrences.add_argument(
            '--asvtable',
            action='store',
            help="input an ASV table file (tsv format)",
            required=True,
        )
        #    type=lambda x: FileSampleInformation(x).check_args(
        #        header=header_paired_fastq))

        parser_vtam_makeKnownOccurrences.add_argument(
            '--sample_types',
            action='store',
            help="input a tsv file with the sample types",
            required=True,
        )
        #    type=lambda x: FileSampleInformation(x).check_args(
        #        header=header_paired_fastq))

        parser_vtam_makeKnownOccurrences.add_argument(
            '--mock_composition',
            action='store',
            help="input a tsv file with the mock composition",
            required=True,
        )
        #    type=lambda x: FileSampleInformation(x).check_args(
        #        header=header_paired_fastq))

        parser_vtam_makeKnownOccurrences.add_argument(
            '--known_occurrences',
            action='store',
            help=
            "Default: ./known_occurrences.tsv. Output TSV file with the known occurrences",
            required=False,
            default='./known_occurrences.tsv')

        parser_vtam_makeKnownOccurrences.add_argument(
            '--missing_occurrences',
            action='store',
            help=
            "Default: ./missing_occurrences.tsv. Output TSV file with the missing occurrences",
            required=False,
            default='./missing_occurrences.tsv')

        parser_vtam_makeKnownOccurrences.add_argument(
            '--habitat_proportion',
            action='store',
            help="Default: 0.5. Input a threshold for habitat proportion",
            required=False,
            default=0.5)

        # This default selects the sub-command to run
        parser_vtam_makeKnownOccurrences.set_defaults(
            command='make_known_occurrences')
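        # Illustrative invocation (entry point and paths are hypothetical):
        #   vtam make_known_occurrences --asvtable asvtable_taxa.tsv \
        #       --sample_types sample_types.tsv --mock_composition mock_composition.tsv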

    @classmethod
    def add_parser_pool(cls, subparsers):
        parser_vtam_pool_markers = subparsers.add_parser(
            'pool',
            add_help=True,
            parents=[
                cls.parser_params, cls.parser_log, cls.parser_threads,
                cls.parser_verbosity
            ],
            help=
            "pools amplicon sequence variants (ASVs) from different but overlapping markers"
        )

        parser_vtam_pool_markers.add_argument('--db',
                                              action='store',
                                              required=True,
                                              help="SQLITE file with DB")

        from vtam.utils.FileRunMarker import FileRunMarker
        parser_vtam_pool_markers.add_argument(
            '--runmarker',
            action='store',
            default=None,
            help=FileRunMarker.help(),
            required=True,
            type=lambda x: FileRunMarker(x).check_argument())

        parser_vtam_pool_markers.add_argument(
            '--asvtable',
            action='store',
            help=
            "output TSV file with pooled markers and their occurrences in biological samples",
            required=True)

        parser_vtam_pool_markers.add_argument(
            '--readcounts',
            action='store_true',
            help=
            "Default: False. If False, the presence/absence of reads in the sample is given. "
            "If True, the sum of read counts over the pooled runs and/or markers is given",
            required=False,
            default=False)

        # This default selects the sub-command to run
        parser_vtam_pool_markers.set_defaults(command='pool')
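        # Illustrative invocation (entry point and paths are hypothetical):
        #   vtam pool --db db.sqlite --runmarker pool_run_marker.tsv --asvtable asvtable_pooled.tsv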

    @classmethod
    def add_parser_taxonomy(cls, subparsers):
        parser_vtam_taxonomy = subparsers.add_parser(
            'taxonomy',
            add_help=True,
            parents=[],
            help="downloads a TSV file with the NCBI taxonomy information")

        parser_vtam_taxonomy.add_argument(
            '-o',
            '--output',
            dest='output',
            action='store',
            help="default: taxonomy.tsv. Path to TSV taxonomy file",
            required=False,
            default=os.path.join(os.getcwd(), 'taxonomy.tsv'))
        parser_vtam_taxonomy.add_argument(
            '--precomputed',
            dest='precomputed',
            action='store_true',
            default=False,
            help="default: False. Downloads precomputed taxonomy database, "
            "which is likely an older database",
            required=False)
        # This default selects the sub-command to run
        parser_vtam_taxonomy.set_defaults(command='taxonomy')
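        # Illustrative invocation (mirrors the example given in the taxassign help text):
        #   vtam taxonomy -o taxonomy.tsv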

    @classmethod
    def add_parser_coiblastdb(cls, subparsers):
        parser_vtam_coi_blast_db = subparsers.add_parser(
            'coi_blast_db',
            add_help=True,
            help=
            "downloads a precomputed BLAST database for the cytochrome C oxidase subunit I (COI) marker"
        )

        parser_vtam_coi_blast_db.add_argument(
            '--blastdbdir',
            dest='blastdbdir',
            action='store',
            help=
            "output directory with the Blast database files of the cytochrome C oxidase subunit I (COI) marker",
            required=False,
            default='blastdb')
        parser_vtam_coi_blast_db.add_argument(
            '--blastdbname',
            dest='blastdbname',
            action='store',
            help=
            "cytochrome C oxidase subunit I (COI) Blast database name among these current possibilities: coi_blast_db, coi_blast_db_20200420. Other versions if available can be found here: {}"
            .format(os.path.dirname(coi_blast_db_gz_url1)),
            required=False,
            default='coi_blast_db',
            type=lambda x: CommandBlastCOI(
                x).argparse_checker_blast_coi_blastdbname(),
        )
        # This default selects the sub-command to run
        parser_vtam_coi_blast_db.set_defaults(command='coi_blast_db')
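        # Illustrative invocation using the defaults defined above (the 'vtam' entry point is assumed):
        #   vtam coi_blast_db --blastdbdir blastdb --blastdbname coi_blast_db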