def setUp(self):
    asv_table_str = """variant_id	marker_name	run_name	sequence_length	read_count	sample1	sample2	sample3	chimera_borderline	sequence
3	MFZR	prerun	176	9713	9712	1	0	FALSE	TCTATATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCTCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
33	MFZR	prerun	174	9713	9703	10	0	FALSE	CTATATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCTCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
333	ZFZR	prerun	157	10000	9900	10	0	FALSE	TGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCTCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
836	MFZR	prerun	176	11588	123	56	0	FALSE	TCTATATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGGACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCCCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
8368	ZFZR	prerun	157	545	500	0	45	FALSE	TGCTTGGGCAGGTATGGTAGGGACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCCCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
83683	MFZR	prerun	175	484	0	28	456	FALSE	TCTAAATTTCATTTTTGGTGCTTGGGCAGGTATGGTAGGGACCTCATTAAGACTTTTAATTCGAGCCGAGTTGGGTAACCCGGGTTCATTAATTGGGGACGATCAAATTTATAACGTAATCGTAACTGCCCATGCCTTTATTATGATTTTTTTTATAGTGATACCTATTATAATT
"""
    asv_table_df = pandas.read_csv(io.StringIO(asv_table_str), sep="\t", header=0)
    self.asv_table_df = asv_table_df

    # Create this_tempdir
    this_tempdir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__))
    pathlib.Path(this_tempdir).mkdir(exist_ok=True)

    # Define fasta_path tsv_path
    fasta_path = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__), 'variants.fa')

    # Create variant variant_read_count_input_df
    variant_df = asv_table_df[['variant_id', 'sequence', 'read_count']].drop_duplicates(inplace=False)
    variant_df.columns = ['id', 'sequence', 'size']

    # Create fasta_path file from asv_table_df
    variant_df_utils = DataframeVariant(variant_df)
    variant_df_utils.to_fasta(fasta_path, add_column='size')

    # Define vsearch output tsv_path
    vsearch_output_path = os.path.join(
        PathManager.instance().get_tempdir(), os.path.basename(__file__), 'centroid_out.fa')

    # Define cluster output tsv_path
    vsearch_cluster_output_path = os.path.join(
        PathManager.instance().get_tempdir(), os.path.basename(__file__), 'cluster.fa')

    # Create object and run vsearch
    os.environ["VTAM_THREADS"] = "1"
    vsearch_parameters = {
        '--cluster_size': fasta_path,
        '--clusters': vsearch_cluster_output_path,
        '--id': 1,
        '--sizein': None,
        '--centroids': vsearch_output_path,
        "--threads": int(os.getenv('VTAM_THREADS')),
    }
    vsearch_cluster = RunnerVSearch(parameters=vsearch_parameters)
    vsearch_cluster.run()
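# For orientation, a minimal sketch (not part of the test) of how a parameter dict like
# the one above maps onto a vsearch command line. RunnerVSearch is assumed to do the
# equivalent internally, with None-valued keys rendered as bare flags such as --sizein.
def render_vsearch_cmd(parameters):
    parts = ['vsearch']
    for key, value in parameters.items():
        flag = key if key.startswith('--') else '--' + key
        parts.append(flag if value is None else '{} {}'.format(flag, value))
    return ' '.join(parts)

# render_vsearch_cmd(vsearch_parameters) would yield roughly:
# 'vsearch --cluster_size .../variants.fa --clusters .../cluster.fa --id 1 --sizein
#  --centroids .../centroid_out.fa --threads 1'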
def setUp(self): """>parent1;size=650 TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATAATTAGTTGG >parent2;size=700 AACTATGTACACAAATTTTAGTATATTGGCAGGGATAGTAGGAACTTTACTATCGTTAGTTATCAGAATGGAATTATCAA CAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG >Chimera1;size=50 TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG >Chimera2;size=300 TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTG CAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG >Chimera3;size=50 TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG""" """(vtam_appli) gonzalez@milan:~/tmp/vsearch_uchime$ vsearch --uchime_denovo i.fa --borderline borderline.fa --nonchimeras nonchimeras.fa --chimeras chimeras.fa vsearch v2.7.0_linux_x86_64, 15.5GB RAM, 8 cores https://github.com/torognes/vsearch Reading file i.fa 100% 1500 nt in 5 seqs, min 300, max 300, avg 300 Masking 100% Sorting by abundance 100% Counting k-mers 100% Detecting chimeras 100% Found 2 (40.0%) chimeras, 2 (40.0%) non-chimeras, and 1 (20.0%) borderline sequences in 5 unique sequences. 
Taking abundance information into account, this corresponds to 350 (20.0%) chimeras, 1350 (77.1%) non-chimeras, and 50 (2.9%) borderline sequences in 1750 total sequences""" # Input from min_replicate_number # Variants 1 and 2 are ok but 3-5 are chimeras self.variant_df = pandas.DataFrame( data={ 'sequence': [ 'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATAATTAGTTGG', 'AACTATGTACACAAATTTTAGTATATTGGCAGGGATAGTAGGAACTTTACTATCGTTAGTTATCAGAATGGAATTATCAACAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG', 'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG', 'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCAGGAAACATGTTAGATGGAGACGGTCAACAATATAACGTAATCGTAACCGCACATGGATTAATAATGATATTCTTCGTGGTTATGCCGGCAATGTTAGGAGGATTTGCAAACTGGTTCATACCAATAATGGTAGGATCACCAGATGTAGCTTTTCCAAGATTAAACAACATTAGCTTATGGTTAATATTATTGCCCCCTAGTTTATTATTATTAGTTGG', 'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG', ], }, index=list(range(1, 6)), ) # self.variant_read_count_df = pandas.DataFrame({ 'run_id': [1] * 5, 'marker_id': [1] * 5, 'sample_id': [1] * 5, 'replicate': [1] * 5, 'variant_id': list(range(1, 6)), 'read_count': [650, 700, 50, 350, 50], }) self.this_tempdir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__)) pathlib.Path(self.this_tempdir).mkdir(parents=True, exist_ok=True) os.environ['VTAM_THREADS'] = str(multiprocessing.cpu_count())
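# Side note: the ">name;size=N" headers in the docstring above follow the vsearch size
# annotation convention. A small illustrative sketch of parsing them back into
# (id, size) pairs with Biopython, as the chimera filter further below does:
from Bio import SeqIO

def read_size_annotated_fasta(fasta_path):
    sizes = {}
    for record in SeqIO.parse(fasta_path, "fasta"):
        name, _, size_field = record.id.partition(';')
        sizes[name] = int(size_field.replace('size=', ''))
    return sizes  # e.g. {'parent1': 650, 'parent2': 700, ...}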
def pip_install_vtam_for_tests():
    """This function is used in the tests when the vtam command is run"""
    cmd = '{} -m pip install . -q --upgrade'.format(sys.executable)
    if sys.platform.startswith("win"):
        args = cmd
    else:
        args = shlex.split(cmd)
    subprocess.run(args=args, check=True, cwd=PathManager.instance().get_project_path())
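# Hypothetical usage (the test class name below is illustrative, not from the source):
# a test that shells out to the 'vtam' command first installs the current checkout.
#
#   class TestCommandExample(unittest.TestCase):
#       @classmethod
#       def setUpClass(cls):
#           pip_install_vtam_for_tests()  # make the checked-out vtam runnable as a command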
def __init__(self, variant_fasta, blast_db_dir, blast_db_name, num_threads, qcov_hsp_perc):
    self.variant_fasta = variant_fasta
    self.blast_db_dir = blast_db_dir
    self.blast_db_name = blast_db_name
    # self.ltg_rule_threshold = ltg_rule_threshold
    # self.include_prop = include_prop
    # self.min_number_of_taxa = min_number_of_taxa
    self.num_threads = num_threads
    self.qcov_hsp_perc = qcov_hsp_perc
    self.this_temp_dir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__))
    pathlib.Path(self.this_temp_dir).mkdir(exist_ok=True, parents=True)
def __init__(self, variant_expected_df, variant_unexpected_df, variant_read_count_df):
    """Initializes the object for the PCR error filter

    :param variant_expected_df: DataFrame (id, sequence) with expected variants
    :param variant_unexpected_df: DataFrame (id, sequence) with unexpected variants
    :param variant_read_count_df: DataFrame (run_id, marker_id, sample_id, replicate, variant_id, read_count)
    """
    self.__variant_expected_df = variant_expected_df
    self.__variant_unexpected_df = variant_unexpected_df
    self.__variant_read_count_df = variant_read_count_df
    self.__tmp_dir = os.path.join(PathManager.instance().get_tempdir(), self.__class__.__name__)
    pathlib.Path(self.__tmp_dir).mkdir(parents=True, exist_ok=True)
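# Usage as seen in the FilterPCRerror wrapper further below: when screening a sample
# against itself, the same variant table is passed as both expected and unexpected.
#
#   runner = RunnerFilterPCRerror(
#       variant_expected_df=variant_per_sample_df,
#       variant_unexpected_df=variant_per_sample_df,
#       variant_read_count_df=variant_read_count_per_sample_df)
#   delete_df = runner.get_variant_read_count_delete_df(pcr_error_var_prop)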
def compute_clusters(self):
    tempcluster_dir = PathManager.instance().get_tempdir()
    i_fas = os.path.join(tempcluster_dir, 'cluster_input.fas')
    with open(i_fas, 'w') as fout:
        for idx, row in self.variant_info_df.iterrows():
            valdict = {}
            valdict['variant_id'] = row.variant_id
            valdict['read_count'] = row.read_count
            valdict['sequence'] = row.sequence
            fout.write(">{variant_id};size={read_count}\n{sequence}\n".format(**valdict))

    # Relative file names are resolved against cwd=tempcluster_dir below
    cmd = "vsearch --cluster_size cluster_input.fas --id {} --otutabout otutabout.txt --clusters test".format(
        self.cluster_identity)
    if sys.platform.startswith("win"):
        args = cmd
    else:
        args = shlex.split(cmd)
    subprocess.run(args=args, cwd=tempcluster_dir)

    otutabout_path = os.path.join(tempcluster_dir, "otutabout.txt")
    otutabout_df = pandas.read_csv(otutabout_path, sep="\t")
    otutabout_df.rename({'#OTU ID': 'centroid'}, axis=1, inplace=True)

    otutabout_long_df = pandas.melt(otutabout_df, id_vars=['centroid'],
                                    var_name='variant_id', value_name='read_count')
    otutabout_long_df.rename({'centroid': 'clusterid'}, axis=1, inplace=True)
    otutabout_long_df = otutabout_long_df.loc[otutabout_long_df.read_count > 0]
    otutabout_long_df.variant_id = otutabout_long_df.variant_id.astype('int')

    cluster_count_df = otutabout_long_df[['clusterid', 'variant_id']].groupby('clusterid').count()
    cluster_count_df.rename({'variant_id': 'clustersize'}, axis=1, inplace=True)
    cluster_count_df = otutabout_long_df[['clusterid', 'variant_id']].merge(cluster_count_df, on='clusterid')
    return cluster_count_df
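# Toy illustration (runnable without vsearch) of the reshaping done above: the wide
# otutabout table is melted to long format, zero counts are dropped, and the cluster
# size is the number of variants grouped under each centroid.
import io
import pandas

wide_df = pandas.read_csv(io.StringIO("#OTU ID\t1\t2\t3\n10\t650\t0\t50\n20\t0\t700\t0\n"), sep="\t")
wide_df.rename({'#OTU ID': 'clusterid'}, axis=1, inplace=True)
long_df = pandas.melt(wide_df, id_vars=['clusterid'], var_name='variant_id', value_name='read_count')
long_df = long_df.loc[long_df.read_count > 0]
# clusterid 10 keeps variants 1 and 3 (clustersize 2); clusterid 20 keeps variant 2 (clustersize 1)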
def run(self):
    session = self.session
    engine = session._session().get_bind()
    this_temp_dir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__))
    pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

    ############################################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    ############################################################################################

    # Input file paths
    known_occurrences_tsv = self.input_file(OptimizePCRerror.__input_file_known_occurrences)
    fasta_info_tsv = self.input_file(OptimizePCRerror.__input_file_sortedinfo)
    # Output file paths
    output_optimize_path = self.output_file(OptimizePCRerror.__output_file_optimize_pcr_error)

    ############################################################################################
    #
    # Get nijk_df, known_occurrences_df
    #
    ############################################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(VariantReadCount, engine=engine)
    known_occurrences_df = FileKnownOccurrences(known_occurrences_tsv).to_identifier_df(engine)

    ############################################################################################
    #
    # Run optimizer and write
    #
    ############################################################################################

    optimize_pcr_error_runner = RunnerOptimizePCRerror(
        variant_read_count_df=variant_read_count_df,
        known_occurrences_df=known_occurrences_df)
    optimize_pcr_error_runner.to_tsv(optimize_path=output_optimize_path, engine=engine)
def __init__(self, asv_table_df, readcounts, run_marker_df=None):
    """Constructor of the CommandPoolRunMarkers class

    Parameters
    ----------
    asv_table_df : pandas dataframe
        ASV table.
    readcounts : bool
        Default False. If False, a 0/1 boolean marks the presence or absence of a variant
        in the pooled table. If True, an integer gives the sum of reads in the pooled
        runs or markers.
    run_marker_df : pandas dataframe
        Output ASV table with pooled variants
    """
    header = {'run_name', 'marker_name', 'variant_id', 'sequence_length', 'read_count'}
    if not set(asv_table_df.columns) >= header:  # must contain at least these columns
        Logger.instance().error(
            VTAMexception(
                "The ASV table structure is wrong. It is expected to contain these columns: "
                "run_name, marker_name, variant_id, sequence_length, read_count"))
        sys.exit(1)

    # Sample columns sit between the five metadata columns and the last two
    # (chimera_borderline, sequence)
    self.sample_names = asv_table_df.columns.tolist()[5:-2]

    if run_marker_df is None:  # Default: pool all markers
        self.asv_table_df = asv_table_df
    else:  # Pool only the run-marker combinations given in run_marker_df
        self.asv_table_df = asv_table_df.merge(run_marker_df, on=['run_name', 'marker_name'])

    self.tmp_dir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__))
    pathlib.Path(self.tmp_dir).mkdir(exist_ok=True)

    self.cluster_path = None  # returned by run_vsearch_to_cluster_sequences
    self.cluster_df = None  # returned by get_vsearch_clusters_to_df
    self.readcounts = readcounts
def __init__(self, command, cli_args_dic):
    """
    :param command: takes one of two values: filter or optimize
    :param cli_args_dic: dictionary (CLIargumentDict.instance()) with the command arguments
    """
    self.command = command
    self.cli_args_and_numerical_params = {}
    self.cli_args_and_numerical_params.update(cli_args_dic)

    # Add user params.yml parameters
    params_dic = FileParams(cli_args_dic['params']).get_params_dic()
    self.cli_args_and_numerical_params.update(params_dic)

    self.wopfile_path = None
    self.tempdir = PathManager.instance().get_tempdir()
def __init__(self, taxonomy_tsv=None):
    """
    :param taxonomy_tsv: Path to the taxonomy_tsv. Default None
    :type taxonomy_tsv: str
    :rtype: None
    """
    if taxonomy_tsv is None:  # If None, download to the current working directory
        self.taxonomy_tsv_path = os.path.join(os.getcwd(), "taxonomy.tsv")
    else:  # Download to tsv_path
        self.taxonomy_tsv_path = taxonomy_tsv
        pathlib.Path(os.path.dirname(taxonomy_tsv)).mkdir(parents=True, exist_ok=True)

    self.tempdir = PathManager.instance().get_tempdir()
    package_path = os.path.join(PathManager.get_package_path())
    self.taxonomy_tsv_gz_path = os.path.join(package_path, "..", "data", "taxonomy.tsv.gz")
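# The gzipped taxonomy table bundled with the package is presumably inflated to
# taxonomy_tsv_path elsewhere in this class; a minimal decompression sketch:
import gzip
import shutil

def gunzip_to(src_gz_path, dst_path):
    with gzip.open(src_gz_path, 'rb') as fin, open(dst_path, 'wb') as fout:
        shutil.copyfileobj(fin, fout)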
def setUp(self):
    os.environ['VTAM_THREADS'] = str(multiprocessing.cpu_count())

    # Input from min_replicate_number
    self.variant_df = pandas.DataFrame(
        {
            'sequence': [
                'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                'TGTTCTTTATTTATTATTTGATGGTTTTGCTGGTGTTTTAGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTCGCTGTAACTTTATCATTATTAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
                'TGTTCTTTATTTATTATTTGCTGGTTTTGCTGGTGTTTTCGCTGTAACTTTATCATTATCAATTAGATTACAATTAGTTGCTACTGGGTATGGATGATTAGCTTTGAATTATCAATTTTATAACACTATTGTAACTGCTCATGGATTATTAATAGTATTTTTTCTCCTTATGCCTGCTTTAATAGGTGGTTTTGGTAATTGAATAGTTCCTGTTCTAATTGGTTCTATTGATATGGCTTACCCTAGATTAAATAATATTAGTTTTTGATTATTGCCCCCTAGTTTATTATTATTAGTTGG',
            ],
        },
        index=list(range(1, 5)))

    self.variant_read_count_df = pandas.DataFrame({
        'run_id': [1] * 8,
        'marker_id': [1] * 8,
        'sample_id': [1] * 8,
        'replicate': [1, 2] * 4,
        'variant_id': [1] * 2 + [2] * 2 + [3] * 2 + [4] * 2,
        'read_count': [350, 300, 300, 220, 60, 0, 2, 0],
    })

    self.this_tempdir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__))
    pathlib.Path(self.this_tempdir).mkdir(parents=True, exist_ok=True)
def run(self):
    session = self.session
    engine = session._session().get_bind()
    this_temp_dir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__))
    pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

    ############################################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    ############################################################################################

    # Input file
    fasta_info_tsv = self.input_file(FilterPCRerror.__input_file_sortedinfo)
    # Input table models
    input_filter_min_replicate_model = self.input_table(FilterPCRerror.__input_table_filter_min_replicate_number)
    # Options
    pcr_error_var_prop = self.option("pcr_error_var_prop")
    # Output table models
    output_filter_pcr_error_model = self.output_table(FilterPCRerror.__output_table_filter_pcr_error)

    ############################################################################################
    #
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for the current analysis
    # 2. Delete marker_name/run_name/sample/replicate from the variant_read_count_model
    # 3. Get the nijk_df input
    #
    ############################################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine, variant_read_count_like_model=output_filter_pcr_error_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_filter_min_replicate_model, engine=engine, filter_id=None)

    ############################################################################################
    #
    # Run per sample_id
    #
    ############################################################################################

    variant_df = sample_info_tsv_obj.get_variant_df(
        variant_read_count_like_model=input_filter_min_replicate_model, engine=engine)
    record_list = []
    run_marker_sample_df = variant_read_count_df[['run_id', 'marker_id', 'sample_id']].drop_duplicates()
    for row in run_marker_sample_df.itertuples():
        run_id = row.run_id
        marker_id = row.marker_id
        sample_id = row.sample_id

        # Get variant reads for the current run-marker-sample
        variant_read_count_per_sample_df = variant_read_count_df.loc[
            (variant_read_count_df.run_id == run_id)
            & (variant_read_count_df.marker_id == marker_id)
            & (variant_read_count_df.sample_id == sample_id)]
        variant_per_sample_df = variant_df.loc[variant_df.index.isin(
            variant_read_count_per_sample_df.variant_id.unique().tolist())]

        this_step_tmp_per_sample_dir = os.path.join(
            this_temp_dir, "run_{}_marker_{}_sample{}".format(run_id, marker_id, sample_id))
        pathlib.Path(this_step_tmp_per_sample_dir).mkdir(exist_ok=True)

        ########################################################################################
        #
        # Run vsearch and get the alignment variant_read_count_input_df
        #
        ########################################################################################

        filter_pcr_error_runner = RunnerFilterPCRerror(
            variant_expected_df=variant_per_sample_df,
            variant_unexpected_df=variant_per_sample_df,
            variant_read_count_df=variant_read_count_per_sample_df)
        filter_output_per_sample_df = filter_pcr_error_runner.get_variant_read_count_delete_df(
            pcr_error_var_prop)

        ########################################################################################
        #
        # Add the per-sample records to the record list
        #
        ########################################################################################

        record_per_sample_list = ModelVariantReadCountLike.filter_delete_df_to_dict(
            filter_output_per_sample_df)
        record_list = record_list + record_per_sample_list

    variant_read_count_delete_df = pandas.DataFrame.from_records(data=record_list)

    ############################################################################################
    #
    # 5. Write to the DB
    # 6. Touch the output tables to update their modification date
    # 7. Exit vtam if all variants are deleted
    #
    ############################################################################################

    DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
        engine=engine, variant_read_count_like_model=output_filter_pcr_error_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update({'id': obj.id})
        session.commit()

    if variant_read_count_delete_df.filter_delete.sum() == variant_read_count_delete_df.shape[0]:
        Logger.instance().warning(
            VTAMexception("This filter has deleted all the variants: {}. "
                          "The analysis will stop here.".format(self.__class__.__name__)))
        sys.exit(0)
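# The per-table 'touch' above (updating id to its own value) only bumps the table's
# modification date so that downstream rules re-run. Factored out as a sketch, with an
# added guard for empty tables that the wrapper itself does not need:
def touch_table(session, declarative_meta):
    obj = session.query(declarative_meta).order_by(declarative_meta.id.desc()).first()
    if obj is not None:  # guard added in this sketch only
        session.query(declarative_meta).filter_by(id=obj.id).update({'id': obj.id})
    session.commit()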
def main(fastainfo, fastadir, sorteddir, params=None, num_threads=multiprocessing.cpu_count()):

    if sys.platform.startswith('win'):
        num_threads = 1

    ############################################################################################
    #
    # params.yml parameters
    #
    ############################################################################################

    params_dic = FileParams(params).get_params_dic()
    cutadapt_error_rate = params_dic['cutadapt_error_rate']
    cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
    cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

    ############################################################################################
    #
    # Loop over tag and primer pairs to demultiplex and trim reads
    #
    ############################################################################################

    merged_fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df()
    pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
    tempdir = PathManager.instance().get_tempdir()

    sorted_read_info_df = pandas.DataFrame()

    for i in range(0, merged_fastainfo_df.shape[0]):
        fasta_info_series = merged_fastainfo_df.iloc[i]

        tag_fwd = fasta_info_series.tagfwd
        tag_rev = fasta_info_series.tagrev
        primer_fwd = fasta_info_series.primerfwd
        primer_rev = fasta_info_series.primerrev

        in_fasta_basename = fasta_info_series.mergedfasta
        Logger.instance().debug("Analysing FASTA file: {}".format(in_fasta_basename))
        fasta_info_df_i = fasta_info_series.to_frame().T
        in_raw_fasta_path = os.path.join(fastadir, in_fasta_basename)

        ########################################################################################
        #
        # Cut adapt tag of forward reads
        # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
        # --front 'tcgatcacgatgt;min_overlap=13...gctgtagatcgaca;min_overlap=14'
        # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
        # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
        #
        ########################################################################################

        if generic_dna:  # Biopython <1.78
            tag_rev_rc = str(Seq(tag_rev, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            tag_rev_rc = str(Seq(tag_rev).reverse_complement())

        out_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
            '.fasta', '_sorted_%03d.fasta' % i)
        out_fasta_path = os.path.join(tempdir, out_fasta_basename)

        cmd_cutadapt_tag_dic = {
            'tag_fwd': tag_fwd,
            'tag_fwd_len': len(tag_fwd),
            'tag_rev_rc': tag_rev_rc,
            'tag_rev_rc_len': len(tag_rev_rc),
            'in_fasta_path': in_raw_fasta_path,
            'out_fasta': out_fasta_path,
            'num_threads': num_threads,
        }
        cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
            '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
            '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)
        Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

        if sys.platform.startswith("win"):
            args = cmd_cutadapt_tag_str
        else:
            args = shlex.split(cmd_cutadapt_tag_str)
        run_result = subprocess.run(args=args, capture_output=True, check=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ########################################################################################
        #
        # Trim primers from output
        # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only
        # --front 'TCCACTAATCACAARGATATTGGTAC;min_overlap=26...GGAGGATTTGGWAATTGATTAGTW;min_overlap=24'
        # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_trimmed_000.fasta
        # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_sorted_000.fasta
        #
        ########################################################################################

        if generic_dna:  # Biopython <1.78
            primer_rev_rc = str(Seq(primer_rev, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            primer_rev_rc = str(Seq(primer_rev).reverse_complement())

        in_fasta_path = out_fasta_path
        out_fasta_basename = os.path.basename(in_fasta_path).replace(
            '_sorted_%03d.fasta' % i, '_sorted_trimmed_%03d.fasta' % i)
        out_fasta_path = os.path.join(tempdir, out_fasta_basename)

        cmd_cutadapt_primer_dic = {
            'primer_fwd': primer_fwd,
            'primer_fwd_len': len(primer_fwd),
            'primer_rev_rc': primer_rev_rc,
            'primer_rev_rc_len': len(primer_rev_rc),
            'in_fasta_path': in_fasta_path,
            'out_fasta': out_fasta_path,
            'error_rate': cutadapt_error_rate,
            'read_min_length': cutadapt_minimum_length,
            'read_max_length': cutadapt_maximum_length,
            'num_threads': num_threads,
        }
        cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
            '--minimum-length {read_min_length} ' \
            '--maximum-length {read_max_length} --trimmed-only ' \
            '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \
            '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)
        Logger.instance().debug("Running: {}".format(cmd_cutadapt_primer_str))

        if sys.platform.startswith("win"):
            args = cmd_cutadapt_primer_str
        else:
            args = shlex.split(cmd_cutadapt_primer_str)
        run_result = subprocess.run(args=args, capture_output=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ########################################################################################
        #
        # Cut adapt tag of reverse-complement reads
        # cutadapt --cores=8 --no-indels --error-rate 0 --trimmed-only
        # --front 'tgtcgatctacagc;min_overlap=14...acatcgtgatcga;min_overlap=13'
        # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
        # out/control_mfzr/merged/MFZR1_S4_L001_R1_001_merged.fasta
        #
        ########################################################################################

        if generic_dna:  # Biopython <1.78
            tag_fwd_rc = str(Seq(tag_fwd, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            tag_fwd_rc = str(Seq(tag_fwd).reverse_complement())

        out_rc_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
            '.fasta', '_rc_sorted_%03d.fasta' % i)
        out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

        cmd_cutadapt_tag_dic = {
            'tag_fwd': tag_rev,
            'tag_fwd_len': len(tag_rev),
            'tag_rev_rc': tag_fwd_rc,
            'tag_rev_rc_len': len(tag_fwd_rc),
            'in_fasta_path': in_raw_fasta_path,
            'out_fasta': out_rc_fasta_path,
            'num_threads': num_threads,
        }
        cmd_cutadapt_tag_str = 'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
            '--front "{tag_fwd};min_overlap={tag_fwd_len}...{tag_rev_rc};min_overlap={tag_rev_rc_len}" ' \
            '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_tag_dic)
        Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

        if sys.platform.startswith("win"):
            args = cmd_cutadapt_tag_str
        else:
            args = shlex.split(cmd_cutadapt_tag_str)
        run_result = subprocess.run(args=args, capture_output=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ###################################################################
        #
        # Trim primers from output
        # cutadapt --cores=8 --no-indels --error-rate 0.1 --minimum-length 50 --maximum-length 500 --trimmed-only
        # --front 'WACTAATCAATTWCCAAATCCTCC;min_overlap=24...GTACCAATATCYTTGTGATTAGTGGA;min_overlap=26'
        # --output /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_trimmed_000.fasta
        # /tmp/tmpcqlhktae/MFZR1_S4_L001_R1_001_merged_rc_sorted_000.fasta
        #
        ###################################################################

        if generic_dna:  # Biopython <1.78
            primer_fwd_rc = str(Seq(primer_fwd, generic_dna).reverse_complement())
        else:  # Biopython >=1.78
            primer_fwd_rc = str(Seq(primer_fwd).reverse_complement())

        in_fasta_path = out_rc_fasta_path
        out_rc_fasta_basename = os.path.basename(in_fasta_path).replace(
            '_rc_sorted_%03d.fasta' % i, '_rc_sorted_trimmed_%03d.fasta' % i)
        out_rc_fasta_path = os.path.join(tempdir, out_rc_fasta_basename)

        cmd_cutadapt_primer_dic = {
            'primer_fwd': primer_rev,
            'primer_fwd_len': len(primer_rev),
            'primer_rev_rc': primer_fwd_rc,
            'primer_rev_rc_len': len(primer_fwd_rc),
            'in_fasta_path': in_fasta_path,
            'out_fasta': out_rc_fasta_path,
            'error_rate': cutadapt_error_rate,
            'read_min_length': cutadapt_minimum_length,
            'read_max_length': cutadapt_maximum_length,
            'num_threads': num_threads,
        }
        cmd_cutadapt_primer_str = 'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
            '--minimum-length {read_min_length} ' \
            '--maximum-length {read_max_length} --trimmed-only ' \
            '--front "{primer_fwd};min_overlap={primer_fwd_len}...{primer_rev_rc};min_overlap={primer_rev_rc_len}" ' \
            '--output {out_fasta} {in_fasta_path}'.format(**cmd_cutadapt_primer_dic)
        Logger.instance().debug("Running: {}".format(cmd_cutadapt_primer_str))

        if sys.platform.startswith("win"):
            args = cmd_cutadapt_primer_str
        else:
            args = shlex.split(cmd_cutadapt_primer_str)
        run_result = subprocess.run(args=args, capture_output=True)
        Logger.instance().info(run_result.stdout.decode())
        Logger.instance().info(run_result.stderr.decode())

        ###################################################################
        #
        # Reverse-complement back the rc FASTA and pool
        #
        ###################################################################

        out_final_fasta_basename = os.path.basename(in_raw_fasta_path).replace(
            '.fasta', '_%03d.fasta' % i)
        out_final_fasta_path = os.path.join(sorteddir, out_final_fasta_basename)
        shutil.copy(out_fasta_path, out_final_fasta_path)

        Logger.instance().debug("Pooling fwd and rc reads...")
        with open(out_final_fasta_path, 'a') as fout:
            with open(out_rc_fasta_path, 'r') as fin:
                for line in fin:
                    if not line.startswith('>'):
                        if generic_dna:  # Biopython <1.78
                            fout.write("%s\n" % str(
                                Seq(line.strip(), generic_dna).reverse_complement()))
                        else:  # Biopython >=1.78
                            fout.write("%s\n" % str(Seq(line.strip()).reverse_complement()))
                    else:
                        fout.write(line)

        fasta_info_df_i = fasta_info_df_i[['run', 'marker', 'sample', 'replicate']]
        fasta_info_df_i['sortedfasta'] = out_final_fasta_basename
        sorted_read_info_df = pandas.concat([sorted_read_info_df, fasta_info_df_i], axis=0)

    fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
    sorted_read_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True, index=False)
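# The Biopython version branching repeated above could be consolidated into one helper.
# A sketch relying on the fact that Bio.Alphabet (and generic_dna) was removed in
# Biopython 1.78, which is what the 'if generic_dna' test already encodes:
from Bio.Seq import Seq

def reverse_complement_str(sequence, generic_dna=None):
    if generic_dna:  # Biopython <1.78
        return str(Seq(sequence, generic_dna).reverse_complement())
    return str(Seq(sequence).reverse_complement())  # Biopython >=1.78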
def setUp(self):
    self.tempdir = PathManager.instance().get_tempdir()
    pathlib.Path(self.tempdir).mkdir(parents=True, exist_ok=True)
def __init__(self, sequence_list, taxonomy, blast_db_dir, blast_db_name, num_threads, params):
    """
    Parameters
    ----------
    sequence_list : list
        List of variant sequences to tax-assign.
    taxonomy : Taxonomy
        Taxonomy object giving the lineage of each tax_id.
    blast_db_dir : str
        Directory with the BLAST database.
    blast_db_name : str
        Name of the BLAST database.
    num_threads : int
        Number of threads for BLAST.
    params : str
        Path to the params.yml file.
    """
    self.old_tax_id_df = taxonomy.old_tax_df
    self.taxonomy_df = taxonomy.df
    self.blast_db_dir = blast_db_dir
    self.this_temp_dir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__))
    pathlib.Path(self.this_temp_dir).mkdir(exist_ok=True)
    self.num_threads = num_threads

    #######################################################################
    #
    # Parameters
    #
    #######################################################################

    params_dic = FileParams(params).get_params_dic()
    qcov_hsp_perc = params_dic['qcov_hsp_perc']

    #######################################################################
    #
    # 2 Create FASTA file with variants
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Create SortedReadFile from variants".format(
            __file__, inspect.currentframe().f_lineno))
    variant_fasta = os.path.join(self.this_temp_dir, 'variant.fasta')
    with open(variant_fasta, 'w') as fout:
        for seq in sequence_list:
            # The sequence string itself is used as the FASTA id, so BLAST hits can be
            # traced back to sequences without a separate id map
            fout.write(">{}\n{}\n".format(seq, seq))

    #######################################################################
    #
    # 3 Run local blast
    #
    #######################################################################

    runner_blast = RunnerBlast(variant_fasta, blast_db_dir, blast_db_name, num_threads, qcov_hsp_perc)
    # run blast
    blast_output_tsv = runner_blast.run_local_blast()
    # process blast results
    blast_output_df = RunnerBlast.process_blast_result(blast_output_tsv)

    #######################################################################
    #
    # Compute tax lineages for BLAST target tax ids
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Open taxonomy.tsv DB".format(
            __file__, inspect.currentframe().f_lineno))
    blast_output_df.target_tax_id = pandas.to_numeric(blast_output_df.target_tax_id)
    Logger.instance().debug(
        "file: {}; line: {}; Annotate each target_tax_id with its lineage as columns in wide format"
        .format(__file__, inspect.currentframe().f_lineno))
    tax_id_list = blast_output_df.target_tax_id.unique().tolist()
    tax_id_to_lineage_df = taxonomy.get_several_tax_id_lineages(tax_id_list)

    #######################################################################
    #
    # Merge tax lineages and the blast result
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Merge blast result including tax_id with their lineages"
        .format(__file__, inspect.currentframe().f_lineno))
    # Merge local blast output with tax_id_to_lineage_df
    # variant_identity_lineage_df = blast_output_df.merge(
    #     tax_id_to_lineage_df, left_on='target_tax_id', right_on='tax_id')
    variantid_identity_lineage_df = blast_output_df.merge(
        tax_id_to_lineage_df, left_on='target_tax_id', right_index=True)
    # variant_identity_lineage_df.drop('tax_id', axis=1, inplace=True)
    """(Pdb) variant_identity_lineage_df.columns
    Index(['variant_id', 'target_id', 'identity', 'evalue', 'coverage',
           'target_tax_id', 'no rank', 'species', 'genus', 'family', 'order',
           'class', 'subphylum', 'phylum', 'subkingdom', 'kingdom', 'superkingdom',
           'superfamily', 'infraorder', 'suborder', 'infraclass', 'subclass',
           'tribe', 'subfamily', 'cohort', 'subgenus', 'subspecies', 'parvorder',
           'superorder', 'subcohort', 'superclass', 'species group', 'subtribe',
           'section', 'varietas', 'species subgroup'],
          dtype='object')"""

    #######################################################################
    #
    # several_variants_to_ltg
    # This function returns a data frame containing the LTG rank and LTG tax_id for each variant
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Main loop over variant and identity to "
        "compute the whole set of ltg_tax_id and ltg_rank for each variant_id "
        "to a dataframe".format(__file__, inspect.currentframe().f_lineno))
    runner_ltg_selection = RunnerLTGselection(
        variant_identity_lineage_df=variantid_identity_lineage_df,
        taxonomy_df=self.taxonomy_df, params=params)
    self.ltg_df = runner_ltg_selection.several_variants_to_ltg()
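# Hedged usage sketch (paths and the sequence below are placeholders): this mirrors how
# CommandTaxAssign further below drives the runner and retrieves one LTG row per sequence.
#
#   taxonomy = Taxonomy(tsv="taxonomy.tsv")
#   runner = RunnerTaxAssign(sequence_list=["TGTTCTTTATT..."], taxonomy=taxonomy,
#                            blast_db_dir="blastdb_dir", blast_db_name="coi_blast_db",
#                            num_threads=1, params=None)
#   ltg_df = runner.ltg_df  # columns include ltg_tax_id, ltg_tax_name, ltg_rank, identity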
def __init__(self, sys_argv):

    ############################################################################################
    #
    # Parse arguments
    #
    ############################################################################################

    self.sys_argv = sys_argv
    # AG: do not use abspath for the moment. Maybe later it can be used as an option
    parser = ArgParser.get_main_arg_parser()
    self.args = parser.parse_args(sys_argv)
    arg_parser_dic = vars(self.args)

    ############################################################################################
    #
    # If not specified, initiate params.yml
    #
    ############################################################################################

    if 'params' in arg_parser_dic and arg_parser_dic['params'] is None:
        params_yml = os.path.join(PathManager.instance().get_configdir(), "params.yml")
        if not os.path.isfile(params_yml):
            pathlib.Path(params_yml).touch(exist_ok=False)
        arg_parser_dic['params'] = params_yml

    ############################################################################################
    #
    # Parse log arguments
    #
    ############################################################################################

    if 'log_verbosity' in arg_parser_dic:
        (LoggerArguments.instance()).update({'log_verbosity': arg_parser_dic['log_verbosity']})
        os.environ['VTAM_LOG_VERBOSITY'] = str(arg_parser_dic['log_verbosity'])

    if 'log' in arg_parser_dic:
        (LoggerArguments.instance()).update({'log': arg_parser_dic['log']})
        os.environ['VTAM_LOG_FILE'] = str(arg_parser_dic['log'])

    #######################################################################
    #
    # Set arguments, logger
    #
    #######################################################################

    # Some arguments will be passed through environment variables
    if 'threads' in arg_parser_dic:
        os.environ['VTAM_THREADS'] = str(arg_parser_dic['threads'])

    ############################################################################################
    #
    # Subcommands: wopfile-dependent, filter, optimize
    #
    ############################################################################################

    if arg_parser_dic['command'] in ['filter', 'optimize']:

        if arg_parser_dic['command'] in ['filter']:

            ####################################################################################
            #
            # Verify coherence of the --lfn_variant_replicate and params arguments
            #
            ####################################################################################

            with open(arg_parser_dic['params']) as fin:
                # SafeLoader handles the conversion from YAML scalar values
                # to the Python dictionary format
                params_dic = yaml.load(fin, Loader=yaml.SafeLoader) or {}

                if arg_parser_dic['lfn_variant_replicate']:
                    if 'lfn_variant_cutoff' in params_dic:
                        Logger.instance().error(VTAMexception(
                            'The parameter "lfn_variant_cutoff" in the parameter file "{}" is incompatible with'
                            ' the --lfn_variant_replicate argument.'.format(arg_parser_dic['params'])))
                        sys.exit(1)
                else:
                    if 'lfn_variant_replicate_cutoff' in params_dic:
                        Logger.instance().error(VTAMexception(
                            'The parameter "lfn_variant_replicate_cutoff" in the parameter file "{}" needs'
                            ' the --lfn_variant_replicate argument.'.format(arg_parser_dic['params'])))
                        sys.exit(1)

            ####################################################################################
            #
            # Verify coherence of the --lfn_variant_replicate and cutoff_specific arguments
            #
            ####################################################################################

            if not (arg_parser_dic['cutoff_specific'] is None):  # cutoff_specific argument given
                if arg_parser_dic['lfn_variant_replicate']:  # lfn_variant_replicate
                    # cutoff_specific for lfn_variant
                    if not FileCutoffSpecific(arg_parser_dic['cutoff_specific']).is_compatible_lfn_variant_replicate():
                        Logger.instance().error(
                            'The --lfn_variant_replicate argument is incompatible with the cutoff_specific file {}.'.format(
                                arg_parser_dic['cutoff_specific']))
                        sys.exit(1)
                else:  # lfn_variant
                    # cutoff_specific for lfn_variant_replicate
                    if FileCutoffSpecific(arg_parser_dic['cutoff_specific']).is_compatible_lfn_variant_replicate():
                        Logger.instance().error(
                            'The cutoff_specific file {} requires the --lfn_variant_replicate argument.'.format(
                                arg_parser_dic['cutoff_specific']))
                        sys.exit(1)

        ########################################################################################
        #
        # If not specified, initiate cutoff specific
        #
        ########################################################################################

        if arg_parser_dic['cutoff_specific'] is None:
            cutoff_specific_tsv = os.path.join(PathManager.instance().get_configdir(), "cutoff_specific.tsv")
            if not os.path.isfile(cutoff_specific_tsv):
                pathlib.Path(cutoff_specific_tsv).touch(exist_ok=False)
            arg_parser_dic['cutoff_specific'] = cutoff_specific_tsv

        CommandFilterOptimize.main(arg_parser_dic=arg_parser_dic)

    ############################################################################################
    #
    # Subcommand: example
    #
    ############################################################################################

    elif arg_parser_dic['command'] == 'example':
        outdir = arg_parser_dic['outdir']
        CommandExample.main(outdir=outdir)

    ############################################################################################
    #
    # Subcommand: merge
    #
    ############################################################################################

    elif arg_parser_dic['command'] == 'merge':
        fastqinfo = arg_parser_dic['fastqinfo']
        fastqdir = arg_parser_dic['fastqdir']
        fastainfo = arg_parser_dic['fastainfo']
        fastadir = arg_parser_dic['fastadir']
        num_threads = arg_parser_dic['threads']
        params = arg_parser_dic['params']
        CommandMerge.main(fastqinfo=fastqinfo, fastqdir=fastqdir, fastainfo=fastainfo,
                          fastadir=fastadir, params=params, num_threads=num_threads)

    ############################################################################################
    #
    # Subcommand: sortreads
    #
    ############################################################################################

    elif arg_parser_dic['command'] == 'sortreads':
        fastadir = arg_parser_dic['fastadir']
        fastainfo = arg_parser_dic['fastainfo']
        sorteddir = arg_parser_dic['sorteddir']
        num_threads = arg_parser_dic['threads']
        params = arg_parser_dic['params']
        CommandSortReads.main(fastainfo=fastainfo, fastadir=fastadir, params=params,
                              num_threads=num_threads, sorteddir=sorteddir)

    ############################################################################################
    #
    # Subcommand: taxassign
    #
    ############################################################################################

    elif arg_parser_dic['command'] == 'taxassign':
        db = arg_parser_dic['db']
        asvtable_tsv = arg_parser_dic['asvtable']
        output = arg_parser_dic['output']
        mode = arg_parser_dic['mode']
        taxonomy_tsv = arg_parser_dic['taxonomy']
        blasdb_dir_path = arg_parser_dic['blastdbdir']
        blastdbname_str = arg_parser_dic['blastdbname']
        num_threads = arg_parser_dic['threads']
        params = arg_parser_dic['params']
        CommandTaxAssign.main(db=db, mode=mode, asvtable_tsv=asvtable_tsv, output=output,
                              taxonomy_tsv=taxonomy_tsv, blastdb_dir_path=blasdb_dir_path,
                              blastdbname_str=blastdbname_str, params=params,
                              num_threads=num_threads)

    ############################################################################################
    #
    # Subcommand: pool
    #
    ############################################################################################

    elif arg_parser_dic['command'] == 'pool':
        db = arg_parser_dic['db']
        readcounts = arg_parser_dic['readcounts']
        run_marker_tsv = arg_parser_dic['runmarker']
        pooled_marker_tsv = arg_parser_dic['asvtable']
        params = arg_parser_dic['params']
        CommandPoolRunMarkers.main(db=db, pooled_marker_tsv=pooled_marker_tsv,
                                   run_marker_tsv=run_marker_tsv, params=params,
                                   readcounts=readcounts)

    ############################################################################################
    #
    # Subcommand: taxonomy
    #
    ############################################################################################

    elif arg_parser_dic['command'] == 'taxonomy':
        taxonomy_tsv = arg_parser_dic['output']
        precomputed = arg_parser_dic['precomputed']
        taxonomy = CommandTaxonomy(taxonomy_tsv=taxonomy_tsv)
        taxonomy.main(precomputed=precomputed)

    ############################################################################################
    #
    # Subcommand: coi blast
    #
    ############################################################################################

    elif arg_parser_dic['command'] == 'coi_blast_db':
        blastdbdir = arg_parser_dic['blastdbdir']
        blastdbname = arg_parser_dic['blastdbname']
        coi_blast_db = CommandBlastCOI(blastdbname=blastdbname)
        coi_blast_db.download(blastdbdir=blastdbdir)

    ############################################################################################
    #
    # Else: print the usage message
    #
    ############################################################################################

    else:
        self.args = parser.parse_args(['--help'])  # if the command is unknown, print help
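# Hedged sketch of an entry point driving this dispatcher; the class name 'VTAM' is an
# assumption here, since the class owning this __init__ is not shown in this excerpt:
#
#   def main():
#       VTAM(sys.argv[1:])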
def get_variant_read_count_delete_df(self, variant_df, uchime3_denovo_abskew):
    temp_dir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__))
    pathlib.Path(temp_dir).mkdir(exist_ok=True)

    filter_output_chimera_df = self.variant_read_count_df.copy()
    filter_output_chimera_df['filter_delete'] = False
    filter_output_borderline_df = self.variant_read_count_df.copy()
    filter_output_borderline_df['filter_delete'] = False

    run_marker_sample_df = self.variant_read_count_df[[
        'run_id', 'marker_id', 'sample_id']].drop_duplicates(inplace=False)
    for row in run_marker_sample_df.itertuples():
        run_id = row.run_id
        marker_id = row.marker_id
        sample_id = row.sample_id

        variant_read_count_df = self.variant_read_count_df.loc[
            (self.variant_read_count_df.run_id == run_id)
            & (self.variant_read_count_df.marker_id == marker_id)
            & (self.variant_read_count_df.sample_id == sample_id)]
        variant_read_count_df_obj = DataframeVariantReadCountLike(
            variant_read_count_df=variant_read_count_df)

        N_i_df = variant_read_count_df_obj.get_N_i_df()

        variant_size_df = variant_df.merge(N_i_df, left_index=True, right_on='variant_id')
        variant_size_df = variant_size_df[['variant_id', 'sequence', 'N_i']]
        variant_size_df.rename(columns={'N_i': 'size'}, inplace=True)
        variant_size_df.set_index('variant_id', inplace=True)

        ###################################################################
        #
        # Sort variants by abundance and write to fasta_path
        #
        ###################################################################

        variant_size_df.sort_values(by='size', ascending=False, inplace=True)
        variant_df_utils_obj = DataframeVariant(variant_size_df)
        uchime_fasta_path = os.path.join(
            temp_dir, 'run_{}_marker_{}_sample_{}.fasta'.format(run_id, marker_id, sample_id))
        variant_df_utils_obj.to_fasta(fasta_path=uchime_fasta_path, add_column="size")

        ###################################################################
        #
        # Run uchime3_denovo
        #
        ###################################################################

        uchime_borderline_fasta_path = os.path.join(
            temp_dir, 'run_{}_marker_{}_sample_{}_borderline.fasta'.format(run_id, marker_id, sample_id))
        uchime_nonchimeras_fasta_path = os.path.join(
            temp_dir, 'run_{}_marker_{}_sample_id_{}_nonchimeras.fasta'.format(run_id, marker_id, sample_id))
        uchime_chimeras_fasta_path = os.path.join(
            temp_dir, 'run_{}_marker_{}_sample_{}_chimeras.fasta'.format(run_id, marker_id, sample_id))

        # Create object and run vsearch
        vsearch_parameters = {
            'uchime3_denovo': uchime_fasta_path,
            'borderline': uchime_borderline_fasta_path,
            'nonchimeras': uchime_nonchimeras_fasta_path,
            'chimeras': uchime_chimeras_fasta_path,
            'abskew': uchime3_denovo_abskew,
        }
        vsearch_cluster = RunnerVSearch(parameters=vsearch_parameters)
        vsearch_cluster.run()

        ###################################################################
        #
        # 4. Delete variant from replicate/sample if chimera
        #
        ###################################################################

        Logger.instance().debug(
            "Vsearch uchime chimera tsv_path: {}".format(uchime_chimeras_fasta_path))
        with open(uchime_chimeras_fasta_path, "r") as handle:
            for chimera_seqrecord in SeqIO.parse(handle, "fasta"):
                variant_id = int(chimera_seqrecord.id.split(';')[0])
                filter_output_chimera_df.loc[
                    (filter_output_chimera_df['run_id'] == run_id)
                    & (filter_output_chimera_df['marker_id'] == marker_id)
                    & (filter_output_chimera_df['sample_id'] == sample_id)
                    & (filter_output_chimera_df['variant_id'] == variant_id),
                    'filter_delete'] = True

        Logger.instance().debug(
            "Vsearch uchime chimera borderline tsv_path: {}".format(uchime_borderline_fasta_path))
        with open(uchime_borderline_fasta_path, "r") as handle:
            for chimera_seqrecord in SeqIO.parse(handle, "fasta"):
                variant_id = int(chimera_seqrecord.id.split(';')[0])
                filter_output_borderline_df.loc[
                    (filter_output_borderline_df['run_id'] == run_id)
                    & (filter_output_borderline_df['marker_id'] == marker_id)
                    & (filter_output_borderline_df['sample_id'] == sample_id)
                    & (filter_output_borderline_df['variant_id'] == variant_id),
                    'filter_delete'] = True

    return filter_output_chimera_df, filter_output_borderline_df
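# Downstream, keeping the surviving occurrences is just a boolean selection on the
# returned frames; a sketch using the column names defined above:
#
#   passed_df = filter_output_chimera_df.loc[
#       ~filter_output_chimera_df.filter_delete,
#       ['run_id', 'marker_id', 'sample_id', 'replicate', 'variant_id', 'read_count']]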
def main(cls, db, mode, asvtable_tsv, output, taxonomy_tsv, blastdb_dir_path, blastdbname_str, num_threads=multiprocessing.cpu_count(), params=None): """ Parameters ---------- db: str Path to SQLITE database with Variant and Taxassign tables mode asvtable_tsv output taxonomy_tsv blastdb_dir_path blastdbname_str num_threads params Returns ------- """ this_temp_dir = os.path.join(PathManager.instance().get_tempdir(), os.path.basename(__file__)) pathlib.Path(this_temp_dir).mkdir(exist_ok=True) ####################################################################### # # Parameters # ####################################################################### # params_dic = constants.get_params_default_dic() # params_dic = FileParams(params).get_params_dic() # ltg_rule_threshold = params_dic['ltg_rule_threshold'] # include_prop = params_dic['include_prop'] # min_number_of_taxa = params_dic['min_number_of_taxa'] # qcov_hsp_perc = params_dic['qcov_hsp_perc'] ####################################################################### # # Load db and tables as classes and delete taxassign in reset mode # ####################################################################### engine = sqlalchemy.create_engine('sqlite:///{}'.format(db), echo=False) variant_declarative_table = Variant.__table__ variant_declarative_table.create(bind=engine, checkfirst=True) tax_assign_declarative_table = TaxAssign.__table__ tax_assign_declarative_table.create(bind=engine, checkfirst=True) if mode == 'reset': with engine.connect() as conn: conn.execute(tax_assign_declarative_table.delete()) ####################################################################### # # Use variants that are not already already assigned in TaxAssign # ####################################################################### variant_input_df = pandas.read_csv(asvtable_tsv, sep="\t", header=0) # get list of variant sequences variant_sequence_list = variant_input_df.sequence.tolist() # Add variant to DB if not already there for variant_sequence in variant_sequence_list: with engine.connect() as conn: row_variant = conn.execute( sqlalchemy.select([ variant_declarative_table.c.id ]).where(variant_declarative_table.c.sequence == variant_sequence)).first() if row_variant is None: # variant_sequence IS NOT in the database, so INSERT it conn.execute(variant_declarative_table.insert().values( sequence=variant_sequence)) ####################################################################### # # Get already tax-assigned variants with all informations including sequence # ####################################################################### stmt_variant_tax_assign = sqlalchemy.select([ tax_assign_declarative_table.c.variant_id, tax_assign_declarative_table.c.identity, tax_assign_declarative_table.c.ltg_rank, tax_assign_declarative_table.c.ltg_tax_id, tax_assign_declarative_table.c.ltg_tax_name, tax_assign_declarative_table.c.blast_db, variant_declarative_table.c.sequence, ])\ .where(tax_assign_declarative_table.c.ltg_tax_id.isnot(None))\ .where(tax_assign_declarative_table.c.variant_id == variant_declarative_table.c.id)\ .where(variant_declarative_table.c.sequence.in_(variant_sequence_list))\ .distinct() # These are the variants that are already in taxassign and do not need # recalculate ltg_from_db_list = [] with engine.connect() as conn: for row in conn.execute(stmt_variant_tax_assign).fetchall(): ltg_from_db_list.append(dict(zip(row.keys(), row.values()))) """(Pdb) pandas.DataFrame.from_records(ltg_from_db_list) identity ltg_rank ltg_tax_id ltg_tax_name 
sequence variant_id 0 100 species 2028017 Orthocladiinae sp. BAP34 AGCATGATCTGGAATAGTAGGTACTTCCCTTAGTATCTTAATTCGA... 325 1 99 species 2028029 Rheocricotopus sp. DH90 GGCTTGATCCGGAATAGTAGGAACTTCTTTAAGAATTCTAATTCGA... 1203 2 100 species 1592914 Caenis pusilla GGCTTGATCCGGAATGCTGGGCACCTCTCTAAGCCTTCTAATTCGT... 1443 3 100 species 2028029 Rheocricotopus sp. DH90 TGCTTGATCAGGAATAGTAGGAACTTCTTTAAGAATTCTAATTCGA... 2298 4 90 family 7149 Chironomidae TGCTTGATCAGGGATAGTGGGAACTTCTTTAAGAATTCTTATTCGA... 2498 5 100 species 189839 Baetis rhodani TGCTTGGGCAGGTATGGTAGGTACCTCATTAAGACTTTTAATTCGA... 2610""" ltg_db_df = pandas.DataFrame.from_records(ltg_from_db_list) ltg_db_df = ltg_db_df.reindex(sorted(ltg_db_df.columns), axis=1) # sort columns ####################################################################### # # Get list of variants (id and sequence) that need blast for tax assignation # ####################################################################### stmt_variant = sqlalchemy.select([variant_declarative_table.c.id, variant_declarative_table.c.sequence]) \ .where(variant_declarative_table.c.sequence.in_(variant_sequence_list)) \ if ltg_db_df.shape[0] > 0: stmt_variant = stmt_variant.where( variant_declarative_table.c.id.notin_( ltg_db_df.variant_id.tolist())) stmt_variant = stmt_variant.distinct().order_by("id") variant_not_tax_assigned = [] with engine.connect() as conn: for row in conn.execute(stmt_variant).fetchall(): variant_not_tax_assigned.append( dict(zip(row.keys(), row.values()))) ####################################################################### # # Run RunnerTaxAssign for variant_not_tax_assigned # ####################################################################### blast_variant_df = pandas.DataFrame() ltg_blast_df = pandas.DataFrame() if len(variant_not_tax_assigned ) > 0: # Run blast for variants that need tax assignation blast_variant_df = pandas.DataFrame.from_records( variant_not_tax_assigned, index='id') taxonomy = Taxonomy(tsv=taxonomy_tsv) sequence_list = blast_variant_df.sequence.tolist() tax_assign_runner = RunnerTaxAssign(sequence_list=sequence_list, taxonomy=taxonomy, blast_db_dir=blastdb_dir_path, blast_db_name=blastdbname_str, num_threads=num_threads, params=None) ltg_blast_df = tax_assign_runner.ltg_df ###################################################### # Uncomment to debug because blast is slow # pandas.to_pickle(ltg_df, "ltg_df.pkl") # ltg_df = pandas.read_pickle("ltg_df.pkl") # import pdb; pdb.set_trace() ###################################################### ltg_blast_df.rename({'variant_id': 'sequence'}, inplace=True, axis=1) ltg_blast_df = blast_variant_df.merge(ltg_blast_df, on='sequence', how='outer') ltg_blast_df['blast_db'] = blastdbname_str ltg_blast_df = ltg_blast_df.reindex(sorted(ltg_blast_df.columns), axis=1) # sort columns del (blast_variant_df) ####################################################################### # # Concatenate tax-assigned variants from DB and from Blast # Merge variant_df and ltg_df and write to DB # ####################################################################### if ltg_db_df.shape[0] > 0 and ltg_blast_df.shape[0] > 0: ltg_df = pandas.concat([ ltg_db_df[[ "blast_db", "identity", "ltg_rank", "ltg_tax_id", "ltg_tax_name", "sequence" ]], ltg_blast_df ], axis=0) elif ltg_db_df.shape[0] > 0: ltg_df = ltg_db_df.copy() elif ltg_blast_df.shape[0] > 0: ltg_df = ltg_blast_df.copy() del (ltg_blast_df) ####################################################################### # # Insert or update variant and taxassign tables # 
####################################################################### Logger.instance().debug( "file: {}; line: {}; Insert variant_id, ltg_tax_id, ltg_rank to DB" .format(__file__, inspect.currentframe().f_lineno)) for ltg_row in ltg_df.itertuples(): variant_sequence = ltg_row.sequence with engine.connect() as conn: variant_id = conn.execute( sqlalchemy.select([ variant_declarative_table.c.id ]).where(variant_declarative_table.c.sequence == variant_sequence)).first()[0] select_row = conn.execute( sqlalchemy.select([ TaxAssign ]).where(tax_assign_declarative_table.c.variant_id == variant_id)).first() # import pdb; pdb.set_trace() if select_row is None: # variant_id IS NOT in the database, so INSERT it ltg_row_dic = ltg_row._asdict() ltg_row_dic['variant_id'] = variant_id conn.execute(tax_assign_declarative_table.insert(), dict(ltg_row_dic)) else: # variant_sequence IS in the database, so update row tax_assign_declarative_table.update().where( tax_assign_declarative_table.c.variant_id == variant_id).values() ####################################################################### # # Update LTGs for variant output file # ####################################################################### Logger.instance().debug( "file: {}; line: {}; Update LTGs for variant output file".format( __file__, inspect.currentframe().f_lineno)) variant_output_df = variant_input_df.copy() del (variant_input_df) # Add ltg columns to variant_df if it do not exist for ltg_df_col in [ 'ltg_tax_id', 'ltg_tax_name', 'ltg_rank', 'identity', 'blast_db' ]: if not (ltg_df_col in variant_output_df.columns): variant_output_df[ltg_df_col] = None # Move sequence column to end variant_df_columns = variant_output_df.columns.tolist() variant_df_columns.append( variant_df_columns.pop(variant_df_columns.index('sequence'))) variant_output_df = variant_output_df[variant_df_columns] for variant_row in variant_output_df.itertuples(): # variant_id = variant_row.variant_id variant_sequence = variant_row.sequence with engine.connect() as conn: variant_id = conn.execute( sqlalchemy.select([ variant_declarative_table.c.id ]).where(variant_declarative_table.c.sequence == variant_sequence)).first()[0] select_row = conn.execute( sqlalchemy.select([ TaxAssign.ltg_tax_id, TaxAssign.ltg_tax_name, TaxAssign.ltg_rank, TaxAssign.identity, TaxAssign.blast_db, ]).where(tax_assign_declarative_table.c.variant_id == variant_id)).first() tax_assign_dict = dict( zip([ 'ltg_tax_id', 'ltg_tax_name', 'ltg_rank', 'identity', 'blast_db' ], select_row)) for k in tax_assign_dict: variant_output_df.loc[variant_output_df.sequence == variant_sequence, k] = tax_assign_dict[k] # do not move. required because sometimes tax_id is none variant_output_df = variant_output_df.astype({'ltg_tax_id': 'object'}) ####################################################################### # # Update tax lineages for variant output file # ####################################################################### Logger.instance().debug( "file: {}; line: {}; Update tax lineages for variant output file". 
    #######################################################################
    #
    # Update tax lineages for variant output file
    #
    #######################################################################

    Logger.instance().debug(
        "file: {}; line: {}; Update tax lineages for variant output file".format(
            __file__, inspect.currentframe().f_lineno))

    tax_id_list = variant_output_df.ltg_tax_id.unique().tolist()  # unique list of tax ids
    tax_lineage = TaxLineage(taxonomic_tsv_path=taxonomy_tsv)
    tax_lineage_df = tax_lineage.create_lineage_from_tax_id_list(
        tax_id_list=tax_id_list, tax_name=True)

    # Merge lineages into the output, then drop the redundant join key
    variant_output_df = variant_output_df.merge(
        tax_lineage_df, left_on='ltg_tax_id', right_on='tax_id', how='left')
    variant_output_df.drop('tax_id', axis=1, inplace=True)

    Logger.instance().debug("file: {}; line: {}; Reorder columns".format(
        __file__, inspect.currentframe().f_lineno))

    # Move sequence column to end (a toy illustration of this idiom follows after this function)
    variant_df_columns = variant_output_df.columns.tolist()
    variant_df_columns.append(
        variant_df_columns.pop(variant_df_columns.index('sequence')))
    variant_output_df = variant_output_df[variant_df_columns]

    Logger.instance().debug("file: {}; line: {}; Write to TSV".format(
        __file__, inspect.currentframe().f_lineno))
    variant_output_df.to_csv(output, sep='\t', index=False, header=True)

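# Illustration only (kept commented out so it never runs): the "move the
# 'sequence' column to the end" idiom used twice above, shown on a toy
# DataFrame with made-up column values:
#
# import pandas
# df = pandas.DataFrame({'sequence': ['ACGT'], 'ltg_tax_id': [7149], 'identity': [100]})
# cols = df.columns.tolist()
# cols.append(cols.pop(cols.index('sequence')))  # pop 'sequence', re-append it last
# df = df[cols]  # columns are now: ltg_tax_id, identity, sequence
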
def main(fastainfo, fastadir, sorteddir, params=None,
         num_threads=multiprocessing.cpu_count(),
         no_reverse=False, tag_to_end=False, primer_to_end=False):

    Logger.instance().info(
        f"OPTIONS:\n no_reverse: {not no_reverse}\n"
        f" tag_to_end: {not tag_to_end}\n primer_to_end: {not primer_to_end}")

    if sys.platform.startswith('win'):
        num_threads = 1

    ############################################################################################
    #
    # params.yml parameters
    #
    ############################################################################################

    params_dic = FileParams(params).get_params_dic()

    cutadapt_error_rate = params_dic['cutadapt_error_rate']
    cutadapt_minimum_length = params_dic['cutadapt_minimum_length']
    cutadapt_maximum_length = params_dic['cutadapt_maximum_length']

    ############################################################################################
    #
    # Loop over tag and primer pairs to demultiplex and trim reads
    #
    ############################################################################################

    merged_fastainfo_df = FileSampleInformation(fastainfo).read_tsv_into_df()

    pathlib.Path(sorteddir).mkdir(parents=True, exist_ok=True)
    tempdir = PathManager.instance().get_tempdir()

    merged_fasta_list = []
    results_list = []
    sample_info = {}

    # Make sure every merged FASTA file is analysed only once
    for i in range(merged_fastainfo_df.shape[0]):
        if merged_fastainfo_df.iloc[i].mergedfasta not in merged_fasta_list:
            merged_fasta_list.append(merged_fastainfo_df.iloc[i].mergedfasta)

    for mergedfasta in merged_fasta_list:

        inputFiles = FilesInputCutadapt(fastainfo, mergedfasta, no_reverse, tag_to_end)

        tagFile_path = inputFiles.tags_file()
        info = inputFiles.get_df_info()

        for key in info.keys():
            if key in sample_info.keys():
                sample_info[key] = sample_info[key] + info[key]
            else:
                sample_info[key] = info[key]

        Logger.instance().debug("Analysing FASTA file: {}".format(mergedfasta))

        in_raw_fasta_path = os.path.join(fastadir, mergedfasta)

        ########################################################################################
        #
        # cutadapt --cores=0 -e 0 --no-indels --trimmed-only -g tagFile:$tagfile
        # --overlap length -o "tagtrimmed.{name}.fasta" in_raw_fasta_path
        #
        ########################################################################################

        base = os.path.basename(in_raw_fasta_path)
        base, base_suffix = base.split('.', 1)

        out_fasta_path = os.path.join(tempdir, "sorted")

        cmd_cutadapt_tag_dic = {
            'in_fasta_path': in_raw_fasta_path,
            'out_fasta': out_fasta_path,
            'num_threads': num_threads,
            'tagFile': tagFile_path,
            'base_suffix': base_suffix,
        }

        cmd_cutadapt_tag_str = \
            'cutadapt --cores={num_threads} --no-indels --error-rate 0 --trimmed-only ' \
            '-g file:{tagFile} --output {out_fasta}_{{name}}.{base_suffix} {in_fasta_path}' \
            .format(**cmd_cutadapt_tag_dic)

        Logger.instance().debug("Running: {}".format(cmd_cutadapt_tag_str))

        if sys.platform.startswith("win"):
            args = cmd_cutadapt_tag_str
        else:
            args = shlex.split(cmd_cutadapt_tag_str)
        run_result = subprocess.run(args=args, stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT)

        Logger.instance().info(run_result.stdout.decode())

        inputFiles.remove_tags_file()

        ########################################################################################
        #
        # Trim primers from output
        # cutadapt --quiet --cores=0 -e trim_error --no-indels --trimmed-only
        # --minimum-length minimum_length --maximum-length maximum_length
        # --output input_path + {name} + suffix outputfile
        #
        ########################################################################################

        primers = inputFiles.primers()
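        ########################################################################################
        # Illustration only (kept commented out so it never runs): the tag
        # demultiplexing above uses cutadapt's multi-adapter mode, where
        # "-g file:tags.fasta" reads named adapters from a FASTA file and
        # "{name}" in --output is expanded once per adapter name.
        # File names here are hypothetical:
        #
        # cmd = ('cutadapt --cores=1 --no-indels --error-rate 0 --trimmed-only '
        #        '-g file:tags.fasta --output sorted_{name}.fasta merged.fasta')
        # subprocess.run(shlex.split(cmd), stdout=subprocess.PIPE,
        #                stderr=subprocess.STDOUT)
        ########################################################################################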
        try:
            tags_samples = inputFiles.get_sample_names()
        except Exception as e:
            Logger.instance().error(e)
            return

        for primer in primers:
            marker, primerfwd, primerrev, lenprimerfwd, lenprimerrev = primer

            for tag_sample in tags_samples:
                name, run, marker2, sample, replicate, _, _ = tag_sample

                if marker not in marker2:
                    continue

                in_fasta_path = out_fasta_path + "_" + name + "." + base_suffix
                baseMerge = mergedfasta.split(".")[0]

                outname = run + "_" + marker + "_" + sample + "_" + replicate + "_" \
                    + baseMerge + "_trimmed"
                if name.endswith("_reversed"):
                    outname = outname + "_reversed"
                out_fasta_path_new = os.path.join(tempdir, outname + "." + base_suffix)
                results_list.append(out_fasta_path_new)

                if "_reversed" not in name:
                    if generic_dna:  # Biopython <1.78
                        primerRev = str(Seq(primerrev, generic_dna).reverse_complement())
                    else:  # Biopython >=1.78
                        primerRev = str(Seq(primerrev).reverse_complement())
                    primerFwd = primerfwd
                    lenPrimerFwd = lenprimerfwd
                    lenPrimerRev = lenprimerrev
                else:  # reversed reads: swap the primers and reverse-complement the other one
                    if generic_dna:  # Biopython <1.78
                        primerRev = str(Seq(primerfwd, generic_dna).reverse_complement())
                    else:  # Biopython >=1.78
                        primerRev = str(Seq(primerfwd).reverse_complement())
                    primerFwd = primerrev
                    lenPrimerFwd = lenprimerrev
                    lenPrimerRev = lenprimerfwd

                cmd_cutadapt_primer_dic = {
                    'in_fasta_path': in_fasta_path,
                    'out_fasta': out_fasta_path_new,
                    'error_rate': cutadapt_error_rate,
                    'num_threads': num_threads,
                    'primerFwd': primerFwd,
                    'primerRev': primerRev,
                    'lenPrimerFwd': lenPrimerFwd,
                    'lenPrimerRev': lenPrimerRev,
                    'read_min_length': cutadapt_minimum_length,
                    'read_max_length': cutadapt_maximum_length,
                }

                if not primer_to_end:  # anchored: primers must sit at the read ends
                    cmd_cutadapt_primer_str = \
                        'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                        '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                        '--trimmed-only -g "^{primerFwd}...{primerRev}$" ' \
                        '--output {out_fasta} {in_fasta_path}' \
                        .format(**cmd_cutadapt_primer_dic)
                else:  # non-anchored: primers may be internal, but require a minimum overlap
                    cmd_cutadapt_primer_str = \
                        'cutadapt --cores={num_threads} --no-indels --error-rate {error_rate} ' \
                        '--minimum-length {read_min_length} --maximum-length {read_max_length} ' \
                        '--trimmed-only -g "{primerFwd};min_overlap={lenPrimerFwd}...' \
                        '{primerRev};min_overlap={lenPrimerRev}" ' \
                        '--output {out_fasta} {in_fasta_path}' \
                        .format(**cmd_cutadapt_primer_dic)

                Logger.instance().debug("Running: {}".format(cmd_cutadapt_primer_str))

                if sys.platform.startswith("win"):
                    args = cmd_cutadapt_primer_str
                else:
                    args = shlex.split(cmd_cutadapt_primer_str)
                run_result = subprocess.run(args=args, stdout=subprocess.PIPE,
                                            stderr=subprocess.STDOUT)

                Logger.instance().info(run_result.stdout.decode())
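    ###################################################################
    # Illustration only (kept commented out so it never runs): the
    # version-dependent reverse complement used above for primers and
    # below for reads. 'generic_dna' only exists in Biopython <1.78;
    # newer releases dropped sequence alphabets entirely.
    #
    # try:
    #     from Bio.Alphabet import generic_dna  # Biopython <1.78
    # except ImportError:
    #     generic_dna = None  # Biopython >=1.78
    # from Bio.Seq import Seq
    # if generic_dna:
    #     rc = str(Seq("TGGTTCAT", generic_dna).reverse_complement())
    # else:
    #     rc = str(Seq("TGGTTCAT").reverse_complement())
    # assert rc == "ATGAACCA"
    ###################################################################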
    ###################################################################
    #
    # Reverse complement back rc fasta and pool
    #
    ###################################################################

    for file in results_list:
        if "_trimmed" in file:
            out_final_fasta_path = os.path.join(sorteddir, os.path.split(file)[-1])
            in_fasta_path = os.path.join(tempdir, file)

            # Pick an opener matching the compression suffix of each path
            if out_final_fasta_path.endswith(".gz"):
                _open = partial(gzip.open)
            elif out_final_fasta_path.endswith(".bz2"):
                _open = partial(bz2.open)
            else:
                _open = open

            if in_fasta_path.endswith(".gz"):
                _open2 = partial(gzip.open)
            elif in_fasta_path.endswith(".bz2"):
                _open2 = partial(bz2.open)
            else:
                _open2 = open

            if "_reversed" in file:
                Logger.instance().debug("Pooling fwd and rc reads...")
                out_final_fasta_path = out_final_fasta_path.replace("_reversed", "")
                with _open(out_final_fasta_path, 'at') as fout:
                    with _open2(in_fasta_path, 'rt') as fin:
                        for line in fin:
                            if not line.startswith('>'):
                                if generic_dna:  # Biopython <1.78
                                    fout.write("%s\n" % str(
                                        Seq(line.strip(), generic_dna).reverse_complement()))
                                else:  # Biopython >=1.78
                                    fout.write("%s\n" % str(
                                        Seq(line.strip()).reverse_complement()))
                            else:
                                fout.write(line)
            else:
                with _open(out_final_fasta_path, 'at') as fout:
                    with _open2(in_fasta_path, 'rt') as fin:
                        for line in fin:
                            fout.write(line)

    results_list = [os.path.split(result)[-1]
                    for result in results_list if "_reversed" not in result]

    del sample_info['mergedfasta']
    del sample_info['primerrev']
    del sample_info['primerfwd']
    del sample_info['tagrev']
    del sample_info['tagfwd']
    sample_info['sortedfasta'] = results_list

    sample_info_df = pandas.DataFrame(sample_info)

    fasta_trimmed_info_tsv = os.path.join(sorteddir, 'sortedinfo.tsv')
    sample_info_df.to_csv(fasta_trimmed_info_tsv, sep="\t", header=True, index=False)
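
# Illustration only (kept commented out so it never runs): sample_info is a
# dict of equal-length lists, one key per column, which pandas.DataFrame
# turns directly into the sortedinfo.tsv table. Values here are made up:
#
# import pandas
# info = {'run': ['run1', 'run1'],
#         'marker': ['MFZR', 'ZFZR'],
#         'sortedfasta': ['run1_MFZR_s1_1_merged_trimmed.fasta',
#                         'run1_ZFZR_s1_1_merged_trimmed.fasta']}
# pandas.DataFrame(info).to_csv('sortedinfo.tsv', sep='\t', index=False)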