def run(self):
    session = self.session
    engine = session._session().get_bind()

    this_temp_dir = os.path.join(
        PathManager.instance().get_tempdir(), os.path.basename(__file__))
    pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

    ############################################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    ############################################################################################

    # Input file paths
    known_occurrences_tsv = self.input_file(
        OptimizePCRerror.__input_file_known_occurrences)
    fasta_info_tsv = self.input_file(
        OptimizePCRerror.__input_file_sortedinfo)
    #
    # Output file paths
    output_optimize_path = self.output_file(
        OptimizePCRerror.__output_file_optimize_pcr_error)

    ############################################################################################
    #
    # Get nijk_df, known_occurrences_df
    #
    ############################################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        VariantReadCount, engine=engine)
    known_occurrences_df = FileKnownOccurrences(
        known_occurrences_tsv).to_identifier_df(engine)

    ############################################################################################
    #
    # Run optimizer and write
    #
    ############################################################################################

    optimize_pcr_error_runner = RunnerOptimizePCRerror(
        variant_read_count_df=variant_read_count_df,
        known_occurrences_df=known_occurrences_df)
    optimize_pcr_error_runner.to_tsv(
        optimize_path=output_optimize_path, engine=engine)
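
# Illustrative sketch of the quantity RunnerOptimizePCRerror explores: for known
# mock occurrences, the read-count ratio N_unexpected / N_expected between an
# expected variant and a putative PCR-error neighbour. The column names and data
# are hypothetical; the real pairing (single-mismatch alignment) happens inside
# the runner.
def _sketch_pcr_error_ratio():
    import pandas

    pairs = pandas.DataFrame({
        'N_expected': [1000, 800, 1200],  # reads of the expected mock variant
        'N_unexpected': [3, 8, 2],        # reads of a 1-mismatch neighbour
    })
    pairs['ratio'] = pairs.N_unexpected / pairs.N_expected
    # The maximal observed ratio bounds a usable pcr_error_var_prop value
    return pairs.ratio.max()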
def run(self):
    session = self.session
    engine = session._session().get_bind()

    ############################################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    ############################################################################################

    # Input file paths
    known_occurrences_tsv = self.input_file(
        OptimizeLFNsampleReplicate.__input_file_known_occurrences)
    fasta_info_tsv = self.input_file(
        OptimizeLFNsampleReplicate.__input_file_sortedinfo)

    # Output file path
    output_optimize_path = self.output_file(
        OptimizeLFNsampleReplicate.
        __output_file_optimize_lfn_sample_replicate)

    ############################################################################################
    #
    # Get nijk_df and known_occurrences_df (keep)
    #
    ############################################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        VariantReadCount, engine=engine)
    known_occurrences_df = FileKnownOccurrences(
        known_occurrences_tsv).to_identifier_df(engine)
    known_occurrences_df = known_occurrences_df.loc[
        (known_occurrences_df.mock == 1)
        & (known_occurrences_df.action == 'keep'), ]

    ############################################################################################
    #
    # Run optimizer and write
    #
    ############################################################################################

    optimize_lfn_sample_replicate_runner = RunnerOptimizeLFNsampleReplicate(
        variant_read_count_df=variant_read_count_df,
        known_occurrences_df=known_occurrences_df)
    optimize_lfn_sample_replicate_runner.to_tsv(
        optimize_path=output_optimize_path, engine=engine)
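
# Illustrative sketch, not the RunnerOptimizeLFNsampleReplicate implementation:
# the lfn_sample_replicate cutoff acts on the ratio N_ijk / N_jk, i.e. the reads
# of variant i in sample-replicate jk over the total reads of that
# sample-replicate. The optimizer looks for the largest cutoff that still keeps
# every 'keep' occurrence of the mock samples. Column names follow the nijk_df
# convention; the helper itself is hypothetical.
def _sketch_lfn_sample_replicate_ratio(nijk_df):
    # N_jk: total read count per run/marker/sample/replicate
    N_jk = nijk_df.groupby(['run_id', 'marker_id', 'sample_id', 'replicate'])[
        'read_count'].transform('sum')
    ratio = nijk_df.read_count / N_jk
    # Any cutoff <= ratio.min() keeps all occurrences in nijk_df
    return ratio.min()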
def run(self):
    session = self.session
    engine = session._session().get_bind()

    #######################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    #######################################################################

    # Input file paths
    fasta_info_tsv = self.input_file(FilterChimera.__input_file_sortedinfo)
    #
    # Input table models
    # Variant = self.input_table(FilterChimera.__input_table_Variant)
    input_filter_pcr_error_model = self.input_table(
        FilterChimera.__input_table_filter_pcr_error)
    #
    # Output table models
    output_filter_chimera_model = self.output_table(
        FilterChimera.__output_table_filter_chimera)
    output_filter_borderline_model = self.output_table(
        FilterChimera.__output_table_filter_chimera_borderline)
    #
    # Params
    uchime3_denovo_abskew = self.option("uchime3_denovo_abskew")

    #######################################################################
    #
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
    # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
    # 3. Get nijk_df input
    #
    #######################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_chimera_model)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_borderline_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_filter_pcr_error_model,
        engine=engine, filter_id=None)

    #######################################################################
    #
    # 4. Run Filter
    #
    #######################################################################

    variant_df = sample_info_tsv_obj.get_variant_df(
        variant_read_count_like_model=input_filter_pcr_error_model,
        engine=engine)
    filter_chimera_runner = RunnerFilterChimera(
        variant_read_count_df=variant_read_count_df)
    filter_output_chimera_df, filter_borderline_output_df = \
        filter_chimera_runner.get_variant_read_count_delete_df(
            variant_df=variant_df,
            uchime3_denovo_abskew=uchime3_denovo_abskew)

    #######################################################################
    #
    # 5. Write to DB
    # 6. Touch output tables, to update modification date
    # 7. Exit vtam if all variants are deleted
    #
    #######################################################################

    DataframeVariantReadCountLike(filter_output_chimera_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_chimera_model)
    DataframeVariantReadCountLike(filter_borderline_output_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_borderline_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if filter_output_chimera_df.filter_delete.sum(
    ) == filter_output_chimera_df.shape[0]:
        Logger.instance().warning(
            VTAMexception("This filter has deleted all the variants: {}. "
                          "The analysis will stop here.".format(
                              self.__class__.__name__)))
        sys.exit(0)
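
# RunnerFilterChimera delegates chimera detection to vsearch. A minimal sketch of
# the kind of command involved, assuming a dereplicated FASTA with ;size=
# abundance annotations; the exact flags used by the runner may differ.
def _sketch_uchime3_denovo(fasta_path, abskew, out_dir):
    import os
    import subprocess

    cmd = [
        'vsearch', '--uchime3_denovo', fasta_path,
        '--abskew', str(abskew),  # min abundance ratio of parent vs chimera
        '--chimeras', os.path.join(out_dir, 'chimeras.fasta'),
        '--borderline', os.path.join(out_dir, 'borderline.fasta'),
        '--nonchimeras', os.path.join(out_dir, 'nonchimeras.fasta'),
    ]
    subprocess.run(cmd, check=True)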
def run(self):
    session = self.session
    engine = session._session().get_bind()

    #
    # Input file paths
    fasta_info_tsv = self.input_file(
        ReadCountAverageOverReplicates.__input_file_sortedinfo)
    #
    # Input table models
    codon_stop_model = self.input_table(
        ReadCountAverageOverReplicates.__input_table_filter_codon_stop)
    #
    # Output table models
    consensus_model = self.output_table(
        ReadCountAverageOverReplicates.__output_table_filter_consensus)

    #######################################################################
    #
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
    # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
    # 3. Get nijk_df input
    #
    #######################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine, variant_read_count_like_model=consensus_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=codon_stop_model, engine=engine,
        filter_id=None)

    #######################################################################
    #
    # 4. Run Filter
    #
    #######################################################################

    variant_read_count_delete_df = read_count_average_over_replicates(
        variant_read_count_df)

    #######################################################################
    #
    # Write to DB
    #
    #######################################################################

    record_list = ModelVariantReadCountLike.filter_delete_df_to_dict(
        variant_read_count_delete_df)
    with engine.connect() as conn:
        # Insert new instances
        conn.execute(consensus_model.__table__.insert(), record_list)

    #######################################################################
    #
    # Touch output tables, to update modification date
    #
    #######################################################################

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()
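
# A minimal pandas sketch of what read_count_average_over_replicates computes,
# assuming the standard nijk_df columns: the read count of each variant in a
# sample is averaged over that sample's replicates. Illustrative only; the real
# function also carries the filter_delete bookkeeping.
def _sketch_average_over_replicates(nijk_df):
    return (nijk_df
            .groupby(['run_id', 'marker_id', 'sample_id', 'variant_id'],
                     as_index=False)['read_count']
            .mean()
            .rename(columns={'read_count': 'read_count_average'}))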
def run(self):
    session = self.session
    engine = session._session().get_bind()

    #######################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    #######################################################################

    #
    # Input files
    fasta_info_tsv = self.input_file(
        FilterMinReplicateNumber.__input_file_sortedinfo)
    #
    # Input tables
    input_filter_lfn_model = self.input_table(
        FilterMinReplicateNumber.__input_table_variant_filter_lfn)
    #
    # Options
    min_replicate_number = self.option("min_replicate_number")
    # input_filter_lfn = self.option("input_filter_lfn")
    #
    # Output tables
    output_filter_min_replicate_model = self.output_table(
        FilterMinReplicateNumber.__output_table_filter_min_replicate_number)

    #######################################################################
    #
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
    # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
    # 3. Get nijk_df input
    #
    #######################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_min_replicate_model)

    filter_id = None
    if input_filter_lfn_model.__tablename__ == "FilterLFN":
        filter_id = 8  # Variant passed all LFN filters

    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_filter_lfn_model,
        engine=engine, filter_id=filter_id)

    #######################################################################
    #
    # 4. Run Filter
    #
    #######################################################################

    variant_read_count_delete_df = RunnerFilterMinReplicateNumber(
        variant_read_count_df).get_variant_read_count_delete_df(
            min_replicate_number)

    #######################################################################
    #
    # 5. Write to DB
    # 6. Touch output tables, to update modification date
    # 7. Exit vtam if all variants are deleted
    #
    #######################################################################

    DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_min_replicate_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if variant_read_count_delete_df.filter_delete.sum(
    ) == variant_read_count_delete_df.shape[0]:
        Logger.instance().warning(
            VTAMexception("This filter has deleted all the variants: {}. "
                          "The analysis will stop here.".format(
                              self.__class__.__name__)))
        sys.exit(0)
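
# A minimal sketch of the min-replicate rule, assuming the standard nijk_df
# columns: an occurrence of variant i in sample j is kept only if it is observed
# in at least min_replicate_number replicates of that sample. Illustrative, not
# RunnerFilterMinReplicateNumber itself.
def _sketch_min_replicate_filter(nijk_df, min_replicate_number):
    nijk_df = nijk_df.copy()
    # Number of replicates in which each run/marker/sample/variant occurs
    replicate_count = nijk_df.groupby(
        ['run_id', 'marker_id', 'sample_id', 'variant_id'])[
        'replicate'].transform('nunique')
    nijk_df['filter_delete'] = replicate_count < min_replicate_number
    return nijk_df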
def run(self):
    session = self.session
    engine = session._session().get_bind()

    ############################################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    ############################################################################################

    # Input file paths
    fasta_info_tsv = self.input_file(
        FilterRenkonen.__input_file_sortedinfo)
    #
    # Input table models
    input_filter_chimera_model = self.input_table(
        FilterRenkonen.__input_table_chimera)
    #
    # Options
    renkonen_distance_quantile = float(
        self.option("renkonen_distance_quantile"))
    #
    # Output table models
    output_filter_renkonen_model = self.output_table(
        FilterRenkonen.__output_table_filter_renkonen)

    ############################################################################################
    #
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
    # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
    # 3. Get nijk_df input
    #
    ############################################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_renkonen_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_filter_chimera_model,
        engine=engine, filter_id=None)

    ############################################################################################
    #
    # Run per run_id, marker_id
    #
    ############################################################################################

    variant_read_count_delete_df = pandas.DataFrame()
    run_marker_df = variant_read_count_df[
        ['run_id', 'marker_id']].drop_duplicates()

    for row in run_marker_df.itertuples():
        run_id = row.run_id
        marker_id = row.marker_id

        variant_read_count_per_run_marker_df = variant_read_count_df.loc[
            (variant_read_count_df.run_id == run_id)
            & (variant_read_count_df.marker_id == marker_id)]

        if variant_read_count_per_run_marker_df.replicate.unique(
        ).shape[0] > 1:  # more than one replicate
            filter_renkonen_runner_obj = RunnerFilterRenkonen(
                variant_read_count_per_run_marker_df)
            filter_output_i_df = \
                filter_renkonen_runner_obj.get_variant_read_count_delete_df(
                    renkonen_distance_quantile)
        else:  # just one replicate: nothing to compare, keep everything
            filter_output_i_df = variant_read_count_per_run_marker_df.copy()
            filter_output_i_df['filter_delete'] = False

        variant_read_count_delete_df = pandas.concat(
            [variant_read_count_delete_df, filter_output_i_df], axis=0)

    ############################################################################################
    #
    # 5. Write to DB
    # 6. Touch output tables, to update modification date
    # 7. Exit vtam if all variants are deleted
    #
    ############################################################################################

    DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_renkonen_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if variant_read_count_delete_df.filter_delete.sum(
    ) == variant_read_count_delete_df.shape[0]:
        Logger.instance().warning(
            VTAMexception("This filter has deleted all the variants: {}. "
                          "The analysis will stop here.".format(
                              self.__class__.__name__)))
        sys.exit(0)
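
# The Renkonen distance between two replicates of the same sample is
# 1 - sum_i min(p_i, q_i), where p_i and q_i are the read proportions of
# variant i within each replicate. A minimal sketch on two read-count Series
# indexed by variant_id; illustrative, since RunnerFilterRenkonen applies the
# quantile cutoff to the distances of all replicate pairs.
def _sketch_renkonen_distance(read_counts_repl1, read_counts_repl2):
    p = read_counts_repl1 / read_counts_repl1.sum()
    q = read_counts_repl2 / read_counts_repl2.sum()
    # Align on the union of variants; missing variants count as proportion 0
    p, q = p.align(q, fill_value=0)
    return 1.0 - p.combine(q, min).sum()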
def run(self):
    session = self.session
    engine = session._session().get_bind()

    #######################################################################
    #
    # 1. Wrapper inputs, outputs and parameters
    #
    #######################################################################

    # Input file
    fasta_info_tsv = self.input_file(MakeAsvTable.__input_file_sortedinfo)
    # Output file
    asvtable_tsv_path = self.output_file(MakeAsvTable.__output_table_asv)
    #
    # Options
    cluster_identity = float(self.option("cluster_identity"))
    known_occurrences_tsv = str(self.option("known_occurrences"))

    #######################################################################
    #
    # Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
    # Compute variant_read_count_input_df and other dfs for the asv_table_runner
    #
    #######################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        FilterCodonStop, engine=engine)

    #######################################################################
    #
    # FileKnownOccurrences
    #
    #######################################################################

    if known_occurrences_tsv == 'None' or known_occurrences_tsv is None:
        known_occurrences_df = None
    else:
        known_occurrences_df = FileKnownOccurrences(
            known_occurrences_tsv).to_identifier_df(engine)
        known_occurrences_df = known_occurrences_df.loc[
            (known_occurrences_df.mock == 1)
            & (known_occurrences_df.action == 'keep'), ]

    #######################################################################
    #
    # Compute variant_to_chimera_borderline_df
    #
    #######################################################################

    sample_list = sample_info_tsv_obj.read_tsv_into_df()[
        'sample'].drop_duplicates(keep='first').tolist()

    asvtable_runner = RunnerAsvTable(
        variant_read_count_df=variant_read_count_df,
        engine=engine,
        sample_list=sample_list,
        cluster_identity=cluster_identity,
        known_occurrences_df=known_occurrences_df)
    asvtable_runner.to_tsv(asvtable_tsv_path)
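
# The core of an ASV table is a variant x sample matrix of read counts. A
# minimal pandas sketch, assuming the standard nijk_df columns and a
# hypothetical sample_id -> name mapping; RunnerAsvTable additionally adds
# sequences, clustering and known-occurrence annotations.
def _sketch_asv_table(nijk_df, sample_id_to_name):
    import pandas

    asv_df = pandas.pivot_table(
        nijk_df, values='read_count', index='variant_id',
        columns='sample_id', aggfunc='sum', fill_value=0)
    return asv_df.rename(columns=sample_id_to_name)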
def run(self):
    """
    Algorithm (updated Oct 13, 2019)

    1. Read the file with the known variants (mock/tolerate, delete and real)
    2. Check that the user variants and sequences are consistent with the database
    3. Get the variant_read_count of this run_name-marker_name-sample-replicate experiment
    4. Compute the maximal lfn_nijk_cutoff that keeps all 'keep' variants with the
       'run_lfn_read_count_and_lfn_variant' algorithm
    5. Compute the maximal lfn_variant_cutoff that keeps all 'keep' variants with the
       'run_lfn_read_count_and_lfn_variant' algorithm (see below)
    6. Loop over the cutoff values between the defaults and these maxima and run the
       'run_lfn_read_count_and_lfn_variant' algorithm
        6.1 Compute the number of 'keep' variants; it should always stay maximal
        6.2 Compute the number of 'delete' variants; it should decrease
    7. Compute the variant(-replicate)-specific cutoff for the 'delete' variants
        7.1 For each variant i (or variant-replicate i-k), get N_ijk_max and use it to
            compute the variant-specific cutoff

    Description of the 'run_lfn_read_count_and_lfn_variant' algorithm

    1. Remove occurrences that do not pass these filters:
        1.1 Filter lfn_variant (or lfn_variant_replicate)
        1.2 Filter lfn_sample_replicate
        1.3 Filter absolute read count
    2. Remove occurrences that do not pass the min replicate number filter
    """

    session = self.session
    engine = session._session().get_bind()

    ############################################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    ############################################################################################

    # Input file paths
    known_occurrences_tsv = self.input_file(
        OptimizeLFNreadCountAndLFNvariant.__input_file_known_occurrences)
    fasta_info_tsv = self.input_file(
        OptimizeLFNreadCountAndLFNvariant.__input_file_sortedinfo)

    # Output file paths
    output_file_optimize_lfn_tsv = self.output_file(
        OptimizeLFNreadCountAndLFNvariant.
        __output_file_optimize_lfn_read_count_and_lfn_variant)
    output_file_lfn_variant_specific_cutoff_tsv = self.output_file(
        OptimizeLFNreadCountAndLFNvariant.
        __output_file_optimize_lfn_variant_specific)

    # Options
    lfn_ni_cutoff = self.option("lfn_variant_cutoff")
    lfn_nik_cutoff = self.option("lfn_variant_replicate_cutoff")
    min_replicate_number = self.option("min_replicate_number")
    lfn_njk_cutoff = self.option("lfn_sample_replicate_cutoff")
    lfn_nijk_cutoff = int(self.option("lfn_read_count_cutoff"))

    filter_kwargs = {
        "lfn_ni_cutoff": lfn_ni_cutoff,
        "lfn_nik_cutoff": lfn_nik_cutoff,
        "lfn_njk_cutoff": lfn_njk_cutoff,
        "lfn_nijk_cutoff": lfn_nijk_cutoff,
        "min_replicate_number": min_replicate_number,
    }

    ############################################################################################
    #
    # Get nijk_df and known_occurrences_df (keep)
    #
    ############################################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    nijk_df = sample_info_tsv_obj.get_nijk_df(VariantReadCount,
                                              engine=engine)
    known_occurrences_df = FileKnownOccurrences(
        known_occurrences_tsv).to_identifier_df(engine)

    ############################################################################################
    #
    # Create cutoff values lists
    #
    ############################################################################################

    # lfn_nijk_cutoff_list = range(lfn_nijk_cutoff, lfn_nijk_cutoff_global_max + 1, round(int((lfn_nijk_cutoff_global_max - lfn_nijk_cutoff + 1) / 10), -1))
    # lfn_nijk_cutoff_list = RunnerOptimizeLFNreadCountAndVariantRunMarker.get_lfn_nijk_cutoff_lst(start=lfn_nijk_cutoff, stop=lfn_nijk_cutoff_global_max, nb_points=10)
    # if lfn_nik_cutoff is None:  # lfn_variant optimization
    #     lfn_ni_nik_cutoff_list = [round(x, 3) for x in numpy.arange(lfn_ni_cutoff, lfn_ni_njk_cutoff_global_max + 0.001, (lfn_ni_njk_cutoff_global_max - lfn_ni_cutoff + 0.001) / 10)]
    # else:  # lfn_variant_replicate optimization
    #     lfn_ni_nik_cutoff_list = [round(x, 3) for x in numpy.arange(lfn_ni_cutoff, lfn_ni_njk_cutoff_global_max + 0.001, (lfn_ni_njk_cutoff_global_max - lfn_ni_cutoff + 0.001) / 10)]

    ############################################################################################
    #
    # Group by run/marker combination and run the optimizer per run/marker
    #
    ############################################################################################

    optim_lfn_readcount_variant_runner = RunnerOptimizeLFNreadCountAndVariant(
        nijk_df=nijk_df, known_occurrences_df=known_occurrences_df)
    out_optimize_df, out_optimize2_df = \
        optim_lfn_readcount_variant_runner.get_optimize_df(
            lfn_ni_cutoff=lfn_ni_cutoff,
            lfn_nik_cutoff=lfn_nik_cutoff,
            lfn_njk_cutoff=lfn_njk_cutoff,
            lfn_nijk_cutoff=lfn_nijk_cutoff,
            min_replicate_number=min_replicate_number)

    ############################################################################################
    #
    # out_optimize_df: format and write
    #
    ############################################################################################

    out_optimize_df.marker_id = NameIdConverter(
        out_optimize_df.marker_id, engine=engine).to_names(Marker)
    out_optimize_df.run_id = NameIdConverter(
        out_optimize_df.run_id, engine=engine).to_names(Run)
    out_optimize_df.rename(
        {'run_id': 'run', 'marker_id': 'marker'}, axis=1, inplace=True)

    out_optimize_df.to_csv(
        output_file_optimize_lfn_tsv, header=True, sep='\t', index=False)

    ############################################################################################
    #
    # out_optimize2_df: format and write
    #
    ############################################################################################

    out_optimize2_df.marker_id = NameIdConverter(
        out_optimize2_df.marker_id, engine=engine).to_names(Marker)
    out_optimize2_df.run_id = NameIdConverter(
        out_optimize2_df.run_id, engine=engine).to_names(Run)
    out_optimize2_df['action'] = 'delete'
    out_optimize2_df['sequence'] = NameIdConverter(
        out_optimize2_df.variant_id,
        engine=engine).variant_id_to_sequence()
    out_optimize2_df.rename(
        {'run_id': 'run', 'marker_id': 'marker', 'variant_id': 'variant',
         'read_count': 'read_count_max'}, axis=1, inplace=True)

    if self.option("lfn_variant_replicate_cutoff") is None:
        out_optimize2_df = out_optimize2_df[[
            'run', 'marker', 'variant', 'action', 'read_count_max', 'N_i',
            'lfn_variant_cutoff', 'sequence']]
    else:
        out_optimize2_df = out_optimize2_df[[
            'run', 'marker', 'variant', 'replicate', 'action',
            'read_count_max', 'N_ik', 'lfn_variant_replicate_cutoff',
            'sequence']]

    out_optimize2_df.to_csv(
        output_file_lfn_variant_specific_cutoff_tsv, header=True, sep='\t',
        index=False)
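
# A minimal sketch of how the cutoff grids hinted at in the commented-out block
# above can be built: roughly ten read-count cutoffs between the default and a
# global maximum, and ten ratio cutoffs for lfn_variant (or
# lfn_variant_replicate). The endpoint handling here is an assumption for
# illustration, not the runner's exact grid.
def _sketch_cutoff_grids(lfn_nijk_cutoff, lfn_nijk_cutoff_global_max,
                         lfn_ni_cutoff, lfn_ni_cutoff_global_max):
    import numpy

    lfn_nijk_cutoff_list = [
        int(x) for x in numpy.linspace(
            lfn_nijk_cutoff, lfn_nijk_cutoff_global_max, num=10)]
    lfn_ni_cutoff_list = [
        round(float(x), 3) for x in numpy.linspace(
            lfn_ni_cutoff, lfn_ni_cutoff_global_max, num=10)]
    return lfn_nijk_cutoff_list, lfn_ni_cutoff_list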
def run(self):
    session = self.session
    engine = session._session().get_bind()

    #######################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    #######################################################################

    # Input file paths
    fasta_info_tsv = self.input_file(
        FilterCodonStop.__input_file_sortedinfo)
    #
    # Input table models
    input_filter_indel_model = self.input_table(
        FilterCodonStop.__input_table_filter_indel)
    #
    # Options
    genetic_code = int(self.option("genetic_code"))
    skip_filter_codon_stop = bool(
        int(self.option("skip_filter_codon_stop")))
    #
    # Output table models
    output_filter_codon_stop_model = self.output_table(
        FilterCodonStop.__output_table_filter_codon_stop)

    #######################################################################
    #
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
    # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
    # 3. Get nijk_df input
    #
    #######################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_codon_stop_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_filter_indel_model,
        engine=engine, filter_id=None)

    #######################################################################
    #
    # 4. Run Filter
    #
    #######################################################################

    variant_df = sample_info_tsv_obj.get_variant_df(
        variant_read_count_like_model=input_filter_indel_model,
        engine=engine)
    variant_read_count_delete_df = RunnerFilterCodonStop(
        variant_read_count_df=variant_read_count_df
    ).get_variant_read_count_delete_df(
        variant_df=variant_df,
        genetic_code=genetic_code,
        skip_filter_codon_stop=skip_filter_codon_stop)

    #######################################################################
    #
    # 5. Write to DB
    # 6. Touch output tables, to update modification date
    # 7. Exit vtam if all variants are deleted
    #
    #######################################################################

    DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_codon_stop_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if variant_read_count_delete_df.filter_delete.sum(
    ) == variant_read_count_delete_df.shape[0]:
        Logger.instance().warning(
            VTAMexception("This filter has deleted all the variants: {}. "
                          "The analysis will stop here.".format(
                              self.__class__.__name__)))
        sys.exit(0)
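
# A minimal sketch of a codon-stop criterion, assuming a variant is kept when at
# least one of the three forward reading frames is free of stop codons. The
# stop-codon set shown is that of the standard genetic code (table 1); the
# criterion and the mapping from the genetic_code option to a stop set are
# assumptions for illustration, not RunnerFilterCodonStop itself.
def _sketch_has_stop_free_frame(sequence, stop_codons=('TAA', 'TAG', 'TGA')):
    sequence = sequence.upper()
    for frame in range(3):
        codons = [sequence[i:i + 3]
                  for i in range(frame, len(sequence) - 2, 3)]
        if not any(codon in stop_codons for codon in codons):
            return True  # this frame contains no stop codon
    return False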
def run(self):
    session = self.session
    engine = session._session().get_bind()

    ############################################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    ############################################################################################

    # Input file paths
    fasta_info_tsv = self.input_file(FilterLFN.__input_file_sortedinfo)
    #
    # Input table models
    input_variant_read_count_model = self.input_table(
        FilterLFN.__input_table_variant_read_count)
    #
    # Output table models
    output_filter_lfn_model = self.output_table(
        FilterLFN.__output_table_filter_lfn)
    #
    # Options
    lfn_variant_cutoff = self.option("lfn_variant_cutoff")
    lfn_variant_specific_cutoff = self.option(
        "lfn_variant_specific_cutoff")
    lfn_variant_replicate_cutoff = self.option(
        "lfn_variant_replicate_cutoff")
    lfn_variant_replicate_specific_cutoff = self.option(
        "lfn_variant_replicate_specific_cutoff")
    lfn_sample_replicate_cutoff = self.option(
        "lfn_sample_replicate_cutoff")
    lfn_read_count_cutoff = self.option("lfn_read_count_cutoff")

    ############################################################################################
    #
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
    # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
    # 3. Get nijk_df input
    #
    ############################################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_lfn_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_variant_read_count_model,
        engine=engine, filter_id=None)

    lfn_variant_specific_cutoff_df = None
    if lfn_variant_cutoff is not None and pathlib.Path(
            lfn_variant_specific_cutoff).stat().st_size > 0:
        lfn_variant_specific_cutoff_df = FileCutoffSpecific(
            lfn_variant_specific_cutoff).to_identifier_df(
                engine=engine, is_lfn_variant_replicate=False)

    lfn_variant_replicate_specific_cutoff_df = None
    if lfn_variant_replicate_cutoff is not None and pathlib.Path(
            lfn_variant_replicate_specific_cutoff).stat().st_size > 0:
        lfn_variant_replicate_specific_cutoff_df = FileCutoffSpecific(
            lfn_variant_replicate_specific_cutoff).to_identifier_df(
                engine=engine, is_lfn_variant_replicate=True)

    ############################################################################################
    #
    # Create the filter object and run it
    #
    ############################################################################################

    variant_read_count_delete_df = RunnerFilterLFN(
        variant_read_count_df).get_variant_read_count_delete_df(
            lfn_variant_cutoff=lfn_variant_cutoff,
            lfn_variant_specific_cutoff=lfn_variant_specific_cutoff_df,
            lfn_variant_replicate_cutoff=lfn_variant_replicate_cutoff,
            lfn_variant_replicate_specific_cutoff=
            lfn_variant_replicate_specific_cutoff_df,
            lfn_sample_replicate_cutoff=lfn_sample_replicate_cutoff,
            lfn_read_count_cutoff=lfn_read_count_cutoff)

    DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_lfn_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if variant_read_count_delete_df.filter_delete.sum(
    ) == variant_read_count_delete_df.shape[0]:
        Logger.instance().warning(
            VTAMexception("This filter has deleted all the variants: {}. "
                          "The analysis will stop here.".format(
                              self.__class__.__name__)))
        sys.exit(0)
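
# A minimal sketch of one of the LFN (low frequency noise) rules applied above,
# the lfn_variant cutoff: occurrence (i, j, k) is deleted when
# N_ijk / N_i < cutoff, with N_i the total read count of variant i over the
# run/marker. Illustrative only; RunnerFilterLFN combines several such rules
# plus the variant-specific cutoff tables.
def _sketch_lfn_variant_rule(nijk_df, lfn_variant_cutoff):
    nijk_df = nijk_df.copy()
    N_i = nijk_df.groupby(['run_id', 'marker_id', 'variant_id'])[
        'read_count'].transform('sum')
    nijk_df['filter_delete'] = (
        nijk_df.read_count / N_i) < lfn_variant_cutoff
    return nijk_df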
def run(self):
    session = self.session
    engine = session._session().get_bind()

    this_temp_dir = os.path.join(
        PathManager.instance().get_tempdir(), os.path.basename(__file__))
    pathlib.Path(this_temp_dir).mkdir(exist_ok=True)

    ############################################################################################
    #
    # Wrapper inputs, outputs and parameters
    #
    ############################################################################################

    # Input file paths
    fasta_info_tsv = self.input_file(
        FilterPCRerror.__input_file_sortedinfo)
    #
    # Input table models
    input_filter_min_replicate_model = self.input_table(
        FilterPCRerror.__input_table_filter_min_replicate_number)
    #
    # Options
    pcr_error_var_prop = self.option("pcr_error_var_prop")
    #
    # Output table models
    output_filter_pcr_error_model = self.output_table(
        FilterPCRerror.__output_table_filter_pcr_error)

    ############################################################################################
    #
    # 1. Read sortedinfo to get run_id, marker_id, sample_id, replicate for current analysis
    # 2. Delete marker_name/run_name/sample/replicate from variant_read_count_model
    # 3. Get nijk_df input
    #
    ############################################################################################

    sample_info_tsv_obj = FileSampleInformation(tsv_path=fasta_info_tsv)
    sample_info_tsv_obj.delete_from_db(
        engine=engine,
        variant_read_count_like_model=output_filter_pcr_error_model)
    variant_read_count_df = sample_info_tsv_obj.get_nijk_df(
        variant_read_count_like_model=input_filter_min_replicate_model,
        engine=engine, filter_id=None)

    ############################################################################################
    #
    # Run per sample_id
    #
    ############################################################################################

    variant_df = sample_info_tsv_obj.get_variant_df(
        variant_read_count_like_model=input_filter_min_replicate_model,
        engine=engine)

    record_list = []
    run_marker_sample_df = variant_read_count_df[
        ['run_id', 'marker_id', 'sample_id']].drop_duplicates()

    for row in run_marker_sample_df.itertuples():
        run_id = row.run_id
        marker_id = row.marker_id
        sample_id = row.sample_id

        # Get variant read counts for the current run-marker-sample
        variant_read_count_per_sample_df = variant_read_count_df.loc[
            (variant_read_count_df.run_id == run_id)
            & (variant_read_count_df.marker_id == marker_id)
            & (variant_read_count_df.sample_id == sample_id)]
        variant_per_sample_df = variant_df.loc[variant_df.index.isin(
            variant_read_count_per_sample_df.variant_id.unique().tolist())]

        this_step_tmp_per_sample_dir = os.path.join(
            this_temp_dir,
            "run_{}_marker_{}_sample_{}".format(run_id, marker_id,
                                                sample_id))
        pathlib.Path(this_step_tmp_per_sample_dir).mkdir(exist_ok=True)

        ########################################################################################
        #
        # Run vsearch and get the alignment variant_read_count_input_df
        #
        ########################################################################################

        filter_pcr_error_runner = RunnerFilterPCRerror(
            variant_expected_df=variant_per_sample_df,
            variant_unexpected_df=variant_per_sample_df,
            variant_read_count_df=variant_read_count_per_sample_df)
        filter_output_per_sample_df = \
            filter_pcr_error_runner.get_variant_read_count_delete_df(
                pcr_error_var_prop)

        ########################################################################################
        #
        # Per sample, add to the record list
        #
        ########################################################################################

        record_per_sample_list = \
            ModelVariantReadCountLike.filter_delete_df_to_dict(
                filter_output_per_sample_df)
        record_list = record_list + record_per_sample_list

    variant_read_count_delete_df = pandas.DataFrame.from_records(
        data=record_list)

    ############################################################################################
    #
    # 5. Write to DB
    # 6. Touch output tables, to update modification date
    # 7. Exit vtam if all variants are deleted
    #
    ############################################################################################

    DataframeVariantReadCountLike(variant_read_count_delete_df).to_sql(
        engine=engine,
        variant_read_count_like_model=output_filter_pcr_error_model)

    for output_table_i in self.specify_output_table():
        declarative_meta_i = self.output_table(output_table_i)
        obj = session.query(declarative_meta_i).order_by(
            declarative_meta_i.id.desc()).first()
        session.query(declarative_meta_i).filter_by(id=obj.id).update(
            {'id': obj.id})
        session.commit()

    if variant_read_count_delete_df.filter_delete.sum(
    ) == variant_read_count_delete_df.shape[0]:
        Logger.instance().warning(
            VTAMexception("This filter has deleted all the variants: {}. "
                          "The analysis will stop here.".format(
                              self.__class__.__name__)))
        sys.exit(0)
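
# A minimal sketch of the PCR-error rule applied per sample above: given pairs
# of variants whose sequences differ by a single mismatch, the less abundant
# "unexpected" variant is deleted when
# N_unexpected / N_expected < pcr_error_var_prop. The pairing itself (done with
# vsearch alignments inside RunnerFilterPCRerror) and the pairs_df columns are
# assumptions for illustration.
def _sketch_pcr_error_rule(pairs_df, pcr_error_var_prop):
    # pairs_df columns (hypothetical): 'N_expected', 'N_unexpected',
    # 'unexpected_variant_id'
    ratio = pairs_df.N_unexpected / pairs_df.N_expected
    return pairs_df.loc[ratio < pcr_error_var_prop, 'unexpected_variant_id']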