def sort_results(rvcf_input_file_path, taxon_table_file_path, transform,
                 r_sqr_median_cutoff, stability_cutoff, snp_count, no_tables,
                 extra_columns):
    """Print the top SNPs by ``rsq_median`` and the taxa selected for each.

    The rvcf file is read as a tab-separated table and sorted by its
    ``rsq_median`` column, highest first. For each of the first
    ``snp_count`` rows the function prints the SNP's ``GENE``, ``ID`` and
    ``rsq_median``, followed by one line per taxon whose score in that row
    is at least ``stability_cutoff``. Taxon scores are read from the rvcf
    columns named after the index of the taxon table.

    Unless ``no_tables`` is truthy, a tab-separated table with one row per
    aligned sample is also written to stdout for every selected taxon.

    Args:
        rvcf_input_file_path: path of the tab-separated rvcf file.
        taxon_table_file_path: path handed to ``read_taxon_file``.
        transform: forwarded to ``read_taxon_file`` as ``transform``.
        r_sqr_median_cutoff: threshold used only for the informational
            "SNPs with r_sqr >" count; it does not filter the output.
        stability_cutoff: taxa scoring below this value are not reported.
        snp_count: maximum number of SNP rows to report.
        no_tables: when truthy, the per-taxon tables are not built or
            written.
        extra_columns: comma-separated names of additional rvcf columns to
            include in the per-taxon tables, or a falsy value for none.

    Returns:
        None. All results are printed to stdout.
    """
    print('plotting {} SNPs from {}'.format(snp_count, rvcf_input_file_path))

    # read the rvcf file and sort by rsq_median
    df = pd.read_csv(rvcf_input_file_path, sep='\t')
    sorted_rsq_best_medians_df = df.sort_values(by='rsq_median', ascending=False)

    # only the number of SNPs above the cutoff is reported, so count the
    # boolean mask instead of materialising a filtered copy
    snps_above_cutoff = int(
        (sorted_rsq_best_medians_df.rsq_median > r_sqr_median_cutoff).sum()
    )
    print('{} SNPs with r_sqr > {:5.3f}'.format(snps_above_cutoff, r_sqr_median_cutoff))

    taxon_table_df = read_taxon_file(taxon_table_file_path, transform=transform)

    # the displayed columns do not depend on the SNP or the taxon
    extra_column_names = extra_columns.split(',') if extra_columns else []
    columns_to_display = [
        'abundance', 'variant_allele_count', 'genotype', 'sample_id'
    ] + extra_column_names

    for row_i in range(sorted_rsq_best_medians_df.shape[0]):
        if row_i >= snp_count:
            break

        # get a 1-row dataframe, and the same row as a Series for field access
        snp_df = sorted_rsq_best_medians_df.iloc[[row_i]]
        snp_row = snp_df.iloc[0]

        aligned_snp_df, aligned_taxa_df = align_snp_and_taxa(snp_df, taxon_table_df)

        # get the taxon stability selection scores
        # use the taxon table df index to get column names for snp_df
        taxon_scores_df = snp_df.loc[:, taxon_table_df.index].transpose()
        sorted_taxon_scores_df = taxon_scores_df.sort_values(
            by=taxon_scores_df.columns[0], ascending=False
        )

        print('{} {} {:5.3f}'.format(snp_row.GENE, snp_row.ID, snp_row.rsq_median))

        # per-SNP values shared by every selected taxon
        sample_count = aligned_snp_df.shape[1]
        aligned_snp_value_list = aligned_snp_df.values.flatten().tolist()

        for selected_taxon, selected_taxon_row in sorted_taxon_scores_df.iterrows():
            # the transposed 1-row frame has a single column: the score
            selected_taxon_score = selected_taxon_row.iloc[0]
            if selected_taxon_score < stability_cutoff:
                # scores are sorted high to low, so nothing further qualifies
                break

            # trim 'Root;' from the front of the taxon name
            if selected_taxon.startswith('Root;'):
                taxon_name = selected_taxon[5:]
            else:
                taxon_name = selected_taxon
            print(' {:5.3f} {}'.format(selected_taxon_score, taxon_name))

            if no_tables:
                # the table is only ever written to stdout; skip building it
                continue

            # genotype strings indexed by the variant allele count (0, 1, 2)
            gts = [
                snp_row.REF + snp_row.REF,  # 0
                snp_row.REF + snp_row.ALT,  # 1
                snp_row.ALT + snp_row.ALT,  # 2
            ]
            data_dict = {
                'chromosome': [snp_row.CHROM] * sample_count,
                'snp_id': [snp_row.ID] * sample_count,
                'gene': [snp_row.GENE] * sample_count,
                'taxon': [selected_taxon] * sample_count,
                'abundance': aligned_taxa_df[selected_taxon].values.tolist(),
                'variant_allele_count': [str(int(v)) for v in aligned_snp_value_list],
                'genotype': [gts[int(v)] for v in aligned_snp_value_list],
                'sample_id': aligned_snp_df.columns,
            }
            for extra_column in extra_column_names:
                # a scalar from the SNP row, repeated on every sample line
                data_dict[extra_column] = snp_row[extra_column]

            p_df = pd.DataFrame(data_dict)
            p_df[columns_to_display].to_csv(sys.stdout, sep='\t')
def box_bar_lasso_lars_cv_C_stability_selection_features(
        rvcf_input_file_path, taxon_table_file_path, transform,
        plot_output_dir_path, stability_cutoff, snp_count):
    """Write box plots and stacked bar plots for the top SNPs' selected taxa.

    The rvcf file is read as a tab-separated table and sorted by its
    ``rsq_median`` column, highest first. For each of the first
    ``snp_count`` rows, every taxon whose score in that row is at least
    ``stability_cutoff`` gets one box plot PDF; if at least one taxon was
    selected, a single stacked bar plot PDF covering all of them is written
    as well. Taxon scores are read from the rvcf columns named after the
    index of the taxon table. Plotting is delegated to the callables
    returned by ``get_taxon_abundance_box_plot`` and
    ``get_taxon_abundance_stacked_bar_plot``.

    One summary line per SNP (gene, SNP id, comma-separated taxon names) is
    printed to stdout, along with each taxon name and its R data frame.

    WARNING: if ``plot_output_dir_path`` already exists it is deleted with
    all of its contents before the new plots are written.

    Args:
        rvcf_input_file_path: path of the tab-separated rvcf file.
        taxon_table_file_path: path handed to ``read_taxon_file``.
        transform: forwarded to ``read_taxon_file`` as ``transform``.
        plot_output_dir_path: directory that receives the PDF files; it is
            removed first if it exists, then created.
        stability_cutoff: taxa scoring below this value are not plotted.
        snp_count: maximum number of SNP rows to plot.

    Returns:
        None.
    """
    print('plotting {} SNPs from {}'.format(snp_count, rvcf_input_file_path))

    if os.path.exists(plot_output_dir_path):
        # delete it
        print('deleting old plots')
        shutil.rmtree(plot_output_dir_path)
    os.makedirs(plot_output_dir_path)

    # read the rvcf file and sort by rsq_median
    df = pd.read_csv(rvcf_input_file_path, sep='\t')
    sorted_rsq_best_medians_df = df.sort_values(by='rsq_median', ascending=False)

    taxon_table_df = read_taxon_file(taxon_table_file_path, transform=transform)

    # these are proxies for R functions
    taxon_abundance_box_plot = get_taxon_abundance_box_plot()
    taxon_abundance_stacked_bar_plot = get_taxon_abundance_stacked_bar_plot()

    for row_i in range(sorted_rsq_best_medians_df.shape[0]):
        if row_i >= snp_count:
            break

        # get a 1-row dataframe, and the same row as a Series for field access
        snp_df = sorted_rsq_best_medians_df.iloc[[row_i]]
        snp_row = snp_df.iloc[0]

        aligned_snp_df, aligned_taxa_df = align_snp_and_taxa(snp_df, taxon_table_df)

        # get the taxon stability selection scores
        # use the taxon table df index to get column names for snp_df
        taxon_scores_df = snp_df.loc[:, taxon_table_df.index].transpose()
        # DataFrame.sort() no longer exists in pandas; sort_values() is the
        # replacement and is what the rest of this file already uses
        sorted_taxon_scores_df = taxon_scores_df.sort_values(
            by=taxon_scores_df.columns[0], ascending=False
        )

        # per-SNP values shared by every selected taxon
        sample_count = aligned_snp_df.shape[1]
        aligned_snp_value_list = aligned_snp_df.values.flatten().tolist()
        snp_label = '{} {}'.format(snp_row.GENE, snp_row.ID)

        p_df_list = []
        selected_taxon_names = []
        for i, (selected_taxon, selected_taxon_row) in enumerate(sorted_taxon_scores_df.iterrows()):
            # the transposed 1-row frame has a single column: the score
            selected_taxon_score = selected_taxon_row.iloc[0]
            if selected_taxon_score < stability_cutoff:
                # scores are sorted high to low, so nothing further qualifies
                break

            # trim 'Root;' from the front of the taxon name
            if selected_taxon.startswith('Root;'):
                taxon_name = selected_taxon[5:]
            else:
                taxon_name = selected_taxon
            selected_taxon_names.append(taxon_name)

            # print a box plot
            r_pdf_file_path = os.path.join(
                plot_output_dir_path,
                'best_taxa_{}_{}_{}_boxplot_{}.pdf'.format(
                    row_i, snp_row.GENE, snp_row.ID, i
                )
            )

            # genotype strings indexed by the variant allele count (0, 1, 2)
            gts = [
                snp_row.REF + snp_row.REF,  # 0
                snp_row.REF + snp_row.ALT,  # 1
                snp_row.ALT + snp_row.ALT,  # 2
            ]
            # computed once and shared by the pandas and the R data frames
            abundances = aligned_taxa_df[selected_taxon].values.tolist()
            allele_counts = [str(int(v)) for v in aligned_snp_value_list]
            genotypes = [gts[int(v)] for v in aligned_snp_value_list]

            p_df_list.append(pd.DataFrame({
                'chromosome': [snp_row.CHROM] * sample_count,
                'snp_id': [snp_row.ID] * sample_count,
                'gene': [snp_row.GENE] * sample_count,
                'taxon': [selected_taxon] * sample_count,
                'abundance': abundances,
                'variant_allele_count': allele_counts,
                'gt': genotypes
            }))

            r_df = rpy2.robjects.vectors.DataFrame({
                'abundance': rpy2.robjects.FloatVector(abundances),
                'variant_allele_count': rpy2.robjects.StrVector(allele_counts),
                'genotype': rpy2.robjects.StrVector(genotypes)
            })
            print(taxon_name)
            print(r_df)
            taxon_abundance_box_plot(
                r_df,
                r_pdf_file_path,
                '{} (score: {:4.3f})'.format(snp_row.GENE, selected_taxon_score),
                snp_label,
                selected_taxon
            )

        # write a summary line; joining the names (rather than slicing a
        # trailing ', ' off) keeps the SNP id intact when no taxon was selected
        print('{}\t{}\t{}'.format(
            snp_row.GENE, snp_row.ID, ', '.join(selected_taxon_names)
        ))

        # save a stacked bar plot
        if p_df_list:
            file_name = 'stacked_bar_plot_selected_taxa_{}_{}.pdf'.format(
                snp_row.GENE, snp_row.ID
            )
            stacked_bar_plot_file_path = os.path.join(plot_output_dir_path, file_name)

            # each per-taxon frame carries its own 0..n-1 index; renumber the
            # stacked rows so the combined index is unique
            p_df = pd.concat(p_df_list, axis=0, ignore_index=True)

            r_all_df = rpy2.robjects.vectors.DataFrame({
                'abundance': rpy2.robjects.FloatVector(p_df['abundance'].values.tolist()),
                'variant_allele_count': rpy2.robjects.StrVector(
                    [str(int(v)) for v in p_df['variant_allele_count'].values]
                ),
                'taxon': rpy2.robjects.StrVector(p_df['taxon']),
                'gene': rpy2.robjects.StrVector(p_df['gene']),
                'genotype': rpy2.robjects.StrVector(p_df['gt'])
            })
            stacked_bar_title = '{}\n{}'.format(snp_row.GENE, snp_row.ID)
            taxon_abundance_stacked_bar_plot(
                r_all_df,
                stacked_bar_plot_file_path,
                stacked_bar_title,
                snp_label,
                'median abundance'
            )