def plotScores(data, palette, pdf):
    """
    Create a scatter plot for every pairwise combination of the principal
    components in ``data`` (PC1 vs PC2, PC1 vs PC3, PC2 vs PC3) and append
    each figure to an already-open PDF.

    :Arguments:
        :type data: pandas.core.frame.DataFrame
        :param data: Data frame with the scores to plot, one column per
            principal component.

        :type palette: palette object
        :param palette: Provides per-sample colors (palette.design.colors),
            the group-to-color mapping (palette.ugColors) and the legend
            group name (palette.combName).
            # assumed from attribute use here - TODO confirm against caller

        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: Open PDF object the figures are appended to.

    :Return:
        :rtype PDF: file
        :retrn PDF: file with the scatter plots for each pair of
            principal components.
    """
    # One figure per unordered pair of score columns.
    for x, y in list(itertools.combinations(data.columns.tolist(), 2)):
        # Creating a figure handler object
        fh = figureHandler(proj="2d", figsize=(14, 8))

        # Creating title for the figure
        title = "{0} vs {1}".format(x, y)

        # Creating the scatterplot 2D
        scatter.scatter2D(ax=fh.ax[0], x=list(data[x]), y=list(data[y]),
                          colorList=palette.design.colors.tolist())

        # Despine axis
        fh.despine(fh.ax[0])

        # Legend maps each group to its color.
        fh.makeLegend(ax=fh.ax[0], ucGroups=palette.ugColors,
                      group=palette.combName)

        # Shrinking the plot so everything fits
        fh.shrink()

        # Format Axis
        fh.formatAxis(figTitle=title,
                      xTitle="Scores on {0}".format(x),
                      yTitle="Scores on {0}".format(y),
                      grid=False)

        # Adding figure to pdf
        fh.addToPdf(dpi=90, pdfPages=pdf)
def main(args):
    """
    Draw a single 2D scatter plot of two columns of the wide dataset,
    optionally colored by group, and save it to a PDF file.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed arguments. Uses: design, input, group, uniqID,
            x, y, figure.
    """
    # Loading design (optional)
    if args.design:
        # pandas.DataFrame.from_csv is deprecated (removed in pandas 1.0);
        # read_csv with index_col=0 is the documented replacement.
        design = pd.read_csv(args.design, sep="\t", index_col=0)
        design.reset_index(inplace=True)
    else:
        design = False

    # Loading wide file
    wide = pd.read_csv(args.input, sep="\t", index_col=0)

    # Create figureHandler object
    fh = figureHandler(proj="2d", figsize=(14, 8))

    # If design file with group and the uniqID is "sampleID" then color by group
    if args.group and args.uniqID == "sampleID":
        glist = list(design[args.group])
        colorList, ucGroups = palette.getColorsByGroup(
            design=design, group=args.group, uGroup=sorted(set(glist)))
    else:
        glist = list()
        colorList = palette.mpl_colors[0]
        ucGroups = dict()

    # Plot scatterplot 2D
    scatter.scatter2D(ax=fh.ax[0], x=list(wide[args.x]), y=list(wide[args.y]),
                      colorList=colorList)

    # Despine axis (spine = tick)
    fh.despine(fh.ax[0])

    # Formatting axis
    fh.formatAxis(figTitle=args.x + " vs " + args.y, xTitle=args.x,
                  yTitle=args.y, grid=False)

    # If groups are provided create a legend and shrink the axis to fit it.
    if args.group and args.uniqID == "sampleID":
        fh.makeLegend(ax=fh.ax[0], ucGroups=ucGroups, group=args.group)
        fh.shrink()

    # Saving figure to file
    with PdfPages(args.figure) as pdfOut:
        fh.addToPdf(dpi=600, pdfPages=pdfOut)

    logger.info("Script Complete!")
def plotScatterplot2D(data, palette, pdf, nloads=3):
    """
    Draw pairwise 2D scatter plots for the first ``nloads`` PCA loadings.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Loadings of the PCA, one column per component.

        :type palette: palette object
        :param palette: Provides per-sample colors (palette.design.colors),
            group colors (palette.ugColors) and the legend group name
            (palette.combName).
            # assumed from attribute use here - TODO confirm against caller

        :type pdf: pdf object
        :param pdf: PDF object to save all the generated figures.

        :type nloads: int
        :param nloads: Number of principal components used to build the
            pairwise combinations.
    """
    # Only the first `nloads` columns take part in the pairwise combos.
    selected_columns = data.columns.tolist()[:nloads]
    for first, second in combinations(selected_columns, 2):
        # Fresh single-axis figure for this pair.
        handler = figureHandler(proj="2d", figsize=(14, 8))
        plot_title = "{0} vs {1}".format(first, second)

        # Scatter the two selected loading columns against each other.
        scatter.scatter2D(x=list(data[first]), y=list(data[second]),
                          colorList=palette.design.colors.tolist(),
                          ax=handler.ax[0])

        # Legend, then shrink the axis so the legend fits, then despine.
        handler.makeLegend(ax=handler.ax[0], ucGroups=palette.ugColors,
                           group=palette.combName)
        handler.shrink()
        handler.despine(handler.ax[0])

        # Titles and axis labels; no grid.
        handler.formatAxis(figTitle=plot_title,
                           xTitle="Scores on {0}".format(first),
                           yTitle="Scores on {0}".format(second),
                           grid=False)

        # Append the finished figure to the shared PDF.
        handler.addToPdf(dpi=600, pdfPages=pdf)
def makeScatter(x, y, ax, fh):
    """ Plot a scatter plot of x vs y with a regression line and CI band.

    :Arguments:
        :type x: pandas.Series
        :param x: Series of first sample, treated as independent variable.

        :type y: pandas.Series
        :param y: Series of second sample, treated as dependent variables.

        :type ax: matplotlib.axis
        :param ax: Axis which to plot.

        :type fh: figureHandler
        :param fh: figure to draw the plot onto.

    :Returns:
        :rtype: matplotlib.axis
        :returns: A matplotlib axis with a scatter plot.
    """
    # Regression supplies the fitted line plus upper/lower CI curves.
    lower, upper, fitted, resid, infl = runRegression(x, y)

    # Raw data points.
    scatter.scatter2D(x=x, y=y, ax=ax, colorList=list("b"))

    # With missing data, x and the regression outputs differ in length;
    # align x to the fitted values' index before drawing the curves.
    aligned_x = x.loc[fitted.index]
    for curve in (lower, fitted, upper):
        lines.drawCutoff(x=aligned_x, y=curve, ax=ax)

    # Label the first axis of the figure.
    fh.formatAxis(axnum=0, xTitle=x.name, yTitle=y.name,
                  axTitle='Scatter plot', grid=False)
def qqPlot(tresid, tfit, oname):
    """ Plot the residual diagnostic plots by sample.

    Output q-q plot, boxplots and distributions of the residuals. These plots
    will be used diagnose if residuals are approximately normal.

    :Arguments:
        :type tresid: pandas.Series
        :param tresid: Pearson normalized residuals. (transposed)
                (residuals / sqrt(MSE))

        :type tfit: pandas DataFrame
        :param tfit: output of the ANOVA (transposed)

        :type oname: string
        :param oname: Name of the output file in pdf format.

    :Returns:
        :rtype: PDF
        :returns: Outputs a pdf file containing all plots.
    """
    # Open pdf
    with PdfPages(oname) as pdf:
        # Establishing axisLayout: three small axes on the top row and one
        # wide axis spanning the bottom row.
        axisLayout = [(0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 1, 1), (1, 0, 3, 1)]

        # One diagnostic figure per sample (column).
        for col in tresid.columns:
            # Creating figure
            fig = figureHandler(proj='2d', numAx=4, numRow=2, numCol=3,
                                arrangement=axisLayout)

            data = tresid[col].values.ravel()
            # (The original also built an unused per-point color list here;
            # removed as dead code.)
            df_data = pd.DataFrame(data)

            # Removing missing so that the qqplot will plot correctly.
            mask_nan_data = np.isnan(data)
            data = data[~mask_nan_data]

            # Plot qqplot on axis 0
            sm.graphics.qqplot(data, fit=True, line='r', ax=fig.ax[0])

            # Plot boxplot on axis 1
            box.boxSeries(ser=data, ax=fig.ax[1])

            # Plot histogram on axis 2
            hist.quickHist(ax=fig.ax[2], dat=df_data, orientation='horizontal')

            # Plot scatterplot on axis 3
            scatter.scatter2D(ax=fig.ax[3], x=tfit[col], y=tresid[col],
                              colorList=list('b'))

            # Draw cutoff line for scatterplot on axis 3
            lines.drawCutoffHoriz(ax=fig.ax[3], y=0)

            # Format axis 0
            fig.formatAxis(figTitle=col, axnum=0, grid=False, showX=True,
                           yTitle="Sample Quantiles", xTitle=" ")

            # Format axis 1
            fig.formatAxis(axnum=1, axTitle="Standardized Residuals",
                           grid=False, showX=False, showY=True, xTitle=" ")

            # Format axis 2
            fig.formatAxis(axnum=2, grid=False, showX=True, showY=True,
                           axTitle=" ", xTitle=" ")

            # Format axis 3
            fig.formatAxis(axnum=3,
                           axTitle="Predicted Values vs Residual Values",
                           xTitle="Predicted Values",
                           yTitle="Residual Values", grid=False)

            # Add figure to pdf
            fig.addToPdf(pdfPages=pdf)
def main(args):
    """
    Run two-sample t-tests — unpaired for every pair of groups, or paired
    for exactly two groups matched on a pairing variable — then write a
    summary table and a significance-flag table, and draw one volcano plot
    per group pair.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed command-line arguments. Uses: input, design,
            uniqueID, group, order (pairing variable or False), pairing
            ("unpaired" or "paired"), summaries, flags, volcano.
    """
    # Loading data through the Interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID,
                       group=args.group, runOrder=args.order, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data
    dat.dropMissing()

    # SCENARIO 1: Unpaired t-test. In this case there can be as many groups
    # as possible. Order variable is ignored and t-tests are performed
    # pairwise for each pair of groups.
    if args.pairing == "unpaired":
        logger.info("Unpaired t-test will be performed for all groups pairwise.")

        # Getting the unique groups and all pairwise combinations so that
        # we can feed them to pairwise unpaired t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]
        groups_pairwise = list(combinations(group_values_series_unique, 2))
        number_of_groups_pairwise = len(groups_pairwise)

        # Extracting data from the interface.
        data_frame = dat.transpose()

        # The number of features depends on whether the user provided an
        # ordering variable: it only adds an extra metadata column and is
        # otherwise ignored by the unpaired test.
        if args.order == False:
            number_of_features = data_frame.shape[1] - 1
        else:
            number_of_features = data_frame.shape[1] - 2

        # Computing overall summaries (mean and variance) for the output
        # table. This has nothing to do with the unpaired t-test itself.
        # Dropping the metadata column(s) and transposing is loop-invariant,
        # so it is done once (the original recomputed it per feature).
        if args.order == False:
            data_frame_transpose = data_frame.drop(args.group, axis=1).transpose()
        else:
            data_frame_transpose = data_frame.drop([args.group, args.order], axis=1).transpose()

        # Feature names, in row order of the transposed frame.
        indexes_list_complete = data_frame_transpose.index.tolist()

        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features
        for j in range(0, number_of_features):
            feature_series = data_frame_transpose.loc[indexes_list_complete[j]]
            mean_value_all[j] = np.mean(feature_series)
            variance_value_all[j] = np.var(feature_series, ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Computing means for each group and outputting them. This is also
        # just summary information for the output table.
        for i in range(0, number_of_unique_groups):
            # Extracting the pieces of the data frame that belong to the
            # ith group.
            data_frame_current_group = data_frame.loc[
                data_frame[args.group].isin([group_values_series_unique[i]])]

            # Drop the metadata column(s); only feature columns remain.
            if args.order == False:
                data_frame_current_group = data_frame_current_group.drop(
                    args.group, axis=1).transpose()
            else:
                data_frame_current_group = data_frame_current_group.drop(
                    [args.group, args.order], axis=1).transpose()

            # Pulling indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            means_value = [0] * number_of_features
            for j in range(0, number_of_features):
                means_value[j] = data_frame_current_group.loc[indexes_list[j]].mean()

            # Adding current mean_value column to the data frame.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            summary_df[means_value_column_name_current] = means_value

        # Running pairwise unpaired (two-sample) t-tests for all pairs of
        # group levels saved in groups_pairwise.
        for i in range(0, number_of_groups_pairwise):
            # Pieces of the data frame belonging to the i-th unique pair.
            groups_subset = groups_pairwise[i]
            data_frame_first_group = data_frame.loc[
                data_frame[args.group].isin([groups_subset[0]])]
            data_frame_second_group = data_frame.loc[
                data_frame[args.group].isin([groups_subset[1]])]

            # Drop the metadata column(s); only feature columns remain.
            if args.order == False:
                data_frame_first_group = data_frame_first_group.drop(args.group, axis=1).transpose()
                data_frame_second_group = data_frame_second_group.drop(args.group, axis=1).transpose()
            else:
                data_frame_first_group = data_frame_first_group.drop([args.group, args.order], axis=1).transpose()
                data_frame_second_group = data_frame_second_group.drop([args.group, args.order], axis=1).transpose()

            # Indexes are identical for both groups; pull from the first.
            indexes_list = data_frame_first_group.index.tolist()

            # Per-feature result lists, pre-filled with 0s.
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features
            difference_value = [0] * number_of_features

            for j in range(0, number_of_features):
                series_first = data_frame_first_group.loc[indexes_list[j]]
                series_second = data_frame_second_group.loc[indexes_list[j]]
                # Run the test once and unpack (statistic, pvalue); the
                # original called ttest_ind twice per feature.
                t_value[j], p_value[j] = ttest_ind(series_first, series_second)
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean()
                if p_value[j] < 0.01:
                    flag_value_0p01[j] = 1
                if p_value[j] < 0.05:
                    flag_value_0p05[j] = 1
                if p_value[j] < 0.10:
                    flag_value_0p10[j] = 1

            # Creating column names for the data frame.
            p_value_column_name_current = 'prob_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
            t_value_column_name_current = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
            neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
            difference_value_column_name_current = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1]

            # flag_df is created on the first pair and extended afterwards.
            if i == 0:
                flag_df = pd.DataFrame(data=flag_value_0p01,
                                       columns=[flag_value_column_name_current_0p01],
                                       index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point both data frames exist; append remaining columns.
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # SCENARIO 2: Paired t-test. In this case there should be EXACTLY TWO
    # groups. Each sample in one group should have exactly one matching
    # pair in the other group; the matching is controlled by args.order.
    if args.pairing == "paired":
        logger.info("Paired test will be performed for two groups pairwise based on pairing variable: {0}.".format(args.order))

        # Getting the number of unique groups; must be exactly 2.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]
        if number_of_unique_groups != 2:
            logger.warning(u"The number of unique groups is {0} and not 2 as expected. The paired t-test cannot be performed.".format(number_of_unique_groups))
            exit()

        # The single pairwise combination of the two groups, also used by
        # the volcano-plot loop at the end.
        groups_pairwise = list(combinations(group_values_series_unique, 2))
        number_of_groups_pairwise = len(groups_pairwise)

        # Extracting data from the interface.
        data_frame = dat.transpose()

        # Checking that the required pairing variable has been provided.
        if args.order == False:
            logger.info("The required t-test pairing variable has not been provided: The paired t-test cannot be performed.")
            exit()

        # Two columns (args.group and args.order) are metadata; the rest
        # are features.
        number_of_features = data_frame.shape[1] - 2

        # Each pairID must correspond to exactly two samples from DIFFERENT
        # groups. Anything else is removed below with a warning.
        pairid_values_series = dat.transpose()[dat.runOrder].T.squeeze()
        pairid_values_series_unique = pairid_values_series.unique()
        number_of_unique_pairid = pairid_values_series_unique.shape[0]

        for i in range(0, number_of_unique_pairid):
            # Pieces of the data frame that belong to the ith unique pairid,
            # transposed so samples become columns.
            data_frame_current_pairid = data_frame.loc[
                data_frame[args.order].isin([pairid_values_series_unique[i]])]
            data_frame_current_pairid = data_frame_current_pairid.transpose()
            sample_names_current_pairid = list(data_frame_current_pairid.columns.values)

            if data_frame_current_pairid.shape[1] != 2:
                # Wrong number of samples for this pairID: drop them all.
                logger.warning(u"Number of samples for the pairID: {0} is equal to {1} and NOT equal to 2. Sample(s) {2} will be removed from further analysis.".format(
                    pairid_values_series_unique[i],
                    data_frame_current_pairid.shape[1],
                    sample_names_current_pairid))
                boolean_indexes_to_delete = data_frame.index.isin(sample_names_current_pairid)
                data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)

            # Exactly two samples: verify their group IDs actually differ.
            elif data_frame_current_pairid.transpose()[args.group][0] == data_frame_current_pairid.transpose()[args.group][1]:
                logger.warning(u"Samples in pairID {0} have groupIDs: {1} and {2}. Should be different! Sample(s) {3} will be removed from further analysis.".format(
                    pairid_values_series_unique[i],
                    data_frame_current_pairid.transpose()[args.group][1],
                    data_frame_current_pairid.transpose()[args.group][0],
                    sample_names_current_pairid))
                boolean_indexes_to_delete = data_frame.index.isin(sample_names_current_pairid)
                data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)

        # Checking if the data frame became empty after cleaning.
        if data_frame.shape[0] == 0:
            logger.warning(u"Number of paired samples in the final dataset is exactly 0! Please check the desing file for accuracy! Exiting the program.")
            exit()

        # Computing overall summaries (mean and variance) for the output
        # table. Dropping metadata and transposing is loop-invariant, so it
        # is done once (the original recomputed it per feature).
        data_frame_transpose = data_frame.drop([args.group, args.order], axis=1).transpose()
        indexes_list_complete = data_frame_transpose.index.tolist()

        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features
        for j in range(0, number_of_features):
            feature_series = data_frame_transpose.loc[indexes_list_complete[j]]
            mean_value_all[j] = np.mean(feature_series)
            variance_value_all[j] = np.var(feature_series, ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Computing per-group means for the output table.
        for i in range(0, number_of_unique_groups):
            data_frame_current_group = data_frame.loc[
                data_frame[args.group].isin([group_values_series_unique[i]])]
            data_frame_current_group = data_frame_current_group.drop(
                [args.group, args.order], axis=1).transpose()
            indexes_list = data_frame_current_group.index.tolist()

            means_value = [0] * number_of_features
            for j in range(0, number_of_features):
                means_value[j] = data_frame_current_group.loc[indexes_list[j]].mean()

            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            summary_df[means_value_column_name_current] = means_value

        # Performing the paired t-test for the two groups. Extracting,
        # aligning and dropping the metadata columns does not depend on the
        # feature, so it is done once before the loop (the original redid
        # it for every feature).
        data_frame_first_group = data_frame.loc[
            data_frame[args.group].isin([group_values_series_unique[0]])]
        data_frame_second_group = data_frame.loc[
            data_frame[args.group].isin([group_values_series_unique[1]])]

        # Sort both frames by the pairing variable so the datasets are
        # aligned by pair when fed to the t-test. DataFrame.sort() has been
        # removed from pandas; sort_values() is the supported equivalent.
        data_frame_first_group = data_frame_first_group.sort_values(args.order)
        data_frame_second_group = data_frame_second_group.sort_values(args.order)

        # Drop metadata columns; only feature columns remain.
        data_frame_first_group = data_frame_first_group.drop([args.group, args.order], axis=1).transpose()
        data_frame_second_group = data_frame_second_group.drop([args.group, args.order], axis=1).transpose()

        # Same index list for the first and the second group.
        indexes_list = data_frame_first_group.index.tolist()

        # Per-feature result lists, pre-filled with 0s.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features

        # Performing paired t-test for each feature.
        for j in range(0, number_of_features):
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]
            # Run the test once and unpack (statistic, pvalue); the
            # original called ttest_rel twice per feature.
            t_value[j], p_value[j] = ttest_rel(series_first, series_second)
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()
            if p_value[j] < 0.01:
                flag_value_0p01[j] = 1
            if p_value[j] < 0.05:
                flag_value_0p05[j] = 1
            if p_value[j] < 0.10:
                flag_value_0p10[j] = 1

        # Creating column names for the data frame.
        p_value_column_name_current = 'prob_greater_than_t_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        t_value_column_name_current = 't_value_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        difference_value_column_name_current = 'diff_of_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        flag_value_column_name_current_0p01 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p01'
        flag_value_column_name_current_0p05 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p05'
        flag_value_column_name_current_0p10 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p10'

        summary_df[t_value_column_name_current] = t_value
        summary_df[p_value_column_name_current] = p_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_df = pd.DataFrame(data=flag_value_0p01,
                               columns=[flag_value_column_name_current_0p01],
                               index=indexes_list)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results up to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")

    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Indexing for volcano plots: -log10 p-values keyed by group pair.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}

    # Differences of treatment means keyed by group pair.
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")
            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Color results beyond threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                                   yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                                   xTitle="Difference of treatment means for {0}".format(current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finishing running of t-test.")
def plotDistances(df_distance, palette, plotType, disType, cutoff, p, pdf):
    """
    Plot the distances in ``df_distance`` (distance to the mean, or pairwise
    distances) as a scatterplot or as box-plots, colored by group, with
    cutoff lines, and append the figure to an open PDF.

    :Arguments:
        :type df_distance: pandas.DataFrame
        :param df_distance: Distances to plot; either contains a
            "distance_to_mean" column or one column per sample (pairwise).
            Temporarily gains a "colors" column which is dropped again
            before returning.

        :type palette: palette object
        :param palette: Provides per-sample colors (palette.design["colors"]),
            group colors (palette.ugColors) and the legend group name
            (palette.combName).
            # assumed from attribute use here - TODO confirm against caller

        :type plotType: string
        :param plotType: "Scatterplot" or "Box-plots".

        :type disType: string
        :param disType: Distance type; "Mahalanobis" selects the "Penalized"
            title prefix, anything else "Standardized".

        :type cutoff: pandas.DataFrame
        :param cutoff: Cutoff values; each column is forwarded to
            plotCutoffs.  # presumably one column per cutoff level - verify

        :type p: float
        :param p: Value forwarded to plotCutoffs.
            # presumably a percentile/significance level - TODO confirm

        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: Open PDF the figure is appended to.
    """
    # Getting number of samples in dataframe (ns stands for number of samples)
    ns = len(df_distance.index)

    # Calculate the width for the figure based on the number of samples
    figWidth = max(ns / 2, 16)

    # Keeping the order on the colors
    df_distance["colors"] = palette.design["colors"]

    # Create figure object with a single axis
    figure = figureHandler(proj='2d', figsize=(figWidth, 8))

    # Getting type of distance file
    if "distance_to_mean" in df_distance.columns:
        dataType = "to the mean"
    else:
        dataType = "pairwise"

    # Getting type of distance header
    if disType == "Mahalanobis":
        distType1 = "Penalized"
        distType2 = disType
    else:
        distType1 = "Standardized"
        distType2 = disType

    # Adds Figure title, x axis limits and set the xticks
    figure.formatAxis(figTitle="{0} for {1} {2} Distance for {3} {4}".format(
        plotType, distType1, distType2, df_distance.name, dataType),
        yTitle="{0} {1} Distance".format(distType1, distType2),
        xTitle="Index", ylim="ignore", xlim=(-0.5, -0.5 + ns),
        xticks=df_distance.index)

    # If distance to mean
    if dataType == "to the mean":
        # Plot scatterplot quickplot
        scatter.scatter2D(ax=figure.ax[0], colorList=df_distance["colors"],
                          x=range(len(df_distance.index)),
                          y=df_distance["distance_to_mean"])
    # if pairwise
    else:
        if plotType == "Scatterplot":
            # Plot scatterplot: one series per sample column.
            for index in df_distance.index:
                scatter.scatter2D(ax=figure.ax[0],
                                  colorList=df_distance["colors"][index],
                                  x=range(len(df_distance.index)),
                                  y=df_distance[index])
        elif plotType == "Box-plots":
            # Plot Box plot
            box.boxDF(ax=figure.ax[0], colors=df_distance["colors"],
                      dat=df_distance)

    # Shrink figure
    figure.shrink()

    # Plot legend
    figure.makeLegend(figure.ax[0], palette.ugColors, palette.combName)

    # Add a cutoff line per cutoff column
    cutoff.apply(lambda x: plotCutoffs(x, ax=figure.ax[0], p=p), axis=0)

    # Add figure to PDF and close the figure afterwards
    figure.addToPdf(pdf)

    # Drop "colors" column so it does not mess the caller's results
    df_distance.drop("colors", axis=1, inplace=True)
def makeBA(x, y, ax, fh): """ Function to make BA Plot comparing x vs y. :Arguments: :type x: pandas.Series :param x: Series of first sample, treated as independent variable. :type y: pandas.Series :param y: Series of second sample, treated as dependent variables. :type ax: matplotlib.axis :param ax: Axis which to plot. :type fh: figureHandler :param fh: figure to draw BA plots onto. :Returns: :rtype: pandas.Series :returns: A Series containing Boolean values with True indicating a value is more extreme than CI and False indicating a value falls inside CI. """ # Make BA plot x = x.apply(float) y = y.apply(float) diff = x - y mean = (x + y) / 2 # Drop missing for current comparison diff.dropna(inplace=True) mean.dropna(inplace=True) # Get Upper and Lower CI from regression lower, upper, fitted, resid, infl = runRegression(mean, diff) mask1 = abs(resid['resid_pearson']) > cutoff mask2 = infl['cooks_pval'] <= 0.5 mask3 = infl['dffits'] mask = mask1 | mask2 | mask3 # Create BA plot scatter.scatter2D(ax=ax, x=mean[~mask], y=diff[~mask], colorList='b') scatter.scatter2D(ax=ax, x=mean[mask], y=diff[mask], colorList='r') # Plot regression lines ax.plot(mean, lower, 'r:') ax.plot(mean, fitted, 'r') ax.axhline(0, color='k') ax.plot(mean, upper, 'r:') #Adjust axes fh.formatAxis(axnum=1, xlim='ignore', ylim='ignore', axTitle='Bland-Altman Plot', xTitle='Mean\n{0} & {1}'.format(x.name, y.name), yTitle='Difference\n{0} - {1}'.format(x.name, y.name), grid=False) return mask, mask1, mask2, mask3
def main(args):
    """
    Run a single-sample t-test (H0: mu = args.mu) on every feature, either
    per group level (when args.group is supplied) or over the entire
    dataset, then write a summary table, a flag table and volcano plots.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed command-line options. Uses: input, design,
            uniqueID, group, mu, summaries, flags, volcano.
            NOTE(review): args.mu is concatenated into column names, so it
            is presumably a string — confirm in the argument parser.
    """
    # If the user provides a grouping variable we test each group against
    # the null (mu supplied by user, 0 is the default).
    if args.group != False:
        logger.info(
            u"""t-test will be performed for all groups saved in [{0}] variable in the desing file pairwise with the H_0: mu = {1}."""
            .format(args.group, args.mu))

        # Loading data through the Interface.
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input,
                           args.design,
                           args.uniqueID,
                           group=args.group,
                           logger=logger)

        # Treat everything as numeric.
        dat.wide = dat.wide.applymap(float)

        # Cleaning from the missing data.
        dat.dropMissing()

        # Getting the unique group values so that we will feed them to the t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting number of features. We subtract 1 since we have provided args.group
        number_of_features = data_frame.shape[1] - 1

        # Computing overall summaries (mean and variance).
        # This part just produces summary statistics for the output table.
        # This has nothing to do with the single sample t-test.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Creating duplicate for manipulation.
            data_frame_manipulate = data_frame
            # Dropping columns that characterize group. Only feature columns
            # will remain. We also transpose here so it will be easier to
            # operate with.
            # NOTE(review): this drop/transpose is loop-invariant and could
            # be hoisted out of the loop to avoid O(n^2) recomputation.
            data_frame_manipulate_transpose = data_frame_manipulate.drop(
                args.group, 1).transpose()
            # Pulling indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()
            # Computing dataset summaries for feature j.
            mean_value_all[j] = np.mean(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all,
                                  columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Running single sample t-test for all groups.
        # We are also computing means for each group and outputting them.
        for i in range(0, number_of_unique_groups):
            # Extracting the pieces of the data frame that belong to the ith group.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]
            # Dropping columns that characterize group. Only feature columns
            # will remain. We also transpose here so it will be easier to
            # operate with.
            data_frame_current_group = data_frame_current_group.drop(
                args.group, 1).transpose()
            # Pulling indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Per-feature result holders for the current group, zero-filled.
            means_value = [0] * number_of_features
            difference_value = [0] * number_of_features
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features

            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

                # Performing one sample t-test
                ttest_1samp_args = [series_current, float(args.mu)]
                p_value[j] = ttest_1samp(*ttest_1samp_args)[1]
                t_value[j] = ttest_1samp(*ttest_1samp_args)[0]
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = means_value[j] - float(args.mu)
                # Significance flags at the three conventional alpha levels.
                if p_value[j] < 0.01:
                    flag_value_0p01[j] = 1
                if p_value[j] < 0.05:
                    flag_value_0p05[j] = 1
                if p_value[j] < 0.10:
                    flag_value_0p10[j] = 1

            # Creating names for the current analysis columns and adding
            # result columns to the data frame.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            p_value_column_name_current = 'prob_greater_than_t_for_diff_' + group_values_series_unique[i] + '_' + args.mu
            t_value_column_name_current = 't_value_for_diff_' + group_values_series_unique[i] + '_' + args.mu
            neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[i] + '_' + args.mu
            difference_value_column_name_current = 'diff_of_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + group_values_series_unique[i] + '_' + args.mu

            # Adding flag_value column to the data frame and assigning the name.
            # If the flag data frame has not been created yet (i == 0) we
            # create it on the fly.
            if i == 0:
                flag_df = pd.DataFrame(
                    data=flag_value_0p01,
                    columns=[flag_value_column_name_current_0p01],
                    index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point data frames (summary and flags) exist so only
            # columns are added to the existing data frame.
            summary_df[means_value_column_name_current] = means_value
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # If the user does not provide a grouping variable we test the whole
    # dataset as a single group against the null (mu supplied by user, 0 is
    # the default).
    if args.group == False:
        logger.info(
            u"""t-test will be performed for the entire dataset since goruping variable was not provided."""
        )

        # Loading data through the interface
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input, args.design, args.uniqueID, logger=logger)

        # Treat everything as numeric
        dat.wide = dat.wide.applymap(float)

        # Cleaning from missing data
        dat.dropMissing()

        # Saving the number of unique groups that will be used for plotting.
        # Since we did not feed any grouping variable it is exactly one.
        number_of_unique_groups = 1

        # Extracting data from the interface.
        data_frame = dat.wide.transpose()
        # Extracting number of features. We do not subtract 1 since we have
        # not provided args.group
        number_of_features = data_frame.shape[1]

        # Per-feature result holders, zero-filled.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features

        for j in range(0, number_of_features):
            # We transpose here so data will be easier to operate on.
            # NOTE(review): loop-invariant; could be hoisted out of the loop.
            data_frame_manipulate_transpose = data_frame.transpose()
            # Pulling indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries.
            mean_value_all[j] = np.mean(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                ddof=1)

            # Performing one sample t-test for the entire dataset.
            ttest_1samp_args = [
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                float(args.mu)
            ]
            p_value[j] = ttest_1samp(*ttest_1samp_args)[1]
            t_value[j] = ttest_1samp(*ttest_1samp_args)[0]
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = mean_value_all[j] - float(args.mu)
            # Significance flags at the three conventional alpha levels.
            if p_value[j] < 0.01:
                flag_value_0p01[j] = 1
            if p_value[j] < 0.05:
                flag_value_0p05[j] = 1
            if p_value[j] < 0.10:
                flag_value_0p10[j] = 1

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all,
                                  columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Creating names for the current analysis columns and adding result
        # columns to the data frame.
        means_value_column_name_current = 'mean_treatment_all'
        p_value_column_name_current = 'prob_greater_than_t_for_diff_all_' + args.mu
        t_value_column_name_current = 't_value_for_diff_all_' + args.mu
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_all_' + args.mu
        difference_value_column_name_current = 'diff_of_all_' + args.mu
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_all_' + args.mu
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_all_' + args.mu
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_all_' + args.mu

        summary_df[means_value_column_name_current] = mean_value_all
        summary_df[p_value_column_name_current] = p_value
        summary_df[t_value_column_name_current] = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_df = pd.DataFrame(data=flag_value_0p01,
                               columns=[flag_value_column_name_current_0p01],
                               index=indexes_list_complete)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results up to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for volcano plots.
    # Getting data for lpvals (negative log10 p-values keyed by group_mu).
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
              if col.startswith("neg_log10_p_value")}
    # Getting data for diffs (mean differences keyed by group_mu).
    difs = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_unique_groups):
            # Set up figure
            volcanoPlot = figureHandler(proj="2d")

            # If no grouping variable is provided there is a single 'all' key.
            if number_of_unique_groups == 1:
                current_key = 'all_' + args.mu
            else:
                current_key = group_values_series_unique[i] + '_' + args.mu

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".
                format(current_key),
                xTitle="Difference of the means from H0 for {0}".format(
                    current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    logger.info(u"Volcano plots have been created.")
    logger.info(u"Finishing running of t-test.")
def plotSignificantROR(data, pdf, palette):
    """
    Plot run-order regression scatter plots for every significant feature.

    For each row (feature) with regression p-value <= 0.05, draw the raw
    values vs run order, the fitted regression line and 95% prediction
    bands, annotate slope/p-value/R^2, and append the figure to ``pdf``.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: One row per feature with columns "x", "y", "res",
            "fitted", "slope", "pval", "rsq" and "name" (regression
            results; "res" is the fitted model passed to
            wls_prediction_std).

        :type pdf: PdfPages
        :param pdf: pdf object to store scatterplots.

        :type palette: colorHandler-like object
        :param palette: Provides ``list_colors``, ``ugColors`` and
            ``combName`` for point colors and the legend.
    """
    # Iterate over all rows in the dataframe; only plot features whose
    # regression p-value is significant at the 0.05 level.
    for index, row in data.iterrows():
        if row["pval"] > 0.05:
            continue

        # Get 95% CI (prediction bands) from the fitted model.
        prstd, lower, upper = wls_prediction_std(row["res"])

        # Sort CIs by x for plotting so the band lines are monotone in x.
        toPlot = pd.DataFrame({"x": row["x"], "lower": lower, "upper": upper})
        toPlot.sort_values(by="x", inplace=True)

        # Create plot
        fh = figureHandler(proj="2d", figsize=(14, 8))

        # Plot scatterplot of raw values
        scatter.scatter2D(ax=fh.ax[0],
                          x=row["x"],
                          y=row["y"],
                          colorList=palette.list_colors)

        # Plot fitted line (cyan) and the two prediction-band limits (red)
        lines.drawCutoff(ax=fh.ax[0], x=row["x"], y=row["fitted"], c="c")
        lines.drawCutoff(ax=fh.ax[0], x=toPlot["x"], y=toPlot["lower"], c="r")
        lines.drawCutoff(ax=fh.ax[0], x=toPlot["x"], y=toPlot["upper"], c="r")

        # Formatting: headroom of 20% above the data for the annotation text.
        ymin, ymax = fh.ax[0].get_ylim()
        fh.formatAxis(xTitle="Run Order", yTitle="Value",
                      ylim=(ymin, ymax * 1.2),
                      figTitle=u"{} Scatter plot (fitted regression line and prediction bands"\
                      " included)".format(row["name"]))

        # Shrink figure so the legend fits
        fh.shrink()

        # Add legend to figure
        fh.makeLegend(ax=fh.ax[0],
                      ucGroups=palette.ugColors,
                      group=palette.combName)

        # Add text annotation to the axis.
        # BUGFIX: the R^2 format spec was "{2:4f}" (minimum width 4, default
        # 6 decimals); changed to "{2:.4f}" to match the 4-decimal formatting
        # of slope and p-value.
        fh.ax[0].text(.7, .85, u"Slope= {0:.4f}\n(p-value = {1:.4f})\n"\
            "$R^2$ = {2:.4f}".format(round(row["slope"], 4),
                                     round(row["pval"], 4),
                                     round(row["rsq"], 4)),
            transform=fh.ax[0].transAxes, fontsize=12)

        # Save to PDF
        fh.addToPdf(pdf)
def makePlots(SEDData, design, pdf, groupName, cutoff, p, plotType, ugColors,
              levels):
    """
    Manage all the plots for this script.

    :Arguments:
        :type SEDData: pandas.dataFrame
        :param SEDData: Contains SED data either to Mean or pairwise

        :type design: pandas.dataFrame
        :param design: Design file after getColor

        :type pdf: PDF object
        :param pdf: PDF for output plots

        :type groupName: string
        :param groupName: Name of the group (figure title).

        :type cutoff: pandas.dataFrame
        :param cutoff: Cutoff values, beta, chi-sqr and normal.

        :type p: float
        :param p: Percentile for cutoff.

        :type plotType: string
        :param plotType: Type of plot; one of "scatterToMean",
            "scatterPairwise" or "boxplotPairwise".

        :type ugColors: dict
        :param ugColors: unique-group-to-color mapping passed to makeLegend
            (assumed from usage — TODO confirm).

        :type levels: string
        :param levels: legend group label passed to makeLegend
            (assumed from usage — TODO confirm).
    """
    # Getting number of features in dataframe
    nFeatures = len(SEDData.index)

    # Calculates the width for the figure based on the number of features
    figWidth = max(nFeatures / 2, 16)

    # Create figure object with a single axis and initiate the figure
    figure = figureHandler(proj='2d', figsize=(figWidth, 8))

    # Keeping the order on the colors
    SEDData["colors"] = design["colors"]

    # Choose type of plot
    # Plot scatterplot to mean
    if (plotType == "scatterToMean"):
        # Adds figure title, x axis limits and sets the xticks
        figure.formatAxis(
            figTitle=
            "Standardized Euclidean Distance from samples {} to the mean".
            format(groupName),
            xlim=(-0.5, -0.5 + nFeatures),
            ylim="ignore",
            xticks=SEDData.index.values,
            xTitle="Index",
            yTitle="Standardized Euclidean Distance")

        # Plot scatterplot quickplot
        scatter.scatter2D(ax=figure.ax[0],
                          colorList=SEDData["colors"],
                          x=range(len(SEDData.index)),
                          y=SEDData["SED_to_Mean"])

    # Plot scatterplot pairwise
    elif (plotType == "scatterPairwise"):
        # Adds figure title, x axis limits and sets the xticks
        figure.formatAxis(
            figTitle="Pairwise standardized Euclidean Distance from samples {}"
            .format(groupName),
            xlim=(-0.5, -0.5 + nFeatures),
            ylim="ignore",
            xticks=SEDData.index.values,
            xTitle="Index",
            yTitle="Standardized Euclidean Distance")

        # Plot one scatter series per sample column
        for index in SEDData.index.values:
            scatter.scatter2D(ax=figure.ax[0],
                              colorList=design["colors"][index],
                              x=range(len(SEDData.index)),
                              y=SEDData[index])

    # Plot boxplot pairwise
    elif (plotType == "boxplotPairwise"):
        # Add figure title, x axis limits and set the xticks
        figure.formatAxis(
            figTitle=
            "Box-plots for pairwise standardized Euclidean Distance from samples {}"
            .format(groupName),
            xlim=(-0.5, -0.5 + nFeatures),
            ylim="ignore",
            xticks=SEDData.index.values,
            xTitle="Index",
            yTitle="Standardized Euclidean Distance")
        # Plot box plot
        box.boxDF(ax=figure.ax[0],
                  colors=SEDData["colors"].values,
                  dat=SEDData)

    # Add a cutoff line per cutoff column
    cutoff.apply(lambda x: plotCutoffs(x, ax=figure.ax[0], p=p), axis=0)
    figure.shrink()

    # Plot legend
    # if group:
    figure.makeLegend(figure.ax[0], ugColors, levels)

    # Add figure to PDF and close the figure afterwards
    figure.addToPdf(pdf)
def volcano(combo, results, oname, cutoff=2):
    """
    Plot volcano plots.

    Creates one volcano plot per pairwise comparison found in the results
    table: mean difference on x, -log10(p-value) on y, with points beyond
    the significance cutoff drawn in red and a horizontal cutoff line.

    :Arguments:
        :type combo: dictionary
        :param combo: A dictionary of dictionaries with all possible
            pairwise combinations. Used this to create the various column
            headers in the results table.

        :type results: pandas DataFrame
        :param results: TODO

        :type oname: string
        :param oname: Name of the output file in pdf format.

        :type cutoff: int
        :param cutoff: The cutoff value for significance.

    :Returns:
        :rtype: PD
        :returns: Outputs a pdf file containing all plots.
    """
    # Index the -log10 p-value and mean-difference columns by comparison key.
    neg_log_pvals = {}
    mean_diffs = {}
    for column in results.columns.tolist():
        comparison_key = column.split("_")[-1]
        if column.startswith("-log10_p-value_"):
            neg_log_pvals[comparison_key] = results[column]
        elif column.startswith("diff_of"):
            mean_diffs[comparison_key] = results[column]

    # One page per comparison, in sorted key order.
    with PdfPages(oname) as pdf:
        for comparison_key in sorted(mean_diffs.keys()):
            pvals_series = neg_log_pvals[comparison_key]
            diffs_series = mean_diffs[comparison_key]

            # Fresh figure for this comparison.
            fig = figureHandler(proj="2d")

            # All points in blue first.
            scatter.scatter2D(x=list(diffs_series),
                              y=list(pvals_series),
                              colorList=list('b'),
                              ax=fig.ax[0])

            # Overdraw significant points (beyond the cutoff) in red.
            significant = pvals_series[pvals_series > cutoff]
            if not significant.empty:
                scatter.scatter2D(x=list(diffs_series[significant.index]),
                                  y=list(significant),
                                  colorList=list('r'),
                                  ax=fig.ax[0])

            # Horizontal significance cutoff line.
            lines.drawCutoffHoriz(y=cutoff, ax=fig.ax[0])

            # Axis titles and formatting.
            fig.formatAxis(
                axTitle=comparison_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment = {0}".format(
                    comparison_key),
                xTitle="Diff of treatment = {0}".format(comparison_key))

            # Append the finished page to the PDF.
            fig.addToPdf(pdfPages=pdf)
def main(args):
    """
    Run pairwise two-sample permutation t-tests for all pairs of group
    levels, write summary and flag tables, and draw pairwise volcano plots.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed command-line options. Uses: input, design,
            uniqueID, group, reps, summaries, flags, volcano.
            NOTE(review): args.mu is not used here; group level names and
            args.reps drive the analysis. Group level names are
            concatenated into column names, so they are presumably strings.
    """
    # Loading data through the Interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID,
                       group=args.group, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data
    dat.dropMissing()

    # Unpaired permuted t-test. In this case there can be as many groups as
    # possible. Order variable is ignored and t-tests are performed pairwise
    # for each pair of groups.
    logger.info("Unpaired t-test will be performed for all groups pairwise.")

    # Getting the unique pairs and all pairwise combinations to feed to
    # pairwise unpaired t-tests.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)

    # Extracting data from the interface.
    data_frame = dat.transpose()
    # Extracting number of features. We subtract 1 since args.group adds an
    # extra column to the data frame.
    number_of_features = data_frame.shape[1] - 1

    # Computing overall summaries (mean and variance).
    # This part just produces summary statistics for the output table.
    # PERF: the drop/transpose was previously recomputed on every loop
    # iteration although it is loop-invariant; it is now computed once.
    data_frame_manipulate_transpose = data_frame.drop(args.group, 1).transpose()
    # Pulling indexes list from the transposed data frame.
    indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features
    for j in range(0, number_of_features):
        # Computing dataset summaries for feature j.
        feature_series = data_frame_manipulate_transpose.loc[indexes_list_complete[j]]
        mean_value_all[j] = np.mean(feature_series)
        variance_value_all[j] = np.var(feature_series, ddof=1)

    # Creating the table and putting the results there.
    summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                              index=indexes_list_complete)
    summary_df['SampleVariance'] = variance_value_all

    # Computing means for each group and outputting them.
    # This part just produces summary statistics for the output table.
    for i in range(0, number_of_unique_groups):
        # Extracting the pieces of the data frame that belong to the ith group.
        data_frame_current_group = data_frame.loc[data_frame[args.group].isin(
            [group_values_series_unique[i]])]
        # Dropping the group column; only feature columns remain. We also
        # transpose so features are rows.
        data_frame_current_group = data_frame_current_group.drop(
            args.group, 1).transpose()
        # Pulling indexes list from the current group.
        indexes_list = data_frame_current_group.index.tolist()

        # Creating array of means for the current group.
        means_value = [0] * number_of_features
        for j in range(0, number_of_features):
            means_value[j] = data_frame_current_group.loc[indexes_list[j]].mean()

        # Adding current mean_value column to the data frame.
        means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
        summary_df[means_value_column_name_current] = means_value

    # Running pairwise unpaired (two-sample) permutation t-tests for all
    # pairs of group levels saved in groups_pairwise.
    for i in range(0, number_of_groups_pairwise):
        # Extracting the pieces of the data frame that belong to the groups
        # in the i-th unique pair.
        groups_subset = groups_pairwise[i]
        data_frame_first_group = data_frame.loc[data_frame[args.group].isin(
            [groups_subset[0]])]
        data_frame_second_group = data_frame.loc[data_frame[args.group].isin(
            [groups_subset[1]])]
        # Dropping the group column and transposing, as above.
        data_frame_first_group = data_frame_first_group.drop(
            args.group, 1).transpose()
        data_frame_second_group = data_frame_second_group.drop(
            args.group, 1).transpose()

        # Pulling indexes list from the first one (they are the same).
        indexes_list = data_frame_first_group.index.tolist()

        # Per-feature result holders, zero-filled.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        difference_value = [0] * number_of_features

        for j in range(0, number_of_features):
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]
            # BUGFIX: two_sample was previously invoked twice per feature
            # (once for the p-value, once for the t statistic), doubling the
            # Monte-Carlo permutation cost and — with seed=None — deriving p
            # and t from two independent permutation draws. One call yields
            # both values.
            two_sample_result = two_sample(series_first, series_second,
                                           reps=int(args.reps), stat='t',
                                           alternative='two-sided', seed=None)
            p_value[j] = two_sample_result[0]
            t_value[j] = two_sample_result[1]
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()
            # Significance flags at the three conventional alpha levels.
            if p_value[j] < 0.01:
                flag_value_0p01[j] = 1
            if p_value[j] < 0.05:
                flag_value_0p05[j] = 1
            if p_value[j] < 0.10:
                flag_value_0p10[j] = 1

        # Creating column names for the data frame.
        p_value_column_name_current = 'perm_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
        t_value_column_name_current = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
        difference_value_column_name_current = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1]

        # Adding current p_value and flag_value columns to the data frames.
        # If the flag data frame has not been created yet (i == 0) we create
        # it on the fly.
        if i == 0:
            flag_df = pd.DataFrame(data=flag_value_0p01,
                                   columns=[flag_value_column_name_current_0p01],
                                   index=indexes_list)
        else:
            flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

        # At this point the data frames exist so only columns are added.
        summary_df[p_value_column_name_current] = p_value
        summary_df[t_value_column_name_current] = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results up to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for volcano plots.
    # Getting data for lpvals (negative log10 p-values keyed by pair).
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}
    # Getting data for diffs (mean differences keyed by pair).
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set up figure
            volcanoPlot = figureHandler(proj="2d")

            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                xTitle="Difference of treatment means for {0}".format(current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finishing t-test run.")
def main(args):
    """Run overall and pairwise Kruskal-Wallis tests on a wide dataset.

    Loads the wide data and design files through the ``wideToDesign``
    interface, performs an overall Kruskal-Wallis test across all group
    levels for every feature, computes per-group means, then performs a
    pairwise Kruskal-Wallis test for every pair of group levels.
    Results are written to two TSV tables and one volcano plot per
    group pair is rendered into a single PDF.

    :Arguments:
        :type args: argparse.Namespace
        :param args: Parsed command-line arguments.  Uses ``input``,
            ``design``, ``uniqueID``, ``group`` (inputs) and
            ``summaries``, ``flags``, ``volcano`` (output paths).

    :Return:
        :rtype: None
        :return: Side effects only: writes ``args.summaries`` and
            ``args.flags`` (TSV) and ``args.volcano`` (PDF).
    """
    # Load data through the project interface wrapper.
    dat = wideToDesign(args.input, args.design, args.uniqueID,
                       group=args.group, logger=logger)

    # Treat everything as numeric.
    dat.wide = dat.wide.applymap(float)

    # Clean missing data before testing.
    dat.dropMissing()

    # Unique group levels and all pairwise combinations that will be
    # fed to the Kruskal-Wallis tests.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)

    # Samples x (features + group column) table from the interface.
    data_frame = dat.transpose()
    # Number of features: every column except the group column.
    number_of_features = data_frame.shape[1] - 1

    # ---- Overall Kruskal-Wallis across all group levels -----------------

    # Per-feature result containers for the overall test at three
    # significance levels.
    p_value_all = [0] * number_of_features
    H_value_all = [0] * number_of_features
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features
    flag_value_all_0p01 = [0] * number_of_features
    flag_value_all_0p05 = [0] * number_of_features
    flag_value_all_0p10 = [0] * number_of_features

    # The feature-by-sample table and the per-group slices do not depend
    # on the feature index, so build them once here instead of once per
    # feature (the original rebuilt all of them inside the j loop).
    data_frame_manipulate_transpose = data_frame.drop(
        args.group, axis=1).transpose()
    indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

    # One feature-by-sample frame per group level; all of them share the
    # same feature index as the complete frame.
    groups_frames = []
    for i in range(0, number_of_unique_groups):
        current_rows = data_frame.loc[
            data_frame[args.group].isin([group_values_series_unique[i]])]
        groups_frames.append(current_rows.drop(args.group, axis=1).transpose())

    for j in range(0, number_of_features):
        feature_row = data_frame_manipulate_transpose.loc[
            indexes_list_complete[j]]

        # Dataset-wide summaries for feature j.
        mean_value_all[j] = np.mean(feature_row)
        variance_value_all[j] = np.var(feature_row, ddof=1)

        # One series of values per group level for feature j.
        series_total = [group_frame.loc[indexes_list_complete[j]]
                        for group_frame in groups_frames]

        if len(np.unique(feature_row.tolist())) == 1:
            # A constant feature cannot be tested: record NaN and leave
            # all flags at 0.  (The original compared NaN against the
            # significance levels here, which can never be true.)
            p_value_all[j] = float("nan")
            H_value_all[j] = float("nan")
        else:
            # Single call instead of the original two identical calls.
            H_statistic, p_statistic = kruskalwallis(*series_total)
            H_value_all[j] = H_statistic
            p_value_all[j] = p_statistic
            if p_value_all[j] < 0.01:
                flag_value_all_0p01[j] = 1
            if p_value_all[j] < 0.05:
                flag_value_all_0p05[j] = 1
            if p_value_all[j] < 0.10:
                flag_value_all_0p10[j] = 1

    # The feature loop is finished: collect the overall results into
    # data frames.  Pairwise results are appended later.
    summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                              index=indexes_list_complete)
    summary_df['SampleVariance'] = variance_value_all
    summary_df['H_value_for_all'] = H_value_all
    summary_df['prob_greater_than_H_for_all'] = p_value_all

    flag_df = pd.DataFrame(data=flag_value_all_0p01,
                           columns=["flag_significant_0p01_on_all_groups"],
                           index=indexes_list_complete)
    flag_df["flag_significant_0p05_on_all_groups"] = flag_value_all_0p05
    flag_df["flag_significant_0p10_on_all_groups"] = flag_value_all_0p10

    logger.info(
        u"Kruscal-Wallis test for all groups together has been performed.")

    # ---- Per-group means (summary statistics only; no testing) ----------
    for i in range(0, number_of_unique_groups):
        data_frame_current_group = groups_frames[i]
        indexes_list = data_frame_current_group.index.tolist()

        means_value = [0] * number_of_features
        for j in range(0, number_of_features):
            means_value[j] = data_frame_current_group.loc[
                indexes_list[j]].mean()

        means_value_column_name_current = (
            'mean_treatment_' + group_values_series_unique[i])
        summary_df[means_value_column_name_current] = means_value

    # ---- Pairwise Kruskal-Wallis for every pair of group levels ---------
    for i in range(0, number_of_groups_pairwise):
        groups_subset = groups_pairwise[i]
        # Suffix shared by every column produced for this pair.
        pair_suffix = groups_subset[0] + '_' + groups_subset[1]

        data_frame_first_group = data_frame.loc[
            data_frame[args.group].isin([groups_subset[0]])]
        data_frame_second_group = data_frame.loc[
            data_frame[args.group].isin([groups_subset[1]])]

        # Keep only feature columns and transpose to feature-by-sample.
        data_frame_first_group = data_frame_first_group.drop(
            args.group, axis=1).transpose()
        data_frame_second_group = data_frame_second_group.drop(
            args.group, axis=1).transpose()

        # Both frames share the same feature index; pull it from the first.
        indexes_list = data_frame_first_group.index.tolist()

        # Per-feature result containers for this pair.
        p_value = [0] * number_of_features
        H_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        difference_value = [0] * number_of_features

        for j in range(0, number_of_features):
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # A feature that is constant across both groups is untestable.
            combined_list = series_first.tolist() + series_second.tolist()
            if len(np.unique(combined_list)) == 1:
                p_value[j] = float("nan")
                H_value[j] = float("nan")
            else:
                # Single call instead of the original two identical calls.
                H_statistic, p_statistic = kruskalwallis(series_first,
                                                         series_second)
                H_value[j] = H_statistic
                p_value[j] = p_statistic
                if p_value[j] < 0.01:
                    flag_value_0p01[j] = 1
                if p_value[j] < 0.05:
                    flag_value_0p05[j] = 1
                if p_value[j] < 0.10:
                    flag_value_0p10[j] = 1

            # Derived quantities were computed in both branches of the
            # original; NaN simply propagates through -log10 for
            # untestable features.
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()

        # Append this pair's columns to the running result frames.
        summary_df['prob_greater_than_H_for_diff_' + pair_suffix] = p_value
        summary_df['H_value_for_diff_' + pair_suffix] = H_value
        summary_df['neg_log10_p_value_' + pair_suffix] = neg_log10_p_value
        summary_df['diff_of_' + pair_suffix] = difference_value

        flag_df['flag_significant_0p01_on_' + pair_suffix] = flag_value_0p01
        flag_df['flag_significant_0p05_on_' + pair_suffix] = flag_value_0p05
        flag_df['flag_significant_0p10_on_' + pair_suffix] = flag_value_0p10

    # Round the results to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Restore the original unique-ID column name on both outputs.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summaries and flags as tab-separated files.
    summary_df.to_csv(args.summaries, sep="\t")
    flag_df.to_csv(args.flags, sep="\t")

    logger.info(
        u"Kruscal-Wallis test for all groups pairwise has been performed.")

    # ---- Volcano plots, one per group pair ------------------------------

    # Index the -log10(p) and difference columns by their pair suffix.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # Significance cutoff on the -log10(p-value) axis.
    cutoff = 2

    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # One figure per group pair.
            volcanoPlot = figureHandler(proj="2d")
            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot every feature in blue.
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Re-plot features beyond the threshold in red.
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Horizontal line marking the significance cutoff.
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis titles and add the figure to the PDF.
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".
                format(current_key),
                xTitle="Difference of treatment means for {0}".format(
                    current_key))
            volcanoPlot.addToPdf(pdfPages=pdf)

    logger.info(u"Pairwise volcano plots have been created.")

    logger.info(u"Finishing running of Kruscal-Wallis tests.")