Example #1
def plotCutoffs(cut_S, ax, p):
    """
    Plot the cutoff lines on each plot

    :Arguments:
        :type cut_S: pandas.Series
        :param cut_S: contains a cutoff value, name and color

        :type ax: matplotlib.axes._subplots.AxesSubplot
        :param ax: Axis to draw the cutoff line on.

        :type p: float
        :param p: percentile of cutoff
    """
    lines.drawCutoffHoriz(ax=ax,
                          y=float(cut_S.values[0]),
                          cl=cutPalette.ugColors[cut_S.name],
                          lb="{0} {1}% Threshold: {2}".format(
                              cut_S.name, round(p * 100, 3),
                              round(float(cut_S.values[0]), 1)),
                          ls="--",
                          lw=2)
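
A minimal usage sketch for plotCutoffs, not taken from the original source: it assumes the module-level lines and cutPalette objects are importable and that cutPalette.ugColors contains the Series name used below; the cutoff value and percentile are illustrative.

import pandas as pd
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
# One cutoff value; the Series .name must be a key of cutPalette.ugColors.
cut_S = pd.Series([3.1], name="p-value")
plotCutoffs(cut_S, ax=ax, p=0.95)  # draws a dashed "p-value 95.0% Threshold: 3.1" line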
Example #2
def qqPlot(tresid, tfit, oname):
    """ 
    Plot the residual diagnostic plots by sample.

    Output q-q plots, boxplots and distributions of the residuals. These plots
    will be used to diagnose whether residuals are approximately normal.

    :Arguments:
        :type tresid: pandas.DataFrame
        :param tresid: Pearson normalized residuals. (transposed)
                        (residuals / sqrt(MSE))

        :type tfit: pandas.DataFrame
        :param tfit: output of the ANOVA (transposed)

        :type oname: string
        :param oname: Name of the output file in pdf format.

    :Returns:
        :rtype: PDF
        :returns: Outputs a pdf file containing all plots.

    """
    #Open pdf
    with PdfPages(oname) as pdf:

        # Establishing the axisLayout
        axisLayout = [(0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 1, 1), (1, 0, 3, 1)]

        # Start plotting
        for col in tresid.columns:
            #Creating figure
            fig = figureHandler(proj='2d',
                                numAx=4,
                                numRow=2,
                                numCol=3,
                                arrangement=axisLayout)

            data = tresid[col].values.ravel()
            noColors = ['b'] * len(data)  # blue; not used further below
            df_data = pd.DataFrame(data)

            # Removing missing values so the data will plot correctly.
            mask_nan_data = np.isnan(data)
            data = data[~mask_nan_data]

            # Plot qqplot on axis 0
            sm.graphics.qqplot(data, fit=True, line='r', ax=fig.ax[0])

            # Plot boxplot on axis 1
            box.boxSeries(ser=data, ax=fig.ax[1])

            # Plot histogram on axis 2
            hist.quickHist(ax=fig.ax[2], dat=df_data, orientation='horizontal')

            # Plot scatterplot on axis 3
            scatter.scatter2D(ax=fig.ax[3],
                              x=tfit[col],
                              y=tresid[col],
                              colorList=list('b'))

            # Draw cutoff line for scatterplot on axis 3
            lines.drawCutoffHoriz(ax=fig.ax[3], y=0)

            # Format axis 0
            fig.formatAxis(figTitle=col,
                           axnum=0,
                           grid=False,
                           showX=True,
                           yTitle="Sample Quantiles",
                           xTitle=" ")

            # Format axis 1
            fig.formatAxis(axnum=1,
                           axTitle="Standardized Residuals",
                           grid=False,
                           showX=False,
                           showY=True,
                           xTitle=" ")

            # Format axis 2
            fig.formatAxis(axnum=2,
                           grid=False,
                           showX=True,
                           showY=True,
                           axTitle=" ",
                           xTitle=" ")

            # Format axis 3
            fig.formatAxis(axnum=3,
                           axTitle="Predicted Values vs Residual Values",
                           xTitle="Predicted Values",
                           yTitle="Residual Values",
                           grid=False)

            #Add figure to pdf
            fig.addToPdf(pdfPages=pdf)
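
A hypothetical usage sketch for qqPlot, assuming the helper modules used above (figureHandler, box, hist, scatter, lines) are importable; the synthetic residuals, fitted values, and file name are illustrative only.

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
# Two samples ("s1", "s2") with 50 features each; residuals should be roughly N(0, 1).
tresid = pd.DataFrame(rng.standard_normal((50, 2)), columns=["s1", "s2"])
tfit = pd.DataFrame(rng.standard_normal((50, 2)), columns=["s1", "s2"])
qqPlot(tresid, tfit, "residual_diagnostics.pdf")  # one page of diagnostics per sample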
Example #3
def main(args):

    # Loading data through the Interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID, group = args.group, 
                        runOrder=args.order, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)
    
    # Cleaning from missing data
    dat.dropMissing()


    # SCENARIO 1: Unpaired t-test. In this case any number of groups is allowed.
    # The order variable is ignored and t-tests are performed pairwise for each pair of groups.

    if args.pairing == "unpaired":
       logger.info("Unpaired t-test will be performed for all groups pairwise.")
  

       # Getting the unique groups and all their pairwise combinations
       # so that we can feed them to pairwise unpaired t-tests.
       group_values_series = dat.transpose()[dat.group].T.squeeze()
       group_values_series_unique = group_values_series.unique()
       number_of_unique_groups = group_values_series_unique.shape[0]
       groups_pairwise = list(combinations(group_values_series_unique,2) ) 
       number_of_groups_pairwise = len(groups_pairwise)

       # Extracting data from the interface.
       data_frame = dat.transpose()
       # Extracting the number of features. This depends on whether the user has provided the ordering variable.
       # The ordering variable is unused in the unpaired test; it just adds an extra column to the data frame.
       if args.order == False:
          number_of_features = data_frame.shape[1] - 1
       else:
          number_of_features = data_frame.shape[1] - 2
       # Saving treatment group name from the arguments.



       # Computing overall summaries (mean and variance).
       # This part just produces summary statistics for the output table.
       # This has nothing to do with unpaired t-test. This is just summary for the table.
       mean_value_all = [0] * number_of_features
       variance_value_all = [0] * number_of_features

       for j in range(0, number_of_features ):
  

           # Creating duplicate for manipulation.
           data_frame_manipulate = data_frame

           # Dropping columns that characterize group. Only feature columns will remain.
           # We also transpose here so it will be easier to operate with.
           # We drop either 1 or 2 columns depending on whether the ordering variable was provided.
           if args.order == False:
              data_frame_manipulate_transpose  = data_frame_manipulate.drop(  args.group, 1 ).transpose()
           else:
              data_frame_manipulate_transpose  = data_frame_manipulate.drop(  [args.group, args.order], 1 ).transpose()
           # Pulling indexes list from the current data frame.
           indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

           # Computing dataset summaries.
           mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ]) 
           variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ], ddof = 1)



       # Creating the table and putting the results there.
       summary_df     =  pd.DataFrame(data = mean_value_all, columns = ["GrandMean"], index = indexes_list_complete )    
       summary_df['SampleVariance'] =  variance_value_all


       # Computing means for each group and outputting them.
       # This part just produces summary statistics for the output table.
       # This has nothing to do with unpaired t-test. This is just summary for the table.

       for i in range(0, number_of_unique_groups ):
        

           # Extracting the pieces of the data frame that belong to the ith group.
           data_frame_current_group  = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]]  )]

           # Dropping columns that characterize group. Only feature columns will remain.
           # We also transpose here so it will be easier to operate with.
           # We drop either 1 or 2 columns depending on whether the ordering variable was provided.
           if args.order == False:
              data_frame_current_group  = data_frame_current_group.drop(  args.group, 1 ).transpose()
           else:
              data_frame_current_group  = data_frame_current_group.drop(  [args.group, args.order], 1 ).transpose()

           # Pulling indexes list from the current group.
           indexes_list = data_frame_current_group.index.tolist()

           # Creating array of means for the current group that will be filled.
           means_value  = [0] * number_of_features
    
           for j in range(0, number_of_features ):
  
               series_current = data_frame_current_group.loc[ indexes_list[j] ] 
               means_value[j] = series_current.mean()


           # Adding current mean_value column to the data frame and assigning the name.
           means_value_column_name_current  = 'mean_treatment_' + group_values_series_unique[i] 
           summary_df[means_value_column_name_current] = means_value
           
           


       # Running pairwise unpaired (two-sample) t-test for all pairs of group levels that are saved in groups_pairwise.
       for i in range(0, number_of_groups_pairwise ):
        
           # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair.
           groups_subset = groups_pairwise[i] 
           data_frame_first_group  = data_frame.loc[data_frame[args.group].isin( [groups_subset[0]]  )]
           data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [groups_subset[1]]  )]

           # Dropping columns that characterize group. Only feature columns will remain.
           # We also transpose here so it will be easier to operate with.
           # We drop either 1 or 2 columns depending on whether the ordering variable was provided.
           if args.order == False:
              data_frame_first_group  = data_frame_first_group.drop(  args.group, 1 ).transpose()
              data_frame_second_group = data_frame_second_group.drop( args.group, 1 ).transpose()
           else:
              data_frame_first_group  = data_frame_first_group.drop(  [args.group, args.order], 1 ).transpose()
              data_frame_second_group = data_frame_second_group.drop( [args.group, args.order], 1 ).transpose()

           # Pulling indexes list from the first one (they are the same)
           indexes_list = data_frame_first_group.index.tolist()

           # Creating p_value, neg_log10_p_value, flag_value, and difference_value lists filled with zeros.
           p_value           = [0] * number_of_features
           t_value           = [0] * number_of_features
           neg_log10_p_value = [0] * number_of_features
           flag_value_0p01   = [0] * number_of_features
           flag_value_0p05   = [0] * number_of_features
           flag_value_0p10   = [0] * number_of_features
           difference_value  = [0] * number_of_features


           for j in range(0, number_of_features ):
       
               series_first  = data_frame_first_group.loc[ indexes_list[j] ] 
               series_second = data_frame_second_group.loc[ indexes_list[j] ]

               # Running the test once and unpacking both the t statistic and the p-value.
               t_value[j], p_value[j] = ttest_ind(series_first, series_second)
               neg_log10_p_value[j] = - np.log10(p_value[j])
               difference_value[j] = series_first.mean() - series_second.mean()
               if p_value[j] < 0.01: flag_value_0p01[j] = 1
               if p_value[j] < 0.05: flag_value_0p05[j] = 1
               if p_value[j] < 0.10: flag_value_0p10[j] = 1


           # Creating column names for the data frame.
           p_value_column_name_current           = 'prob_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
           t_value_column_name_current           = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
           neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
           difference_value_column_name_current  = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
           flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1] 
           flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1] 
           flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1] 

           # Adding current p_value and flag_value column to the data frame and assigning the name.
           # If the data frame has not been created yet we create it on the fly. i.e. if i == 0 create it.
           if i == 0:
              flag_df     =  pd.DataFrame(data = flag_value_0p01, columns = [flag_value_column_name_current_0p01], index = indexes_list )    
           else:
              flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

           # At this point data frame exists so only columns are added to the existing data frame.
           summary_df[p_value_column_name_current]           = p_value
           summary_df[t_value_column_name_current]           = t_value
           summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
           summary_df[difference_value_column_name_current]  = difference_value
           flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
           flag_df[flag_value_column_name_current_0p10] = flag_value_0p10
  


    # SCENARIO 2: Paired t-test. In this case there should be EXACTLY TWO groups.
    # Each sample in one group should have exactly one matching pair in the other group.
    # The matching is controlled by args.order variable.

    if args.pairing == "paired":
       logger.info("Paired test will be performed for two groups pairwise based on pairing variable: {0}.".format(args.order))


       # Getting the number of unique groups. If it is bigger than 2 return the warning and exit.
       group_values_series = dat.transpose()[dat.group].T.squeeze()
       group_values_series_unique = group_values_series.unique()
       number_of_unique_groups = group_values_series_unique.shape[0]
       if number_of_unique_groups != 2:
          logger.warning(u"The number of unique groups is {0} and not 2 as expected. The paired t-test cannot be performed.".format(number_of_unique_groups) )
          exit()	
 
       # This piece of code will be executed only if the number_of_unique_groups is exactly 2 so the group check is passed. 

       # Creating pairwise combination of our two groups that we will use in the future.
       groups_pairwise = list( combinations(group_values_series_unique,2) ) 
       number_of_groups_pairwise = len(groups_pairwise)

       # Extracting data from the interface.
       data_frame = dat.transpose()
       # Extracting number of features. This will depend on whether the user has provided ordering variable or not.
       # Checking that the required pairing variable has been provided.
       if args.order == False:
          logger.info("The required t-test pairing variable has not been provided: The paired t-test cannot be performed.")
          exit()	


       # This piece of code will be executed only if the args.order has been provided and the check is passed. 

       # Defining the number of features. It is the data frame dimension minus the 2 columns that hold args.group and args.order.
       number_of_features = data_frame.shape[1] - 2

       # At this point it is confirmed that there are only 2 groups and that the pairing variable args.order has been provided.
       # Now we need to check that pairing is correct i.e. that each pairID corresponds to only two samples from different groups.

       # Getting the unique pair IDs and deleting those that do not correspond to exactly two samples.
       pairid_values_series = dat.transpose()[dat.runOrder].T.squeeze()
       pairid_values_series_unique = pairid_values_series.unique()
       number_of_unique_pairid = pairid_values_series_unique.shape[0]


       # Extracting data from the interface.
       data_frame = dat.transpose()
  
       # Extracting the number of samples in the final frame.
       number_of_samples = data_frame.shape[0]


       # Performing the cleaning of the original data. We are removing samples that are not paired or that do not belong to the two groups.
       # If a pairID has 1, or 3 or more, matching samples, those samples are removed with a warning.
       # If a pairID corresponds to exactly two samples (which is correct) but the groupIDs are NOT different, those values are also removed.
       for i in range(0, number_of_unique_pairid ):
       
           # Extracting the pieces of the data frame that belong to ith unique pairid.
           data_frame_current_pairid = data_frame.loc[data_frame[args.order].isin( [ pairid_values_series_unique[i] ]  )]

           # We transpose here so it will be easier to operate with.
           data_frame_current_pairid  = data_frame_current_pairid.transpose()
           sample_names_current_pairid = list(data_frame_current_pairid.columns.values)
       
           if data_frame_current_pairid.shape[1] != 2:

              # Warning that the samples for this pairID will be removed.
              logger.warning(u"Number of samples for the pairID: {0} is equal to {1} and NOT equal to 2. Sample(s) {2} will be removed from further analysis.".format(pairid_values_series_unique[i],
                               data_frame_current_pairid.shape[1], sample_names_current_pairid)  )

              # Getting indexes we are trying to delete.
              boolean_indexes_to_delete = data_frame.index.isin( sample_names_current_pairid )  
              # Deleting the indexes and in the for loop going to next iteration.
              data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)
    
           # This piece is executed if the number is correct, i.e. data_frame_current_pairid.shape[1] == 2.
           # Here we are checking whether the groupIDs for the given pair are indeed different.

           elif data_frame_current_pairid.transpose()[args.group][0] == data_frame_current_pairid.transpose()[args.group][1]:

                logger.warning(u"Samples in pairID {0} have groupIDs: {1} and {2}. Should be different! Sample(s) {3} will be removed from further analysis.".format(pairid_values_series_unique[i],       		                         data_frame_current_pairid.transpose()[args.group][1], data_frame_current_pairid.transpose()[args.group][0], sample_names_current_pairid)  )
                   
                # Getting indexes we are trying to delete.
                boolean_indexes_to_delete = data_frame.index.isin( sample_names_current_pairid )  
                # Deleting the indexes.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)


        
       # Checking if the data frame became empty after cleaning.
       if data_frame.shape[0] == 0:
          logger.warning(u"Number of paired samples in the final dataset is exactly 0! Please check the desing file for accuracy! Exiting the program."  )
          exit()	
   


       # Computing overall summaries (mean and variance).
       # This part just produces summary statistics for the output table.
       # This has nothing to do with paired t-test. This is just summary for the table.
       mean_value_all = [0] * number_of_features
       variance_value_all = [0] * number_of_features

       for j in range(0, number_of_features ):

           # Creating duplicate for manipulation.
           data_frame_manipulate = data_frame

           # Dropping columns that characterize group. Only feature columns will remain.
           # We also transpose here so it will be easier to operate with.
           data_frame_manipulate_transpose  = data_frame_manipulate.drop( [args.group,args.order], 1 ).transpose()
           # Pulling indexes list from the current data frame.
           indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

           # Computing dataset summaries.
           mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ]) 
           variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ], ddof = 1)



       # Creating the table and putting the results there.
       summary_df     =  pd.DataFrame(data = mean_value_all, columns = ["GrandMean"], index = indexes_list_complete )    
       summary_df['SampleVariance'] =  variance_value_all


       # Computing means for each group and outputting them.
       # This part just produces summary statistics for the output table.
       # This has nothing to do with paired t-test. This is just summary for the table.

       for i in range(0, number_of_unique_groups ):
        

           # Extracting the pieces of the data frame that belong to the ith group.
           data_frame_current_group  = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]]  )]

           # Dropping columns that characterize group. Only feature columns will remain.
           data_frame_current_group  = data_frame_current_group.drop(  [args.group, args.order], 1 ).transpose()

           # Pulling indexes list from the current group.
           indexes_list = data_frame_current_group.index.tolist()

           # Creating array of means for the current group that will be filled.
           means_value  = [0] * number_of_features
    
           for j in range(0, number_of_features ):
  
               series_current = data_frame_current_group.loc[ indexes_list[j] ] 
               means_value[j] = series_current.mean()


           # Adding current mean_value column to the data frame and assigning the name.
           means_value_column_name_current  = 'mean_treatment_' + group_values_series_unique[i] 
           summary_df[means_value_column_name_current] = means_value




       # Performing paired t-test for the two groups and saving the results.

       # Creating p_value and flag_value lists of zeros with length number_of_features.
       # These will be used for the two groups in the paired t-test.
       p_value = [0] * number_of_features
       t_value = [0] * number_of_features
       flag_value_0p01   = [0] * number_of_features
       flag_value_0p05   = [0] * number_of_features
       flag_value_0p10   = [0] * number_of_features
       neg_log10_p_value = [0] * number_of_features
       difference_value  = [0] * number_of_features

       # Performing paired t-test for each pair of features.
       for j in range(0, number_of_features ):

  
           # Extracting the pieces of the data frame that belong to the 1st and 2nd groups.
           data_frame_first_group  = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[0]]  )]
           data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[1]]  )]

        
           # Sorting both data frames by the pairing variable args.order.
           # This ensures the datasets are aligned by pair when fed to the t-test.
           data_frame_first_group  = data_frame_first_group.sort_values(args.order)
           data_frame_second_group = data_frame_second_group.sort_values(args.order)


           # Dropping the grouping and pairing columns and transposing.
           data_frame_first_group  = data_frame_first_group.drop(  [args.group,args.order], 1 ).transpose()
           data_frame_second_group = data_frame_second_group.drop( [args.group,args.order], 1 ).transpose()
         
           # Pulling list of indexes. This is the same list for the first and for the second.
           indexes_list = data_frame_first_group.index.tolist()

           # Pulling the samples out
           series_first  = data_frame_first_group.loc[ indexes_list[j] ] 
           series_second = data_frame_second_group.loc[ indexes_list[j] ]


           # Running t-test for the two given samples
           paired_ttest_args = [series_first, series_second]
           p_value[j] = ttest_rel( *paired_ttest_args )[1]
           t_value[j] = ttest_rel( *paired_ttest_args )[0]
           neg_log10_p_value[j] = - np.log10(p_value[j])
           difference_value[j] = series_first.mean() - series_second.mean()
           if p_value[j] < 0.01: flag_value_0p01[j] = 1
           if p_value[j] < 0.05: flag_value_0p05[j] = 1
           if p_value[j] < 0.10: flag_value_0p10[j] = 1


       # The loop over features is finished by now. Converting the results into the data frame.


       # Creating column names for the data frame.
       p_value_column_name_current           = 'prob_greater_than_t_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
       t_value_column_name_current           = 't_value_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
       neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
       difference_value_column_name_current  = 'diff_of_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
       flag_value_column_name_current_0p01 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p01'
       flag_value_column_name_current_0p05 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p05'
       flag_value_column_name_current_0p10 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p10'


       summary_df[t_value_column_name_current] = t_value
       summary_df[p_value_column_name_current] = p_value
       summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
       summary_df[difference_value_column_name_current] = difference_value

       flag_df  =  pd.DataFrame(data = flag_value_0p01, columns = [flag_value_column_name_current_0p01], index = indexes_list )    
       flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
       flag_df[flag_value_column_name_current_0p10] = flag_value_0p10



   
    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding a name for the unique ID column that was there originally.
    summary_df.index.name    =  args.uniqueID
    flag_df.index.name =  args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")




    # Generating Indexing for volcano plots.

    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff=2
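    # (a -log10 p-value cutoff of 2 corresponds to p < 0.01)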



    # Making volcano plots
    with PdfPages( args.volcano ) as pdf:
         for i in range(0, number_of_groups_pairwise ):
             # Set Up Figure
             volcanoPlot = figureHandler(proj="2d")


             groups_subset = groups_pairwise[i] 
             current_key =  groups_subset[0] + '_' + groups_subset[1]
             
             # Plot all results
             scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]), 
                                colorList=list('b'), ax=volcanoPlot.ax[0])

             # Color results beyond threshold red
             cutLpvals = lpvals[current_key][lpvals[current_key]>cutoff]
             if not cutLpvals.empty:
                    cutDiff = difs[current_key][cutLpvals.index]
                    scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals), 
                                      colorList=list('r'), ax=volcanoPlot.ax[0])

             # Drawing cutoffs
             lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

             # Format axis (volcanoPlot)
             volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                 yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                 xTitle="Difference of treatment means for {0}".format(current_key))

             # Add figure to PDF
             volcanoPlot.addToPdf(pdfPages=pdf)
  
    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")



    # Ending script
    logger.info(u"Finishing running of t-test.")
Example #4
def main(args):
    # If the user provides a grouping variable we test each group against the null (mu supplied by the user; 0 is the default).
    if args.group != False:
        logger.info(
            u"""t-test will be performed for all groups saved in [{0}] variable in the desing file pairwise with the H_0: mu = {1}."""
            .format(args.group, args.mu))

        # Loading data through the Interface.
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input,
                           args.design,
                           args.uniqueID,
                           group=args.group,
                           logger=logger)

        # Treat everything as numeric.
        dat.wide = dat.wide.applymap(float)

        # Cleaning from the missing data.
        dat.dropMissing()

        # Getting the unique group values so that we can feed them to the t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting number of features. We subtract 1 since we have provided args.group
        number_of_features = data_frame.shape[1] - 1
        # Saving treatment group name from the arguments.

        # Computing overall summaries (mean and variance).
        # This part just produces summary statistics for the output table.
        # This has nothing to do with the single sample t-test.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Creating duplicate for manipulation.
            data_frame_manipulate = data_frame

            # Dropping columns that characterize group. Only feature columns will remain.
            # We also transpose here so it will be easier to operate with.
            data_frame_manipulate_transpose = data_frame_manipulate.drop(
                args.group, 1).transpose()
            # Pulling indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries for feature j.
            mean_value_all[j] = np.mean(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all,
                                  columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Running single sample t-test for all groups.
        # We are also computing means for each group and outputting them.
        for i in range(0, number_of_unique_groups):

            # Extracting the pieces of the data frame that belong to the ith group.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]

            # Dropping columns that characterize group. Only feature columns will remain.
            # We also transpose here so it will be easier to operate with.
            data_frame_current_group = data_frame_current_group.drop(
                args.group, 1).transpose()

            # Pulling indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating array of means for the current group that will be filled.
            # Creating p_value, difference_value, neg_log10_p_value, t_value, and flag_value lists filled with zeros.
            means_value = [0] * number_of_features
            difference_value = [0] * number_of_features
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features

            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

                # Performing one sample t-test
                ttest_1samp_args = [series_current, float(args.mu)]
                p_value[j] = ttest_1samp(*ttest_1samp_args)[1]
                t_value[j] = ttest_1samp(*ttest_1samp_args)[0]
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = means_value[j] - float(args.mu)
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

            # Creating names for the current analysis columns and adding result columns to the data frame.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            p_value_column_name_current = 'prob_greater_than_t_for_diff_' + group_values_series_unique[i] + '_' + args.mu
            t_value_column_name_current = 't_value_for_diff_' + group_values_series_unique[i] + '_' + args.mu
            neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[i] + '_' + args.mu
            difference_value_column_name_current = 'diff_of_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + group_values_series_unique[i] + '_' + args.mu

            # Adding flag_value column to the data frame and assigning the name.
            # If the data frame for flags has not been created yet we create it on the fly. i.e. if i == 0 create it.
            if i == 0:
                flag_df = pd.DataFrame(
                    data=flag_value_0p01,
                    columns=[flag_value_column_name_current_0p01],
                    index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point data frames (summary and flags) exist so only columns are added to the existing data frame.
            summary_df[means_value_column_name_current] = means_value
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # If the user does not provide a grouping variable we test the entire dataset as a single group against the null (mu supplied by the user; 0 is the default).
    if args.group == False:
        logger.info(
            u"""t-test will be performed for the entire dataset since goruping variable was not provided."""
        )

        # Loading data through the interface
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input,
                           args.design,
                           args.uniqueID,
                           logger=logger)

        # Treat everything as numeric
        dat.wide = dat.wide.applymap(float)

        # Cleaning from missing data
        dat.dropMissing()

        # Saving the number of unique groups that will be used for plotting.
        # Since we did not feed any grouping variable it is exactly one.
        number_of_unique_groups = 1

        # Extracting data from the interface.
        data_frame = dat.wide.transpose()
        # Extracting number of features. We do not subtract 1 since we have not provided args.group
        number_of_features = data_frame.shape[1]
        # Saving treatment group name from the arguments.

        # Computing overall summaries (mean and variance).
        # This part just produces summary statistics for the output table.
        # This has nothing to do with single sample t-test. This is just summary for the table.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features
        # Creating array of means for the current group that will be filled.
        # Creating p_value, neg_log10_p_value, flag_value, and difference_value lists filled with zeros.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features

        for j in range(0, number_of_features):
            # We transpose here so data will be easier to operate on.
            data_frame_manipulate_transpose = data_frame.transpose()
            # Pulling indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries.
            mean_value_all[j] = np.mean(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                ddof=1)

            # Performing one sample t-test for the entire dataset.
            ttest_1samp_args = [
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                float(args.mu)
            ]
            p_value[j] = ttest_1samp(*ttest_1samp_args)[1]
            t_value[j] = ttest_1samp(*ttest_1samp_args)[0]
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = mean_value_all[j] - float(args.mu)
            if p_value[j] < 0.01: flag_value_0p01[j] = 1
            if p_value[j] < 0.05: flag_value_0p05[j] = 1
            if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all,
                                  columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Creating names for the current analysis columns and adding result columns to the data frame.
        means_value_column_name_current = 'mean_treatment_all'
        p_value_column_name_current = 'prob_greater_than_t_for_diff_all_' + args.mu
        t_value_column_name_current = 't_value_for_diff_all_' + args.mu
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_all_' + args.mu
        difference_value_column_name_current = 'diff_of_all_' + args.mu
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_all_' + args.mu
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_all_' + args.mu
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_all_' + args.mu

        summary_df[means_value_column_name_current] = mean_value_all
        summary_df[p_value_column_name_current] = p_value
        summary_df[t_value_column_name_current] = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_df = pd.DataFrame(data=flag_value_0p01,
                               columns=[flag_value_column_name_current_0p01],
                               index=indexes_list_complete)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding a name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Generating Indexing for volcano plots.
    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
              if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
              if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_unique_groups):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            # If no grouping variable is provided.
            if number_of_unique_groups == 1:
                current_key = 'all_' + args.mu
            else:
                current_key = group_values_series_unique[i] + '_' + args.mu

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".
                format(current_key),
                xTitle="Difference of the means from H0 for {0}".format(
                    current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    logger.info(u"Volcano plots have been created.")
    logger.info(u"Finishing running of t-test.")
Example #5
def volcano(combo, results, oname, cutoff=2):
    """ 
    Plot volcano plots.

    Creates volcano plots to compare means, for all pairwise differences.

    :Arguments:

        :type combo: dictionary
        :param combo: A dictionary of dictionaries with all possible pairwise
            combinations. Used to create the various column headers in the
            results table.

        :type results: pandas DataFrame
        :param results: DataFrame of test results containing the
            "-log10_p-value_*" and "diff_of_*" columns used for plotting.

        :type oname: string
        :param oname: Name of the output file in pdf format.
       
        :type cutoff: int
        :param cutoff: The cutoff value for significance.

    :Returns:
        :rtype: PDF
        :returns: Outputs a pdf file containing all plots.

    """
    # Getting data for lpvals
    lpvals = {col.split("_")[-1]:results[col] for col in results.columns.tolist() \
            if col.startswith("-log10_p-value_")}

    # Getting data for diffs
    difs   = {col.split("_")[-1]:results[col] for col in results.columns.tolist() \
            if col.startswith("diff_of")}

    # Making plots
    with PdfPages(oname) as pdf:
        for key in sorted(difs.keys()):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            # Plot all results
            scatter.scatter2D(x=list(difs[key]),
                              y=list(lpvals[key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond threshold red
            cutLpvals = lpvals[key][lpvals[key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment = {0}".format(
                    key),
                xTitle="Diff of treatment = {0}".format(key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)
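
A hypothetical call to volcano, assuming a results table with the "-log10_p-value_<pair>" and "diff_of_<pair>" column naming that the function parses; the data and the combo keys are illustrative.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
results = pd.DataFrame({
    "-log10_p-value_AB": rng.uniform(0, 4, size=20),  # per-feature -log10 p-values
    "diff_of_AB": rng.standard_normal(20),            # per-feature mean differences
})
volcano(combo={"AB": {}}, results=results, oname="volcano_plots.pdf")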
Example #6
def main(args):

    # Loading data through the Interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID, group = args.group, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data
    dat.dropMissing()

    # Unpaired permutation t-test. In this case any number of groups is allowed.
    # Order variable is ignored and t-tests are performed pairwise for each pair of groups.

    logger.info("Unpaired t-test will be performed for all groups pairwise.")

    # Getting the unique pairs and all pairwise permutations to feed to pairwise unpaired t-tests.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique,2) ) 
    number_of_groups_pairwise = len(groups_pairwise)

    # Extracting data from the interface.
    data_frame = dat.transpose()
    # Extracting number of features.
    # The ordering variable is not used in the unpaired test; it would just add an extra column to the data frame.
    number_of_features = data_frame.shape[1] - 1

    # Saving treatment group name from the arguments.
    # Computing overall summaries (mean and variance).
    # This part just produces summary statistics for the output table.
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features

    for j in range(0, number_of_features ):
        # Creating duplicate for manipulation.
        data_frame_manipulate = data_frame

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        data_frame_manipulate_transpose  = data_frame_manipulate.drop(  args.group, 1 ).transpose()

        # Pulling indexes list from the current data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        # Computing dataset summaries.
        mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ]) 
        variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ], ddof = 1)

    # Creating the table and putting the results there.
    summary_df = pd.DataFrame(data = mean_value_all, columns = ["GrandMean"], index = indexes_list_complete )    
    summary_df['SampleVariance'] =  variance_value_all


    # Computing means for each group and outputting them.
    # This part just produces summary statistics for the output table.
    for i in range(0, number_of_unique_groups ):
       # Extracting the pieces of the data frame that belong to the ith group.
       data_frame_current_group  = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]]  )]

       # Dropping columns that characterize group. Only feature columns will remain.
       # We also transpose here so it will be easier to operate with.
       data_frame_current_group  = data_frame_current_group.drop(  args.group, 1 ).transpose()

       # Pulling indexes list from the current group.
       indexes_list = data_frame_current_group.index.tolist()

       # Creating array of means for the current group that will be filled.
       means_value  = [0] * number_of_features

       for j in range(0, number_of_features ):
           series_current = data_frame_current_group.loc[ indexes_list[j] ] 
           means_value[j] = series_current.mean()

       # Adding current mean_value column to the data frame and assigning the name.
       means_value_column_name_current  = 'mean_treatment_' + group_values_series_unique[i] 
       summary_df[means_value_column_name_current] = means_value


    # Running pairwise unpaired (two-sample) t-test for all pairs of group levels that are saved in groups_pairwise.
    for i in range(0, number_of_groups_pairwise ):
       # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair.
       groups_subset = groups_pairwise[i]
       data_frame_first_group  = data_frame.loc[data_frame[args.group].isin( [groups_subset[0]]  )]
       data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [groups_subset[1]]  )]

       # Dropping columns that characterize group. Only feature columns will remain.
       # We also transpose here so it will be easier to operate with.
       data_frame_first_group  = data_frame_first_group.drop(  args.group, 1 ).transpose()
       data_frame_second_group = data_frame_second_group.drop( args.group, 1 ).transpose()

       # Pulling indexes list from the first one (they are the same)
       indexes_list = data_frame_first_group.index.tolist()

       # Creating p_value, neg_log10_p_value, flag_value, and difference_value lists filled with zeros.
       p_value           = [0] * number_of_features
       t_value           = [0] * number_of_features
       neg_log10_p_value = [0] * number_of_features
       flag_value_0p01   = [0] * number_of_features
       flag_value_0p05   = [0] * number_of_features
       flag_value_0p10   = [0] * number_of_features
       difference_value  = [0] * number_of_features

       for j in range(0, number_of_features ):
           series_first  = data_frame_first_group.loc[ indexes_list[j] ]
           series_second = data_frame_second_group.loc[ indexes_list[j] ]

           # Running the permutation test once and unpacking the p-value and the t statistic.
           p_value[j], t_value[j] = two_sample(series_first, series_second, reps=int(args.reps), stat='t', alternative='two-sided', seed=None)

           neg_log10_p_value[j] = - np.log10(p_value[j])
           difference_value[j] = series_first.mean() - series_second.mean()
           if p_value[j] < 0.01: flag_value_0p01[j] = 1
           if p_value[j] < 0.05: flag_value_0p05[j] = 1
           if p_value[j] < 0.10: flag_value_0p10[j] = 1

       # Creating column names for the data frame.
       p_value_column_name_current           = 'perm_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
       t_value_column_name_current           = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
       neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
       difference_value_column_name_current  = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
       flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1] 
       flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1] 
       flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1] 

       # Adding current p_value and flag_value column to the data frame and assigning the name.
       # If the data frame has not been created yet we create it on the fly. i.e. if i == 0 create it.
       if i == 0:
          flag_df = pd.DataFrame(data = flag_value_0p01, columns = [flag_value_column_name_current_0p01], index = indexes_list )    
       else:
          flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

       # At this point data frame exists so only columns are added to the existing data frame.
       summary_df[p_value_column_name_current]           = p_value
       summary_df[t_value_column_name_current]           = t_value
       summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
       summary_df[difference_value_column_name_current]  = difference_value
       flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
       flag_df[flag_value_column_name_current_0p10] = flag_value_0p10




    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding name for the unique ID column that was there originally.
    summary_df.index.name    =  args.uniqueID
    flag_df.index.name =  args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")



    # Generating Indexing for volcano plots.
    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
        if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
        if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff=2

    # Making volcano plots
    with PdfPages( args.volcano ) as pdf:
        for i in range(0, number_of_groups_pairwise ):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            groups_subset = groups_pairwise[i]
            current_key =  groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]), colorList=list('b'), ax=volcanoPlot.ax[0])

            # Color results beyond threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key]>cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals), colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                xTitle="Difference of treatment means for {0}".format(current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finishing t-test run.")
Example #7
def main(args):

    # Loading data through the interface.
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqueID,
                       group=args.group,
                       logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data
    dat.dropMissing()

    # Getting the unique group levels and all pairwise combinations
    # so that we can feed them to Kruskal-Wallis.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)
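    # For example, combinations(["A", "B", "C"], 2) yields ("A", "B"),
    # ("A", "C"), and ("B", "C"), so three group levels give three pairwise tests.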

    # Extracting data from the interface.
    data_frame = dat.transpose()
    # Extracting the number of features (all columns except the group column).
    number_of_features = data_frame.shape[1] - 1
    # Running the overall Kruskal-Wallis test for all group levels combined.

    # Creating the p-value, H-value, summary, and flag lists (the latter for 3
    # significance levels), each of length number_of_features and initialized
    # with zeros. These hold the results for all groups combined.
    p_value_all = [0] * number_of_features
    H_value_all = [0] * number_of_features
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features
    flag_value_all_0p01 = [0] * number_of_features
    flag_value_all_0p05 = [0] * number_of_features
    flag_value_all_0p10 = [0] * number_of_features
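    # Flags default to 0 (not significant) and are set to 1 when the p-value
    # falls below the corresponding significance level.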

    for j in range(0, number_of_features):

        # Taking a working reference to the data frame (note: not a copy).
        data_frame_manipulate = data_frame

        # Dropping the group column so that only feature columns remain.
        # We also transpose here so it is easier to operate with.
        data_frame_manipulate_transpose = data_frame_manipulate.drop(
            args.group, axis=1).transpose()
        # Pulling indexes list from the current data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        # Computing dataset summaries.
        mean_value_all[j] = np.mean(
            data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
        variance_value_all[j] = np.var(
            data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
            ddof=1)

        for i in range(0, number_of_unique_groups):

            # Extracting the pieces of the data frame that belong to ith unique group.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]

            # Dropping the group column so that only feature columns remain.
            # We also transpose here so it is easier to operate with.
            data_frame_current_group = data_frame_current_group.drop(
                args.group, axis=1).transpose()
            # Pulling indexes list from the current data frame.
            indexes_list = data_frame_current_group.index.tolist()

            # Series current for group i and row (feature) j.
            series_current = data_frame_current_group.loc[indexes_list[j]]

            # This piece of code depends on whether it is the first group in the list or not.
            if i == 0:
                series_total = [series_current]
            else:
                series_total.append(series_current)

        # Checking whether the compared values actually differ.
        # Combining all values of feature j for the check.
        combined_list = data_frame_manipulate_transpose.loc[
            indexes_list_complete[j]].tolist()
        combined_list_unique = np.unique(combined_list)
        # If every value is identical, the Kruskal-Wallis statistic is
        # undefined, so we assign NaN. NaN comparisons evaluate to False,
        # so the significance flags stay at 0.
        if len(combined_list_unique) == 1:
            p_value_all[j] = float("nan")
            H_value_all[j] = float("nan")
        else:
            # Performing Kruskal-Wallis for all groups for feature j.
            # A single call returns both the H statistic and the p-value.
            kruskal_wallis_args = series_total
            H_value_all[j], p_value_all[j] = kruskalwallis(*kruskal_wallis_args)
            if p_value_all[j] < 0.01: flag_value_all_0p01[j] = 1
            if p_value_all[j] < 0.05: flag_value_all_0p05[j] = 1
            if p_value_all[j] < 0.10: flag_value_all_0p10[j] = 1
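        # For reference (assuming kruskalwallis is scipy.stats.kruskal or its
        # mstats alias): kruskalwallis([1, 2, 3], [4, 5, 6]) gives an H
        # statistic of about 3.857 and a p-value of about 0.0495.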

    # The loop over features is finished by now; converting the results into a
    # data frame. The pairwise results will be added later.
    summary_df = pd.DataFrame(data=mean_value_all,
                              columns=["GrandMean"],
                              index=indexes_list)
    summary_df['SampleVariance'] = variance_value_all
    summary_df['H_value_for_all'] = H_value_all
    summary_df['prob_greater_than_H_for_all'] = p_value_all
    flag_df = pd.DataFrame(data=flag_value_all_0p01,
                           columns=["flag_significant_0p01_on_all_groups"],
                           index=indexes_list)
    flag_df["flag_significant_0p05_on_all_groups"] = flag_value_all_0p05
    flag_df["flag_significant_0p10_on_all_groups"] = flag_value_all_0p10

    # Informing that KW for all groups has been performed.
    logger.info(
        u"Kruskal-Wallis test for all groups together has been performed.")

    # Computing means for each group.
    # This part just produces summary statistics for the output table;
    # it has nothing to do with Kruskal-Wallis itself.

    for i in range(0, number_of_unique_groups):

        # Extracting the pieces of the data frame that belong to ith group.
        data_frame_current_group = data_frame.loc[data_frame[args.group].isin(
            [group_values_series_unique[i]])]

        # Dropping the group column so that only feature columns remain.
        # We also transpose here so it is easier to operate with.
        data_frame_current_group = data_frame_current_group.drop(
            args.group, axis=1).transpose()
        # Pulling indexes list from the current group.
        indexes_list = data_frame_current_group.index.tolist()

        # Creating array of means for the current group that will be filled.
        means_value = [0] * number_of_features

        for j in range(0, number_of_features):

            series_current = data_frame_current_group.loc[indexes_list[j]]
            means_value[j] = series_current.mean()

        means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[
            i]
        summary_df[means_value_column_name_current] = means_value
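        # For a group level "A", for example, the column above is named
        # "mean_treatment_A".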

    # Running pairwise Kruskal-Wallis tests for all pairs of group levels saved in groups_pairwise.

    for i in range(0, number_of_groups_pairwise):

        # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair.
        groups_subset = groups_pairwise[i]
        data_frame_first_group = data_frame.loc[data_frame[args.group].isin(
            [groups_subset[0]])]
        data_frame_second_group = data_frame.loc[data_frame[args.group].isin(
            [groups_subset[1]])]

        # Dropping the group column so that only feature columns remain.
        # We also transpose here so it is easier to operate with.
        data_frame_first_group = data_frame_first_group.drop(
            args.group, axis=1).transpose()
        data_frame_second_group = data_frame_second_group.drop(
            args.group, axis=1).transpose()
        # Pulling indexes list from the first one (they are the same)
        indexes_list = data_frame_first_group.index.tolist()

        # Creating the p_value, H_value, neg_log10_p_value, flag_value, and
        # difference_value lists, each filled with zeros.
        p_value = [0] * number_of_features
        H_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        difference_value = [0] * number_of_features

        for j in range(0, number_of_features):

            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # Checking whether the compared values actually differ.
            # Combining both groups' values for the check.
            first_list = data_frame_first_group.loc[indexes_list[j]].tolist()
            second_list = data_frame_second_group.loc[indexes_list[j]].tolist()
            combined_list = first_list + second_list
            combined_list_unique = np.unique(combined_list)
            # If every value is identical, the Kruskal-Wallis statistic is
            # undefined, so we assign NaN. NaN comparisons evaluate to False,
            # so the significance flags stay at 0.
            if len(combined_list_unique) == 1:
                p_value[j] = float("nan")
                H_value[j] = float("nan")
                neg_log10_p_value[j] = float("nan")
                difference_value[j] = series_first.mean() - series_second.mean()
            else:
                # A single call returns both the H statistic and the p-value.
                kruskal_wallis_args = [series_first, series_second]
                H_value[j], p_value[j] = kruskalwallis(*kruskal_wallis_args)
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean()
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # Adding the current p-value, H-value, and flag columns to the data
        # frames and assigning their names.
        p_value_column_name_current = ('prob_greater_than_H_for_diff_' +
                                       groups_subset[0] + '_' + groups_subset[1])
        H_value_column_name_current = ('H_value_for_diff_' +
                                       groups_subset[0] + '_' + groups_subset[1])
        neg_log10_p_value_column_name_current = ('neg_log10_p_value_' +
                                                 groups_subset[0] + '_' + groups_subset[1])
        difference_value_column_name_current = ('diff_of_' +
                                                groups_subset[0] + '_' + groups_subset[1])
        summary_df[p_value_column_name_current] = p_value
        summary_df[H_value_column_name_current] = H_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_value_column_name_current_0p01 = ('flag_significant_0p01_on_' +
                                               groups_subset[0] + '_' + groups_subset[1])
        flag_value_column_name_current_0p05 = ('flag_significant_0p05_on_' +
                                               groups_subset[0] + '_' + groups_subset[1])
        flag_value_column_name_current_0p10 = ('flag_significant_0p10_on_' +
                                               groups_subset[0] + '_' + groups_subset[1])
        flag_df[flag_value_column_name_current_0p01] = flag_value_0p01
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10
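        # For the pair ("A", "B"), for example, these columns are named
        # "flag_significant_0p01_on_A_B", "flag_significant_0p05_on_A_B",
        # and "flag_significant_0p10_on_A_B".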

    # Rounding the results to 4 decimal places.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Restoring the name of the original unique ID column on the index.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Informing that pairwise KW has been performed.
    logger.info(
        u"Kruskal-Wallis test for all groups pairwise has been performed.")

    # Generating the indexing for the volcano plots.

    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("diff_of_")}

    # The cutoff value for significance: -log10(0.01) = 2, so points above
    # this line correspond to p-values below 0.01.
    cutoff = 2

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".
                format(current_key),
                xTitle="Difference of treatment means for {0}".format(
                    current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finishing running of Kruscal-Wallis tests.")