Exemplo n.º 1
0
def get_outfile_name(destination_dir,
                     spreadsheet_file_name,
                     transform_name,
                     file_ext='tsv',
                     timestamp=True):
    """ construct a full path output file name from destination path, spreadsheet file name,
        transformation name and file extension

    Args:
        destination_dir:                   where the file will be written (must already exist;
                                           this function does not check or create it)
        spreadsheet_file_name:             usually the input file name (before transformation);
                                           its directory part and last extension are discarded
        transform_name:                    the operation performed on the input file
        file_ext:                          extension without the period character (default 'tsv')
        timestamp:                         when truthy (default) the file name is produced by
                                           kn.create_timestamped_filename(name, file_ext);
                                           otherwise it is <name>_<transform_name>.<file_ext>

    Returns:
        spreadsheet_transformed_file_name:  full path output file name
    """
    # keep only the bare input name: no directory and no (last) extension
    name_base = os.path.splitext(os.path.basename(spreadsheet_file_name))[0]
    name_base = name_base + '_' + transform_name

    if timestamp:
        file_name = kn.create_timestamped_filename(name_base, file_ext)
    else:
        file_name = name_base + '.' + file_ext

    return os.path.join(destination_dir, file_name)
def get_output_file_name(run_parameters,
                         dir_name_key,
                         prefix_string,
                         suffix_string='',
                         type_suffix='tsv'):
    """ build the full directory / file name used for writing an output file

    The name is assembled as
        <directory>/<prefix_string>_<method>_<correlation_measure>
    which is passed through kn.create_timestamped_filename, and then
        _<suffix_string>.<type_suffix>
    is appended to whatever that call returns.

    Args:
        run_parameters: dictionary with keys: dir_name_key, "method" and "correlation_measure"
        dir_name_key:   run_parameters dictionary key for the output directory
        prefix_string:  the first letters of the output file name
        suffix_string:  the last letters of the output file name before type_suffix
        type_suffix:    the file type extension (default 'tsv') without period character

    Returns:
        output_file_name:   full file and directory name suitable for file writing
    """
    name_head = '_'.join((prefix_string,
                          run_parameters['method'],
                          run_parameters["correlation_measure"]))
    name_tail = '_' + suffix_string + '.' + type_suffix

    # the tail is added only after the head has been through the time stamping helper
    stamped_path = kn.create_timestamped_filename(
        os.path.join(run_parameters[dir_name_key], name_head))

    return stamped_path + name_tail
Exemplo n.º 3
0
def phenotype_expander(run_parameters):
    """ Expand every categorical phenotype column into one 0/1 column per category.

    The phenotype spreadsheet is read from run_parameters['phenotype_name_full_path'],
    its columns are classified by run_pre_processing_phenotype_expander using
    run_parameters['threshold'], and only the columns reported under
    ColumnType.CATEGORICAL are expanded. The result is written as a tab separated
    file in run_parameters['results_directory'] with the index labelled
    "sample_id" and missing values written as 'NA'.

    Args:
        run_parameters: dictionary with keys 'phenotype_name_full_path',
                        'threshold' and 'results_directory'
    """
    phenotype_df = kn.get_spreadsheet_df(
        run_parameters['phenotype_name_full_path'])
    typed_columns = run_pre_processing_phenotype_expander(
        phenotype_df, run_parameters['threshold'])

    expanded_df = pd.DataFrame(index=phenotype_df.index)

    for column_type, column_dfs in typed_columns.items():
        # every column type other than categorical is left out of the result
        if not (column_type == ColumnType.CATEGORICAL):
            continue

        for column_df in column_dfs:
            trait = column_df.columns[0]
            observed = phenotype_df.loc[:, trait].dropna()
            categories = np.unique(observed.values)
            indicator_names = [trait + '_' + str(cat) for cat in categories]

            # indicators exist only for samples that have a value for this trait
            indicator_df = pd.DataFrame(columns=indicator_names,
                                        index=observed.index)
            # same columns over all samples; rows without a value stay empty
            padded_df = pd.DataFrame(columns=indicator_names,
                                     index=phenotype_df.index)

            for cat, indicator in zip(categories, indicator_names):
                indicator_df.loc[observed == cat, indicator] = 1
                indicator_df.loc[observed != cat, indicator] = 0

            padded_df.loc[indicator_df.index, :] = indicator_df
            expanded_df = pd.concat([expanded_df, padded_df], axis=1)

    out_name = kn.create_timestamped_filename("phenotype_expander_result",
                                              "tsv")
    out_path = os.path.join(run_parameters["results_directory"], out_name)
    expanded_df.index.name = "sample_id"
    expanded_df.to_csv(out_path, header=True, index=True, sep='\t', na_rep='NA')
def clustering_evaluation(run_parameters):
    """ Evaluate the clustering against every phenotype trait and save the table.

    The combined cluster / phenotype dataframe is classified by
    run_post_processing_phenotype_clustering_data using run_parameters['threshold'].
    Traits listed under ColumnType.CATEGORICAL are scored with chisquare, all
    other traits with f_oneway; each score becomes one column of the result.
    The failed traits dataframe is appended and the transposed table is written
    as a tab separated file in run_parameters['results_directory'] with missing
    values written as 'NA'.

    Args:
        run_parameters: dictionary of run parameters; 'threshold' and
                        'results_directory' are read here, the rest is used by
                        combine_phenotype_data_and_clustering
    """
    cluster_phenotype_df = combine_phenotype_data_and_clustering(
        run_parameters)
    typed_traits, fail_df = run_post_processing_phenotype_clustering_data(
        cluster_phenotype_df, run_parameters['threshold'])

    row_labels = [
        'Measure', 'Trait_length_after_dropna', 'Sample_number_after_dropna',
        'chi/fval', 'pval', 'SUCCESS/FAIL', 'Comments'
    ]
    result_df = pd.DataFrame(index=row_labels)

    for column_type, trait_dfs in typed_traits.items():
        # pick the statistic once per column type
        if column_type == ColumnType.CATEGORICAL:
            evaluate = chisquare
        else:
            evaluate = f_oneway

        for trait_df in trait_dfs:
            # the trait name is taken from the second column of each dataframe
            phenotype_name = trait_df.columns.values[1]
            result_df[phenotype_name] = evaluate(trait_df)

    out_name = kn.create_timestamped_filename("clustering_evaluation_result",
                                              "tsv")
    out_path = os.path.join(run_parameters["results_directory"], out_name)
    result_df = pd.concat([result_df, fail_df], axis=1)
    # transposed on output: one row per trait
    result_df.T.to_csv(out_path,
                       header=True,
                       index=True,
                       sep='\t',
                       na_rep='NA')
def write_predict_data(predict_df, run_parameters):
    """ Save predict data into a tab separated file

    The output path is built from the results directory, the base name of the
    test spreadsheet (directory and last extension removed) and the method
    name; it is passed through kn.create_timestamped_filename and '.tsv' is
    appended to the result.

    Args:
        predict_df: dataframe of prediction result. The first column contains response names and the
        second column has the corresponding predicted values
        run_parameters: dictionary of run parameters with keys
        'test_spreadsheet_name_full_path', 'results_directory' and 'method'
    """
    input_name = os.path.basename(
        run_parameters['test_spreadsheet_name_full_path'])
    input_stem = os.path.splitext(input_name)[0]

    unstamped_path = os.path.join(run_parameters['results_directory'],
                                  input_stem + '_' + run_parameters['method'])
    output_file_name = kn.create_timestamped_filename(unstamped_path) + '.tsv'

    # '%g' gives floats the compact general format
    predict_df.to_csv(output_file_name,
                      sep='\t',
                      header=True,
                      index=True,
                      float_format='%g')
def save_timestamped_df(input_df, results_dir, output_file_name):
    """ Save a dataframe through kn.save_df under a time stamped file name.

    The file name is obtained from
    kn.create_timestamped_filename(output_file_name, "df").

    Args:
        input_df: the dataframe to save.
        results_dir: directory to save outputs.
        output_file_name: base file name handed to the time stamping helper.
    """
    kn.save_df(input_df, results_dir,
               kn.create_timestamped_filename(output_file_name, "df"))
def write_predict_data(predict_df, run_parameters):
    """ Save predict data into a tab separated file with header and index

    The output path is the results directory joined with the base name of the
    test spreadsheet (directory and last extension removed); it is passed
    through kn.create_timestamped_filename and '.tsv' is appended to the result.

    Args:
        predict_df: dataframe of prediction result
        run_parameters: dictionary of run parameters with keys
        'test_spreadsheet_name_full_path' and 'results_directory'
    """
    spreadsheet_name = os.path.basename(
        run_parameters['test_spreadsheet_name_full_path'])
    spreadsheet_stem = os.path.splitext(spreadsheet_name)[0]

    target = os.path.join(run_parameters['results_directory'],
                          spreadsheet_stem)
    target = kn.create_timestamped_filename(target) + '.tsv'

    predict_df.to_csv(target, sep='\t', header=True, index=True)
Exemplo n.º 8
0
def write_predict_data(predict_df, run_parameters):
    """ Write the prediction dataframe to a tab separated file

    File name: <results_directory>/<test spreadsheet base name>_<method>,
    passed through kn.create_timestamped_filename, with '.tsv' appended.

    Args:
        predict_df: dataframe of prediction result
        run_parameters: dictionary of run parameters with keys
        'test_spreadsheet_name_full_path', 'results_directory' and 'method'
    """
    full_input_path = run_parameters['test_spreadsheet_name_full_path']
    out_dir = run_parameters['results_directory']
    method_name = run_parameters['method']

    # strip the directory and the last extension from the input name
    stem, _ = os.path.splitext(os.path.basename(full_input_path))
    stamped = kn.create_timestamped_filename(
        os.path.join(out_dir, stem + '_' + method_name))

    predict_df.to_csv(stamped + '.tsv',
                      sep='\t',
                      header=True,
                      index=True,
                      float_format='%g')
Exemplo n.º 9
0
def save_cosine_matrix_df(cosine_matrix_df, run_parameters):
    """Write the cosine matrix dataframe to a tab separated output file

    The file is placed in run_parameters['results_directory'] under the name
    returned by kn.create_timestamped_filename("cosine_matrix", "df"); header
    and index are both written.

    Args:
        cosine_matrix_df: dataframe with cosine value.
        run_parameters: parameters dictionary (key 'results_directory' is read).
    """
    stamped_name = kn.create_timestamped_filename("cosine_matrix", "df")
    target_path = os.path.join(run_parameters['results_directory'],
                               stamped_name)
    cosine_matrix_df.to_csv(target_path, header=True, index=True, sep='\t')
Exemplo n.º 10
0
    def test_create_timestamped_filename(self):
        """ assert that the returned file name starts with the unchanged base name
            and ends with the extension, first without a precision (None) and
            then with a precision of 1e-15
        """
        n_digits = 29
        name_base = 'test_string'
        name_extension = 'wie'

        for precision in (None, 1e-15):
            tsfn = kn.create_timestamped_filename(name_base, name_extension,
                                                  precision, n_digits)
            # leading characters must be exactly the base name
            self.assertEqual(name_base,
                             tsfn[:len(name_base)],
                             msg='prefix name exception')
            # trailing characters must be exactly the extension
            self.assertEqual(name_extension,
                             tsfn[len(tsfn) - len(name_extension):],
                             msg='extension name exception')
Exemplo n.º 11
0
def get_output_file_name(run_parameters, prefix_string, suffix_string='', type_suffix='tsv'):
    """ build the full directory / file name used for writing an output file

    The part <results_directory>/<prefix_string>_<method>_<similarity_measure> is
    passed through kn.create_timestamped_filename, then _<suffix_string>.<type_suffix>
    is appended to whatever that call returns.

    Args:
        run_parameters: dictionary with keys: "results_directory", "method" and "similarity_measure"
        prefix_string:  the first letters of the output file name
        suffix_string:  the last letters of the output file name before the type suffix
        type_suffix:    the file type extension (default 'tsv') without period character

    Returns:
        output_file_name:   full file and directory name suitable for file writing
    """
    out_dir = run_parameters["results_directory"]
    name_parts = (prefix_string, run_parameters['method'], run_parameters['similarity_measure'])

    stamped_path = kn.create_timestamped_filename(os.path.join(out_dir, '_'.join(name_parts)))
    return stamped_path + '_' + suffix_string + '.' + type_suffix
def map_and_save_droplist(spreadsheet_df, gene_names, droplist_name, run_parameters):
    """Map the dropped node names through the gene names map and save them

    The names returned by kn.find_dropped_node_names are looked up in the tab
    separated, header-less map file (first column used as index); the mapped
    values are saved with kn.save_df as a single column table.

    Args:
        spreadsheet_df: user supplied spreadsheet dataframe.
        gene_names: list of genes.
        droplist_name: column name of the saved table and base name of droplist file.
        run_parameters: dictionary of run parameters; keys "gene_names_map" and
                        'results_directory' are read.

    Returns:
        nothing; the result is only written to the results directory.
    """
    dropped_names = kn.find_dropped_node_names(spreadsheet_df, gene_names)

    names_map_df = pd.read_csv(run_parameters["gene_names_map"],
                               index_col=0,
                               header=None,
                               sep='\t')
    mapped_droplist_df = pd.DataFrame(names_map_df.loc[dropped_names].values,
                                      columns=[droplist_name])

    stamped_name = kn.create_timestamped_filename(droplist_name, "tsv")
    kn.save_df(mapped_droplist_df, run_parameters['results_directory'], stamped_name)