def _load_and_validate_file_of_dimentions(self, datafile, dim, header):
    """
    Loads a data file and validates that it contains a matrix of dimensions dim.

    datafile - a path to the file, or an already opened file-like object
               exposing a `.name` attribute (must not be None)
    dim      - the expected number of dimensions; forwarded to common.load_data_file
    header   - True if file contains a header (first row, which values are the column names)
               False if file doesn't contain a header
               None - let the loader infer if there is a header

    returns the tuple (data, cols_labels, rows_labels) produced by common.load_data_file.
    common.terminate is called when the loader returns no data.
    """
    if hasattr(datafile, 'name'):
        filename = datafile.name
    else:
        # Opening the path keeps the fail-fast behaviour for unreadable files;
        # the `with` block releases the handle, which only serves to get the name.
        with open(datafile, 'r') as handle:
            filename = handle.name

    logging.info("Loading file %s...", filename)
    data, cols_labels, rows_labels = common.load_data_file(filename, dim, header=header)
    if data is None:
        common.terminate("There is a problem with the format of the file '%s'." % filename)

    return data, cols_labels, rows_labels
def __init__(self, methylation_data, reference_file, outputfile=None):
    """
    Loads the reference matrix, runs houseman on it and saves the result.

    methylation_data - kept as-is on self.meth_data
    reference_file   - object whose `.name` attribute is the path to a .txt file of
                       sites by cell type reference matrix, including row and column headers
                       (the code reads reference_file.name, so a bare path string is not accepted)
    outputfile       - handed to self.save() and then stored on self.outputfile
    """
    self.meth_data = methylation_data

    # Load and extract the reference - cell-type specific methylation levels
    # of a group of reference sites
    reference_path = reference_file.name
    logging.info("Loading houseman refernece file %s..." % reference_path)
    loaded = common.load_data_file(reference_path, 2, header=True)
    reference_matrix, self.names, reference_ids = loaded

    reference_is_broken = reference_matrix is None or reference_ids is None
    if reference_is_broken:
        common.terminate(
            "There is a problem with the format of the reference file '%s'." % reference_path)

    self.components = self.houseman(reference_matrix, reference_ids)
    self.save(outputfile)
    self.outputfile = outputfile
def _parse_missing_value_indicator(missing_value_indicator):
    """
    Finds the type the missing value indicator should be used as.

    returns the tuple (indicator, replace):
        indicator - an int if the input represents an integer,
                    a float if it represents a non-integral number,
                    otherwise the input itself, unchanged
        replace   - True only when the indicator is not numeric
    """
    try:
        # a string representing an int can be converted to both float and int
        # but a string representing a float can't be converted to int,
        # that's why the float conversion is checked first
        float_ind = float(missing_value_indicator)
    except (TypeError, ValueError, OverflowError):
        return missing_value_indicator, True

    try:
        int_ind = int(missing_value_indicator)
    except (TypeError, ValueError, OverflowError):
        return float_ind, False

    # int() silently truncates a real float (1.5 -> 1), which would make the
    # function look for the wrong indicator; keep the float in that case
    if isinstance(missing_value_indicator, float) and int_ind != float_ind:
        return float_ind, False
    return int_ind, False


def replace_missing(data_filename, missing_value_indicator, data_max_missing_values, samples_max_missing_values, sep=" ", suffix=".no_missing_values", header=None):
    """
    Loads a matrix, replaces its missing values and saves the output to
    the file named data_filename + suffix.

    The replacement itself is delegated to _replace_missing_values_in_matrix.
    As originally documented: missing values are replaced by the mean (mean of
    non-missing data/samples), samples with too many missing values (more than
    samples_max_missing_values) are removed, and data with too many missing
    values (more than data_max_missing_values) are removed.

    params:
    data_filename - a matrix of type int or float (not including the missing value
                    indicator which can be string as well).
                    As originally documented, the expected layout is one column
                    per sample and one row per data (e.g. site):
                                sample_0, .., sample_n
                        data_0
                        .
                        .
                        data_m
                    transpose before sending to function if you have a different format
    missing_value_indicator - the missing value char (int, float or string) in your data
    data_max_missing_values - the maximum data missing values allowed (percentage - values between 0 and 1)
    samples_max_missing_values - the maximum sample missing values allowed (percentage - values between 0 and 1)
    sep - the separator sign of the matrix in data_filename
          (NOTE: not used by this function; kept for backward compatibility)
    suffix - the suffix for the output filename
    header - forwarded to common.load_data_file

    returns the array returned by _replace_missing_values_in_matrix, or None if
    loading/processing failed (the error is reported through a raw_input prompt)
    """
    dim = 2

    # if the indicator is numeric it is used as is;
    # if it is a string, `replace` tells the matrix helper about it
    missing_value_indicator, replace = _parse_missing_value_indicator(missing_value_indicator)

    try:
        all_data, col_names, row_names = common.load_data_file(data_filename, dim, header=header, na_values=missing_value_indicator)
        if all_data.ndim != dim:
            raw_input("Error: got data from dimensions %d while excepted to %d. Please check all paramenters are OK (data type and separator)." %(all_data.ndim, dim))
            return None

        output_filename = data_filename + suffix
        output_data, col_names, row_names = _replace_missing_values_in_matrix(all_data, missing_value_indicator, data_max_missing_values, samples_max_missing_values, replace, col_names, row_names)

        # parenthesised so the line is valid both as a print statement and a print call
        print("Output is saved to " + output_filename)

        # rebuild the labelled matrix: row names as first column, column names as first row
        data_to_save = output_data
        if row_names is not None:
            data_to_save = column_stack((row_names, data_to_save))
        if col_names is not None:
            if row_names is not None:
                new_header = ["ID"] + list(col_names)
            else:
                new_header = col_names
            data_to_save = vstack((new_header, data_to_save))

        savetxt(output_filename, data_to_save, fmt='%s')
        return output_data
    except Exception as e:
        # NOTE: the prompt blocks until the user presses enter; kept as is since
        # interactive callers may rely on it
        raw_input("Error loading data file. please check that data_type, separator and missing value indicator Ok\n%s"%e)
        return None