예제 #1
0
    def _load_and_validate_file_of_dimentions(self, datafile, dim, header):
        """
        Load a data file through common.load_data_file and make sure data was returned.

        datafile - an open file object or a path to the file (must not be None)
        dim - the dimension the data in the file is expected to have
              (forwarded to common.load_data_file)
        header - True if file contains a header (first row, whose values are the column names)
                 False if file doesn't contain a header
                 None - let the loader infer if there is a header

        Returns the tuple (data, cols_labels, rows_labels) as produced by
        common.load_data_file. If the loader returns no data, common.terminate
        is called with a message about the file format.
        """
        if isinstance(datafile, file):
            # the caller owns this handle - only its name is needed here
            filename = datafile.name
        else:
            # Only the name is passed on to the loader. Opening the path still
            # fails early when it is missing or unreadable, and the context
            # manager closes the handle right away instead of leaking it.
            with open(datafile, 'r') as handle:
                filename = handle.name

        logging.info("Loading file %s...", filename)

        data, cols_labels, rows_labels = common.load_data_file(filename, dim, header=header)
        if data is None:
            common.terminate("There is a problem with the format of the file '%s'." % filename)
        return data, cols_labels, rows_labels
예제 #2
0
    def __init__(self, methylation_data, reference_file, outputfile=None):
        """
        Load the reference matrix, compute the components from it and save them.

        methylation_data - kept as-is on self.meth_data
        reference_file - object whose `.name` attribute is the path of a .txt file holding a
                         sites by cell type reference matrix, including row and column headers
                         (NOTE(review): presumably an open file handle - confirm with callers)
        outputfile - handed to self.save and then kept on self.outputfile (default None)
        """
        self.meth_data = methylation_data

        # Reference = cell-type specific methylation levels of a group of reference sites
        reference_path = reference_file.name
        loading_message = "Loading houseman refernece file %s..." % reference_path
        logging.info(loading_message)

        loaded = common.load_data_file(reference_path, 2, header=True)
        ref_data, self.names, ref_ids = loaded

        # both the values and the row ids are required to continue
        reference_is_broken = ref_data is None or ref_ids is None
        if reference_is_broken:
            common.terminate(
                "There is a problem with the format of the reference file '%s'."
                % reference_path)

        self.components = self.houseman(ref_data, ref_ids)
        # saving happens before outputfile is recorded on the instance
        self.save(outputfile)
        self.outputfile = outputfile
예제 #3
0
def _coerce_missing_value_indicator(indicator):
    """
    Convert a missing value indicator to the most specific numeric type it represents.

    Returns a tuple (value, is_string):
        (int value, False)   - the indicator represents an integer
        (float value, False) - the indicator represents a non-integer number
        (indicator, True)    - the indicator cannot be read as a number and is returned unchanged
    """
    import numbers  # local import so the helper does not rely on the file's import block

    # a string representing an int can be converted to both float and int,
    # but a string representing a float can't be converted to int -
    # that's why the float conversion is checked first
    try:
        as_float = float(indicator)
    except (TypeError, ValueError, OverflowError):
        return indicator, True

    try:
        as_int = int(indicator)
    except (TypeError, ValueError, OverflowError):
        # e.g. "1.5", "nan" or "inf": a float but not an int
        return as_float, False

    # int() silently truncates numeric input that has a fractional part (1.5 -> 1).
    # Such an indicator must stay a float, otherwise a different value would be
    # treated as the missing value.
    if isinstance(indicator, numbers.Number) and as_int != indicator:
        return as_float, False
    return as_int, False


def replace_missing(data_filename, missing_value_indicator, data_max_missing_values, samples_max_missing_values, sep=" ", suffix=".no_missing_values", header=None):
    """
    Load a data matrix, handle its missing values and save the result to the
    file named data_filename + suffix.

    The missing value handling itself is delegated to _replace_missing_values_in_matrix.
    According to the original documentation of this function:
    - missing values are replaced by the mean (mean of non-missing data/samples)
    - samples with too many missing values (more than samples_max_missing_values) are removed
    - data rows with too many missing values (more than data_max_missing_values) are removed

    params:
    data_filename - path of a matrix of type int or float (not including the missing value
                    indicator, which can be a string as well), where columns are samples and
                    rows are data (e.g. sites). The expected file format is

                    sample_0, .., sample_n
            data_0
            .
            .
            data_m

                    transpose before sending to this function if you have a different format
    missing_value_indicator - the missing value marker (int, float or string) in your data.
                    A value that represents a number is converted to int or float before it is
                    handed to the loader as na_values.
    data_max_missing_values - the maximum data missing values allowed (percentage - values between 0 and 1)
    samples_max_missing_values - the maximum sample missing values allowed (percentage - values between 0 and 1)
    sep - the separator sign of the matrix in data_filename
          (NOTE(review): accepted for backward compatibility but never used in this function body)
    suffix - the suffix for the output filename
    header - forwarded to common.load_data_file

    returns the array of non-missing data/samples, or None if anything went wrong
    (in that case the error is shown with raw_input, which waits for the user)
    """
    dim = 2

    # `replace` is True only when the indicator is a real string (not a number)
    missing_value_indicator, replace = _coerce_missing_value_indicator(missing_value_indicator)

    try:
        all_data, col_names, row_names = common.load_data_file(data_filename, dim, header=header, na_values=missing_value_indicator)

        if all_data is None:
            # the loader signals a badly formatted file by returning no data;
            # report that instead of failing on `None.ndim` below
            raise ValueError("the file '%s' could not be parsed" % data_filename)

        if all_data.ndim != dim:
            raw_input("Error: got data from dimensions %d while excepted to %d. Please check all paramenters are OK (data type and separator)." %(all_data.ndim, dim))
            return None
        output_filename = data_filename + suffix

        output_data, col_names, row_names = _replace_missing_values_in_matrix(all_data, missing_value_indicator, data_max_missing_values, samples_max_missing_values, replace, col_names, row_names)

        # rebuild the labelled matrix: row names as first column, column names as first row
        data_to_save = output_data
        if row_names is not None:
            data_to_save = column_stack((row_names, data_to_save))
        if col_names is not None:
            if row_names is not None:
                # the extra first column needs a title of its own
                new_header = ["ID"] + list(col_names)
            else:
                new_header = col_names
            data_to_save = vstack((new_header, data_to_save))

        savetxt(output_filename, data_to_save, fmt='%s')
        # announce the output only once it has actually been written
        print("Output is saved to " + output_filename)
        return output_data

    except Exception as e:
        # best-effort boundary: show the problem to the user instead of crashing
        raw_input("Error loading data file. please check that data_type, separator and missing value indicator Ok\n%s"%e)
        return None