def preprocess_mapping_file(data, headers, columns, unique=False, single=False,
                            clones=0):
    """Process a mapping file to expand the data or remove unuseful fields.

    Inputs:
    data: mapping file data (list of row lists)
    headers: mapping file headers (list of column names)
    columns: list of headers to keep; if one of these headers includes two
    ampersands, this function will create a new column by merging the
    delimited columns.
    unique: remove columns where all values are unique
    single: remove columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers
    """
    # The sample ID must always be there, else it's meaningless data
    if columns[0] != 'SampleID':
        columns = ['SampleID'] + columns

    # process concatenated columns if needed
    merge = [column for column in columns if '&&' in column]

    # each merged element is built from several existing columns
    for new_column in merge:
        indices = [headers.index(header_name)
                   for header_name in new_column.split('&&')]

        # join all the fields of the metadata that are listed in indices;
        # NOTE: rows in `data` are extended in place
        for line in data:
            line.append(''.join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or single-valued columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [column_name for column_name in headers[1:]
                                  if metadata.hasUniqueCategoryValues(
                                      column_name)]

        # find categories where there is only one value
        if single:
            columns_to_remove += [column_name for column_name in headers[1:]
                                  if metadata.hasSingleCategoryValue(
                                      column_name)]
        # de-duplicate: a column may qualify under both criteria
        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(
            data, headers, columns_to_remove, negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones mean: replicate the metadata retagging the sample ids with a
    # numeric suffix per replicate
    if clones:
        out_data = []
        for index in range(clones):
            out_data.extend([[element[0] + '_%d' % index] + element[1:]
                             for element in data])
        data = out_data

    return data, headers
def preprocess_mapping_file(data, headers, columns, unique=False, single=False,
                            clones=0):
    """Process a mapping file to expand the data or remove unuseful fields.

    Inputs:
    data: mapping file data (list of row lists)
    headers: mapping file headers (list of column names)
    columns: list of headers to keep; if one of these headers includes two
    ampersands, this function will create a new column by merging the
    delimited columns.
    unique: remove columns where all values are unique
    single: remove columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers
    """
    # The sample ID must always be there, else it's meaningless data
    if columns[0] != "SampleID":
        columns = ["SampleID"] + columns

    # process concatenated columns if needed
    merge = [column for column in columns if "&&" in column]

    # each merged element is built from several existing columns
    for new_column in merge:
        indices = [headers.index(header_name)
                   for header_name in new_column.split("&&")]

        # join all the fields of the metadata that are listed in indices;
        # NOTE: rows in `data` are extended in place
        for line in data:
            line.append("".join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or single-valued columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [column_name for column_name in headers[1:]
                                  if metadata.hasUniqueCategoryValues(
                                      column_name)]

        # find categories where there is only one value
        if single:
            columns_to_remove += [column_name for column_name in headers[1:]
                                  if metadata.hasSingleCategoryValue(
                                      column_name)]
        # de-duplicate: a column may qualify under both criteria
        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(
            data, headers, columns_to_remove, negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones mean: replicate the metadata retagging the sample ids with a
    # numeric suffix per replicate
    if clones:
        out_data = []
        for index in range(clones):
            out_data.extend([[element[0] + "_%d" % index] + element[1:]
                             for element in data])
        data = out_data

    return data, headers