示例#1
0
文件: util.py 项目: teravest/emperor
def preprocess_mapping_file(data,
                            headers,
                            columns,
                            unique=False,
                            single=False,
                            clones=0):
    """Process a mapping file to expand the data or remove unuseful fields

    Inputs:
    data: mapping file data, a list of rows (lists) with the sample identifier
    in the first position
    headers: mapping file headers
    columns: list of headers to keep, if one of these headers includes two
    ampersands, this function will create a new column by merging the delimited
    columns. 'SampleID' is prepended when it is not already the first element;
    an empty list therefore selects only 'SampleID'.
    unique: remove columns (other than the first) where all values are unique
    single: remove columns (other than the first) where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers

    Raises:
    ValueError: if one of the names delimited by '&&' is not present in
    headers (raised by the index lookup on headers).

    The data and headers passed in are not modified by the column merging
    step; merged columns are added to copies.
    """

    # The sample ID must always be there, else it's meaningless data. Checking
    # for emptiness first avoids an IndexError on an empty selection.
    if not columns or columns[0] != 'SampleID':
        columns = ['SampleID'] + list(columns)

    # process concatenated columns if needed
    merge = [column for column in columns if '&&' in column]
    if merge:
        # work on copies so the caller's rows and headers are left untouched
        data = [list(line) for line in data]
        headers = list(headers)

    # each element needs several columns to be merged
    for new_column in merge:
        indices = [
            headers.index(header_name)
            for header_name in new_column.split('&&')
        ]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append(''.join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or singled valued columns
    if unique or single:
        columns_to_remove = set()
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique; the flags are
        # tested by truthiness so they agree with the guard above
        if unique:
            columns_to_remove.update(
                column_name for column_name in headers[1::]
                if metadata.hasUniqueCategoryValues(column_name))

        # remove categories where there is only one value
        if single:
            columns_to_remove.update(
                column_name for column_name in headers[1::]
                if metadata.hasSingleCategoryValue(column_name))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data,
                                                       headers,
                                                       list(columns_to_remove),
                                                       negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones mean: replicate the metadata retagging the sample ids with a suffix
    if clones:
        out_data = []
        for index in range(0, clones):
            out_data.extend([[element[0] + '_%d' % index] + element[1::]
                             for element in data])
        data = out_data

    return data, headers
示例#2
0
def preprocess_mapping_file(data, headers, columns, unique=False, single=False, clones=0):
    """Process a mapping file to expand the data or remove unuseful fields

    Inputs:
    data: mapping file data, a list of rows (lists) with the sample identifier
    in the first position
    headers: mapping file headers
    columns: list of headers to keep, if one of these headers includes two
    ampersands, this function will create a new column by merging the delimited
    columns. "SampleID" is prepended when it is not already the first element;
    an empty list therefore selects only "SampleID".
    unique: remove columns (other than the first) where all values are unique
    single: remove columns (other than the first) where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers

    Raises:
    ValueError: if one of the names delimited by "&&" is not present in
    headers (raised by the index lookup on headers).

    The data and headers passed in are not modified by the column merging
    step; merged columns are added to copies.
    """

    # The sample ID must always be there, else it's meaningless data. Checking
    # for emptiness first avoids an IndexError on an empty selection.
    if not columns or columns[0] != "SampleID":
        columns = ["SampleID"] + list(columns)

    # process concatenated columns if needed
    merge = [column for column in columns if "&&" in column]
    if merge:
        # work on copies so the caller's rows and headers are left untouched
        data = [list(line) for line in data]
        headers = list(headers)

    # each element needs several columns to be merged
    for new_column in merge:
        indices = [headers.index(header_name) for header_name in new_column.split("&&")]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append("".join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or singled valued columns
    if unique or single:
        columns_to_remove = set()
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique; the flags are
        # tested by truthiness so they agree with the guard above
        if unique:
            columns_to_remove.update(
                column_name for column_name in headers[1::] if metadata.hasUniqueCategoryValues(column_name)
            )

        # remove categories where there is only one value
        if single:
            columns_to_remove.update(
                column_name for column_name in headers[1::] if metadata.hasSingleCategoryValue(column_name)
            )

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data, headers, list(columns_to_remove), negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones mean: replicate the metadata retagging the sample ids with a suffix
    if clones:
        out_data = []
        for index in range(0, clones):
            out_data.extend([[element[0] + "_%d" % index] + element[1::] for element in data])
        data = out_data

    return data, headers