示例#1
0
def process_data(data):
    """Turn the configured feature columns into purely numerical columns.

    Walks over the module-level ``FEATURES``. A column whose dtype is
    ``object`` is treated as categorical: for every distinct value reported
    by ``value_counts()`` a float column named after that value is added to
    ``data`` (1.0 where the row equals the value, 0.0 otherwise). Any other
    column is treated as numerical and used unchanged. Progress is written
    to the log and to the HTML report.

    Args:
        data: pandas DataFrame containing every column named in
            ``FEATURES``. It is modified in place.

    Returns:
        tuple: ``(data, vector_features)`` where ``vector_features`` lists,
        in processing order, the numerical feature names and the names of
        the generated indicator columns.
    """
    log.info('Preprocessing data')
    msg = 'Found features:'
    log.info(msg)
    logger.write_to_html(OUTHTML, '\n' + msg)
    vector_features = []
    for f in FEATURES:
        if data[f].dtypes == object:
            # Computed once and reused for logging and for the expansion.
            counts = data[f].value_counts()
            msg = '     Categorical feature: %s' % f
            log.info(msg)
            logger.write_to_html(OUTHTML, msg)
            msg = '         Unique values: %i' % len(data[f].unique())
            log.info(msg)
            logger.write_to_html(OUTHTML, msg)
            log.info(counts)
            # ``Series.items()`` replaces ``iteritems()``, which no longer
            # exists in pandas >= 2.0; ``items()`` also works on older pandas.
            for category, _ in counts.items():
                log.info('Numerizing category: %s', category)
                # NOTE(review): the indicator column is named after the
                # category value only, so equal values in two different
                # features would write to the same column.
                data[category] = (data[f] == category).astype(float)
                vector_features.append(category)
        else:
            msg = '     Numerical feature: %s' % f
            log.info(msg)
            logger.write_to_html(OUTHTML, msg)
            vector_features.append(f)
    log.info('Preprocessing complete')
    log.info('')

    return data, vector_features
示例#2
0
def write_group_difference(groups, output, features):
    """Write pairwise between-group p-values per feature to the HTML report.

    For every column of ``output`` that also appears in ``features``, each
    unordered pair of groups is compared with ``compare_features`` and the
    resulting p-value is written to the report, followed by the mean of the
    p-values for that feature.

    Args:
        groups: iterable of group objects exposing an ``id`` attribute.
        output: DataFrame with a ``'Group'`` column holding group ids.
        features: collection of feature names to report on.

    Returns:
        None. The report file ``OUTHTML`` is written as a side effect.
    """
    # Keep the column order of ``output`` while restricting to ``features``.
    num_features = [
        feat for feat in output.columns.values if feat in features
    ]
    topic = 'Difference significance of numerical features between groups'
    logger.topic_to_html(OUTHTML, topic)
    for feat in num_features:
        msg = 'Feature: %s' % feat
        logger.write_to_html(OUTHTML, msg)
        ps = []
        for group, other_group in itertools.combinations(groups, 2):
            df1 = output.loc[output['Group'] == group.id]
            df2 = output.loc[output['Group'] == other_group.id]
            # Only the p-value is reported; the first returned value is unused.
            _, p = compare_features(df1, df2, feat)
            ps.append(p)
            msg = '    Group %i & group %i     p = %.4f' % (group.id,
                                                            other_group.id, p)
            logger.write_to_html(OUTHTML, msg)
        msg = '      Mean significance     p = %.4f' % (np.mean(ps))
        logger.write_to_html(OUTHTML, msg)
        msg = '<br>'
        logger.write_to_html(OUTHTML, msg)
示例#3
0
def finalize_latin(data):
    """Save the finished latin square and close out the HTML report.

    The data is stored via ``save_data`` under a ``latin_``-prefixed file
    name, the storage location and the last ``N_GROUPS`` columns of ``data``
    are written to the report, and, if the report file exists, it is
    finished and opened in a new browser tab.

    Args:
        data: DataFrame whose last ``N_GROUPS`` columns are rendered as the
            'Experimental groups' table.
    """
    file_stem = 'latin_%s' % FILE_NAME
    save_name = file_stem + '.' + FILE_FORMAT
    save_data(data, save_name)

    location_prefix = 'Grouped data available under %s' % OUTPUT_PATH
    logger.write_to_html(OUTHTML, location_prefix + save_name)

    group_columns = data.iloc[:, -N_GROUPS:]
    logger.table_to_html(OUTHTML, group_columns.to_html(),
                         'Experimental groups')

    report_exists = os.path.isfile(OUTHTML)
    if report_exists:
        logger.finish_html(OUTHTML)
        webbrowser.open_new_tab(OUTHTML)
    log.info('Exiting script...')
示例#4
0
def initialize():
    """Start the HTML report and load the input data.

    Initializes the report file, loads the data from ``INPUT_PATH``, and
    records the number of animals (log and report) and the number of groups
    to create (log only).

    Returns:
        tuple: ``(data, original_data)`` where ``original_data`` is a copy
        of ``data`` taken right after loading.
    """
    logger.init_html(OUTHTML, OUTPUT_PATH, INPUT_PATH, N_GROUPS)
    data = load_data(INPUT_PATH)

    animal_count_line = 'Number of animals: %i' % len(data)
    log.info(animal_count_line)
    log.info('Number of groups to create: %i' % N_GROUPS)

    for report_line in (animal_count_line, '<br>'):
        logger.write_to_html(OUTHTML, report_line)

    return data, data.copy()
示例#5
0
def run(data, original_data):
    """Run the grouping workflow step by step.

    Steps, in order: preprocess the data, plot the original data, scale the
    features, build animals and groups, assign animals to groups and, when
    more than two groups are requested, optimize the groups.

    Args:
        data: data to preprocess and scale.
        original_data: data handed to ``plot_data``.

    Returns:
        tuple: ``(data, features, animals, groups)`` where ``data`` is the
        scaled data.
    """
    processed, features = process_data(data)
    # The plotting result is not used further in this function.
    _plot_result = plot_data(original_data)
    scaled = standardscale(processed, features)

    animals = make_animals(scaled, features)
    groups = make_groups(features)
    assign_animals(animals, groups)
    if N_GROUPS > 2:
        optimize_groups(animals, groups, features)

    logger.write_to_html(OUTHTML, '<br>')
    return scaled, features, animals, groups
示例#6
0
def finalize(original_data, animals, groups, features):
    """Produce the final output once the grouping workflow has finished.

    Builds the output table, plots it, writes the between-group differences
    to the report, blinds and saves the output under a ``blinded_``-prefixed
    file name, writes the group table and the storage location to the
    report and, if the report file exists, finishes it and opens it in a
    new browser tab.

    Args:
        original_data: data passed to ``make_output``.
        animals: animals passed to ``make_output``.
        groups: groups passed to ``make_output`` and to the difference report.
        features: feature names passed to the difference report.
    """
    grouped = make_output(original_data, animals, groups)
    plot_result(grouped)
    write_group_difference(groups, grouped, features)

    blinded = blind_output(grouped)
    file_stem = 'blinded_%s' % FILE_NAME
    save_name = file_stem + '.' + FILE_FORMAT
    save_data(blinded, save_name)

    location_prefix = 'Grouped data available under %s' % OUTPUT_PATH
    location_note = location_prefix + save_name
    write_group_table(blinded)
    logger.write_to_html(OUTHTML, location_note)

    report_exists = os.path.isfile(OUTHTML)
    if report_exists:
        logger.finish_html(OUTHTML)
        webbrowser.open_new_tab(OUTHTML)
    log.info('Exiting script...')
示例#7
0
def _size_extremes(groups):
    """Return ``(largest, smallest)`` group by animal count; first one wins ties."""
    largest = max(groups, key=lambda g: len(g.animals))
    smallest = min(groups, key=lambda g: len(g.animals))
    return largest, smallest


def _log_group_size(group):
    """Log the id and the current number of animals of ``group``."""
    log.info('')
    log.info('Group ID: %i', group.id)
    log.info('Group size: %i', len(group.animals))


def _log_pairwise_differences(group, groups):
    """Log the mean p-value of ``group`` against every other group and return them."""
    pair_means = []
    for other_group in groups:
        if group == other_group:
            continue
        _, p = compare_groups(group, other_group)
        mean_p = np.mean(p)
        pair_means.append(mean_p)
        log.info('mean feature difference between group %i and %i: %.3f',
                 group.id, other_group.id, mean_p)
    return pair_means


def equalize_groups(animals, groups):
    """Equalize group sizes while keeping the groups as similar as possible.

    While the largest and the smallest group differ by more than one animal,
    every animal of the largest group is tentatively moved to the smallest
    group; the move that yields the highest mean p-value from
    ``compare_groups`` between those two groups is then made permanent.
    Progress is logged, and the final group sizes are also written to the
    HTML report.

    Args:
        animals: not used by this function; kept for interface compatibility.
        groups: iterable of group objects exposing ``id``, ``animals``,
            ``assign(animal)`` and ``release(animal)``. Modified in place.

    Returns:
        None.

    Raises:
        RuntimeError: if no tentative move produces a mean p-value above
            zero, so that no animal can be selected for reassignment.
    """
    log.info('Equalizing group sizes')
    groups = list(groups)
    if not groups:
        # Nothing to balance; the size scan below needs at least one group.
        log.info('No groups to equalize')
        return

    all_pair_means = []
    for group in groups:
        _log_group_size(group)
        all_pair_means.extend(_log_pairwise_differences(group, groups))
    largest_group, smallest_group = _size_extremes(groups)

    # Report the mean over all compared pairs (0.0 when there are no pairs).
    overall_mean_p = np.mean(all_pair_means) if all_pair_means else 0.0
    log.info('mean difference: %.3f', overall_mean_p)
    log.info('largest group: %i', largest_group.id)
    log.info('smallest group %i:', smallest_group.id)

    while len(largest_group.animals) - len(smallest_group.animals) > 1:
        log.info('Group sizes are not balanced!')
        log.info('checking for best animal to reassign')
        best_mean_p = 0.0
        best_animal = None
        # Iterate over a snapshot: release()/assign() are called on this very
        # group inside the loop, and changing a collection while iterating
        # over it can skip or repeat elements.
        for animal in list(largest_group.animals):
            log.info('')
            # Tentatively move the animal, score the result, then undo.
            largest_group.release(animal)
            smallest_group.assign(animal)
            _, p = compare_groups(largest_group, smallest_group)
            mean_p = np.mean(p)
            if mean_p > best_mean_p:
                best_mean_p = mean_p
                best_animal = animal
            smallest_group.release(animal)
            largest_group.assign(animal)
        if best_animal is None:
            raise RuntimeError(
                'Could not select an animal to move from group %s to group '
                '%s: no candidate produced a mean p-value above zero' %
                (largest_group.id, smallest_group.id))
        log.info('')
        log.info('best animal to switch is: %s with new mean p = %.6f',
                 best_animal.id, best_mean_p)
        largest_group.release(best_animal)
        smallest_group.assign(best_animal)
        log.info('')
        # Sizes changed, so determine the extremes again.
        for group in groups:
            _log_group_size(group)
        largest_group, smallest_group = _size_extremes(groups)

    log.info('')
    msg = 'Group sizes are balanced!'
    log.info(msg)
    logger.write_to_html(OUTHTML, msg)
    for group in groups:
        msg = '     Group %i contains %i animals' % (group.id,
                                                     len(group.animals))
        log.info(msg)
        logger.write_to_html(OUTHTML, msg)
        _log_pairwise_differences(group, groups)
    log.info('Equalizing group size complete!')
    log.info('')