def process_data(data):
    """Expand categorical features into indicator columns and list the features.

    Every name in the module-level ``FEATURES`` is inspected. A column whose
    dtype is ``object`` is treated as categorical: for each distinct value
    reported by ``value_counts()`` a float column named after that value is
    added to ``data`` (1.0 where the row holds the value, 0.0 otherwise).
    Every other column is treated as numerical and used as is. Progress is
    written to the log and to the HTML report.

    Args:
        data: table exposing the pandas DataFrame API and containing every
            column named in ``FEATURES``. It is modified in place.

    Returns:
        Tuple ``(data, vector_features)`` where ``vector_features`` lists the
        numerical feature names and the generated indicator column names in
        processing order.
    """
    log.info('Preprocessing data')
    msg = 'Found features:'
    log.info(msg)
    logger.write_to_html(OUTHTML, '\n' + msg)

    vector_features = []
    for feature in FEATURES:
        # Only object-dtype columns are recognised as categorical.
        if data[feature].dtypes == object:
            msg = ' Categorical feature: %s' % feature
            log.info(msg)
            logger.write_to_html(OUTHTML, msg)
            msg = ' Unique values: %i' % len(data[feature].unique())
            log.info(msg)
            logger.write_to_html(OUTHTML, msg)

            counts = data[feature].value_counts()
            log.info(counts)
            # Iterate the index of the counts instead of Series.iteritems(),
            # which no longer exists in pandas >= 2.0.
            for category in counts.index:
                log.info('Numerizing category: %s', category)
                # NOTE(review): the indicator column is named after the raw
                # category value, so it overwrites any existing column with
                # the same name - confirm that values are unique across
                # features and column names.
                data[category] = (data[feature] == category).astype(float)
                vector_features.append(category)
        else:
            msg = ' Numerical feature: %s' % feature
            log.info(msg)
            logger.write_to_html(OUTHTML, msg)
            vector_features.append(feature)

    log.info('Preprocessing complete')
    log.info('')
    return data, vector_features
def write_group_difference(groups, output, features):
    """Write pairwise between-group p-values per feature to the HTML report.

    For every column of ``output`` that is also listed in ``features``, each
    unordered pair of groups is compared with ``compare_features`` and the
    resulting p-value is written to the report, followed by the mean of the
    p-values for that feature. The report is meant for manual quality control.

    Args:
        groups: sequence of group objects exposing an ``id`` attribute.
        output: table (pandas DataFrame API) with a ``'Group'`` column holding
            group ids and one column per feature.
        features: collection of feature names to report on.
    """
    report_features = [col for col in output.columns.values if col in features]

    topic = 'Difference significance of numerical features between groups'
    logger.topic_to_html(OUTHTML, topic)

    for feat in report_features:
        logger.write_to_html(OUTHTML, 'Feature: %s' % feat)

        p_values = []
        for group, other_group in itertools.combinations(groups, 2):
            rows_a = output.loc[output['Group'] == group.id]
            rows_b = output.loc[output['Group'] == other_group.id]
            _, p = compare_features(rows_a, rows_b, feat)
            p_values.append(p)
            msg = ' Group %i & group %i p = %.4f' % (group.id, other_group.id, p)
            logger.write_to_html(OUTHTML, msg)

        # With fewer than two groups there are no pairs; skip the mean rather
        # than reporting the NaN that np.mean returns for an empty list.
        if p_values:
            msg = ' Mean significance p = %.4f' % (np.mean(p_values))
            logger.write_to_html(OUTHTML, msg)

        logger.write_to_html(OUTHTML, '<br>')
def finalize_latin(data):
    """Save the finished latin square and close the HTML report.

    The table is stored via ``save_data`` under ``latin_<FILE_NAME>.<FILE_FORMAT>``,
    its location and its last ``N_GROUPS`` columns are written to the report,
    and, if the report file exists, it is finished and opened in a new
    browser tab.

    Args:
        data: table (pandas DataFrame API) holding the latin square.
    """
    save_name = ('latin_%s' % FILE_NAME) + '.' + FILE_FORMAT
    save_data(data, save_name)

    # '%' binds tighter than '+': the file name is appended after formatting.
    location_note = ('Grouped data available under %s' % OUTPUT_PATH) + save_name
    logger.write_to_html(OUTHTML, location_note)

    group_columns = data.iloc[:, -N_GROUPS:]
    logger.table_to_html(OUTHTML, group_columns.to_html(), 'Experimental groups')

    report_exists = os.path.isfile(OUTHTML)
    if report_exists:
        logger.finish_html(OUTHTML)
        webbrowser.open_new_tab(OUTHTML)

    log.info('Exiting script...')
def initialize():
    """Start the HTML report and load the input data.

    Returns:
        Tuple ``(data, original_data)``: the table loaded from ``INPUT_PATH``
        and a copy of it taken before any processing.
    """
    logger.init_html(OUTHTML, OUTPUT_PATH, INPUT_PATH, N_GROUPS)

    data = load_data(INPUT_PATH)
    animal_line = 'Number of animals: %i' % len(data)
    group_line = 'Number of groups to create: %i' % N_GROUPS

    log.info(animal_line)
    log.info(group_line)

    # Only the animal count goes into the report, followed by a line break.
    for fragment in (animal_line, '<br>'):
        logger.write_to_html(OUTHTML, fragment)

    return data, data.copy()
def run(data, original_data):
    """Run the grouping workflow from preprocessing to optimized groups.

    Steps, in order: preprocess features, plot the original data, scale the
    features, build animals and groups, assign animals to groups and, when
    more than two groups are requested, optimize the assignment.

    Args:
        data: input table that is preprocessed and scaled.
        original_data: untouched copy of the input, used for plotting.

    Returns:
        Tuple ``(data, features, animals, groups)``.
    """
    data, features = process_data(data)

    # The return value of plot_data is not used further.
    _plot = plot_data(original_data)

    data = standardscale(data, features)
    animals = make_animals(data, features)
    groups = make_groups(features)
    assign_animals(animals, groups)

    needs_optimization = N_GROUPS > 2
    if needs_optimization:
        optimize_groups(animals, groups, features)

    logger.write_to_html(OUTHTML, '<br>')
    return data, features, animals, groups
def finalize(original_data, animals, groups, features):
    """Produce, report, blind and save the final grouping, then close the report.

    The grouped table is built and plotted, between-group differences are
    written to the report, the table is blinded and saved under
    ``blinded_<FILE_NAME>.<FILE_FORMAT>``, and the group table plus the save
    location are added to the report. If the report file exists it is
    finished and opened in a new browser tab.

    Args:
        original_data: unprocessed input table.
        animals: animal objects produced by the workflow.
        groups: group objects produced by the workflow.
        features: feature names used for the group comparison.
    """
    grouped = make_output(original_data, animals, groups)
    plot_result(grouped)
    write_group_difference(groups, grouped, features)

    blinded = blind_output(grouped)
    save_name = ('blinded_%s' % FILE_NAME) + '.' + FILE_FORMAT
    save_data(blinded, save_name)

    # '%' binds tighter than '+': the file name is appended after formatting.
    location_note = ('Grouped data available under %s' % OUTPUT_PATH) + save_name

    # The group table is written before the location note.
    write_group_table(blinded)
    logger.write_to_html(OUTHTML, location_note)

    report_exists = os.path.isfile(OUTHTML)
    if report_exists:
        logger.finish_html(OUTHTML)
        webbrowser.open_new_tab(OUTHTML)

    log.info('Exiting script...')
def _log_pairwise_similarity(group, groups):
    """Log the mean p-value of ``group`` against every other group.

    Returns the mean p-value of the last comparison made, or ``None`` when
    there was no other group to compare with.
    """
    last_mean_p = None
    for other_group in groups:
        if group == other_group:
            continue
        _, p = compare_groups(group, other_group)
        last_mean_p = np.mean(p)
        log.info(
            'mean feature difference between group %i and %i: %.3f',
            group.id, other_group.id, last_mean_p)
    return last_mean_p


def _scan_group_sizes(groups, compare=False):
    """Log every group's size and return the largest and smallest group.

    Ties are resolved in favour of the first group encountered. When
    ``compare`` is true the pairwise similarity of each group is logged as
    well. Returns ``(largest, smallest, last_mean_p)`` where ``last_mean_p``
    is the most recently logged mean p-value (0.0 if none was computed).
    """
    largest = None
    smallest = None
    last_mean_p = 0.0
    for group in groups:
        size = len(group.animals)
        log.info('')
        log.info('Group ID: %i', group.id)
        log.info('Group size: %i', size)
        if largest is None or size > len(largest.animals):
            largest = group
        if smallest is None or size < len(smallest.animals):
            smallest = group
        if compare:
            mean_p = _log_pairwise_similarity(group, groups)
            if mean_p is not None:
                last_mean_p = mean_p
    return largest, smallest, last_mean_p


def equalize_groups(animals, groups):
    """Balance group sizes while keeping the groups as similar as possible.

    While the largest and the smallest group differ by more than one animal,
    every animal of the largest group is tentatively moved to the smallest
    group; the move that gives the highest mean p-value from
    ``compare_groups`` (presumably: the least detectable difference between
    the two groups - confirm against ``compare_groups``) is made permanent.
    The final group sizes are written to the log and the HTML report.

    Args:
        animals: not used by this function; kept for the existing call
            signature.
        groups: sequence of group objects exposing ``id``, ``animals``,
            ``assign(animal)`` and ``release(animal)``.

    Returns:
        None.

    Raises:
        RuntimeError: if no candidate move yields a comparable score (for
            example when every mean p-value is NaN).
    """
    log.info('Equalizing group sizes')

    if len(groups) == 0:
        # Nothing to balance; the original code failed here with an unbound
        # local variable.
        log.info('No groups to equalize')
        log.info('')
        return

    largest_group, smallest_group, mean_p = _scan_group_sizes(
        groups, compare=True)
    # NOTE: this is the value of the last pairwise comparison logged above,
    # not an average over all pairs; kept as in the original output.
    log.info('mean difference: %.3f', mean_p)
    log.info('largest group: %i', largest_group.id)
    log.info('smallest group %i:', smallest_group.id)

    while len(largest_group.animals) - len(smallest_group.animals) > 1:
        log.info('Group sizes are not balanced!')
        log.info('checking for best animal to reassign')

        # Start below any possible score so that a candidate with a mean
        # p-value of exactly 0.0 can still be selected.
        best_mean_p = float('-inf')
        best_animal = None

        # Iterate over a snapshot: release()/assign() modify the group while
        # candidates are tried, and iterating the live collection could skip
        # or repeat animals.
        for animal in list(largest_group.animals):
            log.info('')
            largest_group.release(animal)
            smallest_group.assign(animal)
            _, p = compare_groups(largest_group, smallest_group)
            mean_p = np.mean(p)
            if mean_p > best_mean_p:
                best_mean_p = mean_p
                best_animal = animal
            # Undo the tentative move before trying the next candidate.
            smallest_group.release(animal)
            largest_group.assign(animal)

        if best_animal is None:
            raise RuntimeError(
                'Could not find an animal to move from group %s to group %s: '
                'no candidate produced a comparable mean p-value'
                % (largest_group.id, smallest_group.id))

        log.info('')
        log.info('best animal to switch is: %s with new mean p = %.6f',
                 best_animal.id, best_mean_p)
        largest_group.release(best_animal)
        smallest_group.assign(best_animal)
        log.info('')

        # Sizes changed, so determine the extremes again.
        largest_group, smallest_group, _ = _scan_group_sizes(groups)

    log.info('')
    msg = 'Group sizes are balanced!'
    log.info(msg)
    logger.write_to_html(OUTHTML, msg)
    for group in groups:
        msg = ' Group %i contains %i animals' % (group.id, len(group.animals))
        log.info(msg)
        logger.write_to_html(OUTHTML, msg)
        _log_pairwise_similarity(group, groups)

    log.info('Equalizing group size complete!')
    log.info('')
    return