Example #1

# Imports assumed by all of the examples below; data_io and features are
# helper modules used throughout these snippets (features is aliased as f).
import os

import data_io
import features as f

def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'add_noise.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('Additive noise model AB', ['A','B'], f.add_noise_model_AB),
                ('Additive noise model BA', ['A','B'], f.add_noise_model_BA)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('add_noise', all_features, feature_names)
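
The TODO above recurs in every example: both datasets are read fully into memory before any features are computed. One way to act on it is a chunked driver along the lines sketched below. Note that iter_pair_chunks is hypothetical - data_io, as used in these snippets, exposes no chunked reader - so this illustrates the idea rather than being a drop-in fix.

def main_sequential(overwrite=False, chunk_size=500):
    # Hypothetical chunked variant of main() above; iter_pair_chunks does not
    # exist in data_io and would have to be written. The raw series dominate
    # memory, while the extracted features are scalars that accumulate cheaply.
    features = [('Additive noise model AB', ['A','B'], f.add_noise_model_AB),
                ('Additive noise model BA', ['A','B'], f.add_noise_model_BA)]
    feature_names = [name for (name, dummy1, dummy2) in features]
    all_features = {}
    for chunk in data_io.iter_pair_chunks(chunk_size):  # hypothetical API
        all_features.update(f.apply_features(chunk, features))
    data_io.write_real_features('add_noise', all_features, feature_names)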
Example #2

def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'reasonable_features.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('A: Normalized Entropy', 'A', f.normalized_entropy),
                ('B: Normalized Entropy', 'B', f.normalized_entropy),
                ('Pearson R', ['A','B'], f.correlation),
                ('Pearson R Magnitude', 'derived', 'abs(output[key][2])'),  # Apologies for this weird 'derived' feature mechanism - it is a quick hack to avoid duplicated computation (see the sketch after this function)
                ('Entropy Difference', 'derived', 'output[key][0] - output[key][1]'),
                ('Entropy Ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Spearman rank correlation', ['A','B'], f.rcorrelation),
                ('Spearman rank magnitude', 'derived', 'abs(output[key][6])'),
                ('Kurtosis A', 'A', f.fkurtosis),
                ('Kurtosis B', 'B', f.fkurtosis),
                ('Kurtosis difference', 'derived', 'output[key][8] - output[key][9]'),
                ('Kurtosis ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Unique ratio A', 'A', f.unique_ratio),
                ('Unique ratio B', 'B', f.unique_ratio),
                ('Skew A', 'A', f.fskew),
                ('Skew B', 'B', f.fskew),
                ('Skew difference', 'derived', 'output[key][14] - output[key][15]'),
                ('Skew ratio', 'derived', 'output[key][14] / output[key][15] if not output[key][15] == 0 else output[key][14] / 0.000001'),
                ('Pearson - Spearman', 'derived', 'output[key][2] - output[key][6]'),
                ('Abs Pearson - Spearman', 'derived', 'output[key][3] - output[key][7]'),
                ('Pearson / Spearman', 'derived', 'output[key][2] / output[key][6] if not output[key][6] == 0 else output[key][2] / 0.000001')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('reasonable_features', all_features, feature_names)
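
The 'derived' entries above are expression strings rather than callables. f.apply_features itself is not shown, so the following is only an inferred sketch of its contract: callables receive the named column(s) of each pair, while 'derived' strings are evaluated with output (the values computed so far) and key in scope, which is why the definitions hard-code positional indices.

def apply_features_sketch(data, features):
    # Illustrative re-implementation, not the actual f.apply_features.
    # Assumes data maps a pair id to a row with columns 'A' and 'B'
    # (e.g. a pandas DataFrame iterated with iterrows()).
    output = {}
    for key, row in data.iterrows():
        output[key] = []
        for (name, columns, extractor) in features:
            if columns == 'derived':
                # The expression string sees output[key], so a derived
                # feature can reuse earlier values instead of recomputing.
                value = eval(extractor)
            elif isinstance(columns, list):
                value = extractor(*[row[c] for c in columns])
            else:
                value = extractor(row[columns])
            output[key].append(value)
    return output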
Example #3

def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'high_order_moments.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('Moment 5 A', 'A', f.standard_moment_5),
                ('Moment 5 B', 'B', f.standard_moment_5),
                ('Moment 5 diff', 'derived', 'output[key][0] - output[key][1]'),
                ('Moment 5 ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Moment 6 A', 'A', f.standard_moment_6),
                ('Moment 6 B', 'B', f.standard_moment_6),
                ('Moment 6 diff', 'derived', 'output[key][4] - output[key][5]'),
                ('Moment 6 ratio', 'derived', 'output[key][4] / output[key][5] if not output[key][5] == 0 else output[key][4] / 0.000001'),
                ('Moment 7 A', 'A', f.standard_moment_7),
                ('Moment 7 B', 'B', f.standard_moment_7),
                ('Moment 7 diff', 'derived', 'output[key][8] - output[key][9]'),
                ('Moment 7 ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Moment 8 A', 'A', f.standard_moment_8),
                ('Moment 8 B', 'B', f.standard_moment_8),
                ('Moment 8 diff', 'derived', 'output[key][12] - output[key][13]'),
                ('Moment 8 ratio', 'derived', 'output[key][12] / output[key][13] if not output[key][13] == 0 else output[key][12] / 0.000001'),
                ('Moment 9 A', 'A', f.standard_moment_9),
                ('Moment 9 B', 'B', f.standard_moment_9),
                ('Moment 9 diff', 'derived', 'output[key][16] - output[key][17]'),
                ('Moment 9 ratio', 'derived', 'output[key][16] / output[key][17] if not output[key][17] == 0 else output[key][16] / 0.000001')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('high_order_moments', all_features, feature_names)
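
The standard_moment_5 through standard_moment_9 extractors are not shown. Assuming they compute standardized central moments E[((x - mu)/sigma)^k], a scipy-based sketch would be:

import numpy as np
from scipy.stats import moment

def standard_moment(x, k):
    # k-th central moment scaled by sigma^k, i.e. E[((x - mu)/sigma)^k];
    # guard against constant columns, mirroring the ratio guards above.
    sigma = np.std(x)
    return moment(x, moment=k) / sigma**k if sigma > 0 else 0.0

Under that assumption, standard_moment_5 would simply be functools.partial(standard_moment, k=5), and likewise for the others.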
Example #4
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'icgi.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('ICGI entropy AB', ['A','B'], f.icgi_entropy_AB),
                ('ICGI entropy BA', ['A','B'], f.icgi_entropy_BA),
                ('ICGI entropy diff', 'derived', 'output[key][0] - output[key][1]'),
                ('ICGI slope AB', ['A','B'], f.icgi_slope_AB),
                ('ICGI slope BA', ['A','B'], f.icgi_slope_BA),
                ('ICGI slope diff', 'derived', 'output[key][3] - output[key][4]')]
                #('ICGI entropy AB PIT', ['A','B'], f.icgi_entropy_AB_PIT),
                #('ICGI entropy BA PIT', ['A','B'], f.icgi_entropy_BA_PIT),
                #('ICGI entropy diff PIT', 'derived', 'output[key][6] - output[key][7]'),
                #('ICGI slope AB PIT', ['A','B'], f.icgi_slope_AB_PIT),
                #('ICGI slope BA PIT', ['A','B'], f.icgi_slope_BA_PIT),
                #('ICGI slope diff PIT', 'derived', 'output[key][9] - output[key][10]')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('icgi', all_features, feature_names)
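
The icgi_* extractors are not shown. The names suggest IGCI (information-geometric causal inference), whose slope-based score for X -> Y is commonly the mean log absolute slope over the sample sorted by X after normalization; a sketch under that assumption:

import numpy as np

def igci_slope(x, y):
    # Mean log |dy/dx| over the sample sorted by x, with both variables
    # scaled to [0, 1]; assumes non-constant numeric arrays. Comparing
    # igci_slope(a, b) with igci_slope(b, a) yields the AB/BA asymmetry
    # that the 'ICGI slope diff' feature above captures.
    x = (x - np.min(x)) / (np.max(x) - np.min(x))
    y = (y - np.min(y)) / (np.max(y) - np.min(y))
    order = np.argsort(x)
    dx = np.diff(x[order])
    dy = np.diff(y[order])
    keep = (dx != 0) & (dy != 0)  # drop ties to avoid log(0) and 0-division
    return np.mean(np.log(np.abs(dy[keep] / dx[keep])))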
Example #5

def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'unreasonable_features.csv')):
        print('Feature file already exists - not overwriting')
        return
        
    features = [('Number of Samples', 'A', len),
                ('Max A', 'A', max),
                ('Max B', 'B', max),
                ('Min A', 'A', min),
                ('Min B', 'B', min),
                ('Mean A', 'A', f.mean),
                ('Mean B', 'B', f.mean),
                ('Median A', 'A', f.median),
                ('Median B', 'B', f.median),
                ('Sd A', 'A', f.sd),
                ('Sd B', 'B', f.sd)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('unreasonable_features', all_features, feature_names)
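
Worth noting in the list above: builtins such as len, max and min are used directly as extractors, which pins down the single-column contract - the callable simply receives the raw column values. The f.mean, f.median and f.sd helpers are therefore presumably thin numpy wrappers along these lines:

import numpy as np

def mean(x):
    return np.mean(x)

def median(x):
    return np.median(x)

def sd(x):
    return np.std(x)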
Example #6
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'corrs.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('Kendall tau', ['A','B'], f.kendall),
                ('Kendall tau p', ['A','B'], f.kendall_p),
                ('Mann Whitney', ['A','B'], f.mannwhitney),
                ('Mann Whitney p', ['A','B'], f.mannwhitney_p),
                #('Wilcoxon', ['A','B'], f.wilcoxon),
                #('Wilcoxon p', ['A','B'], f.wilcoxon_p),
                ('Kruskal', ['A','B'], f.kruskal),
                ('Kruskal p', ['A','B'], f.kruskal_p),
                ]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('corrs', all_features, feature_names)
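
The rank-test extractors are not shown either; scipy.stats provides all three tests, each returning a (statistic, p-value) pair, so plausible definitions are:

from scipy import stats

def kendall(x, y):
    return stats.kendalltau(x, y)[0]

def kendall_p(x, y):
    return stats.kendalltau(x, y)[1]

def mannwhitney(x, y):
    return stats.mannwhitneyu(x, y)[0]

def mannwhitney_p(x, y):
    return stats.mannwhitneyu(x, y)[1]

def kruskal(x, y):
    return stats.kruskal(x, y)[0]

def kruskal_p(x, y):
    return stats.kruskal(x, y)[1]

Note that defining the statistic and its p-value as separate extractors runs each test twice per pair - exactly the kind of duplicated computation that the 'derived' mechanism in Example #2 was hacked in to avoid.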
Example #7

def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'injectivity.csv')):
        print('Feature file already exists - not overwriting')
        return

    features = [('Injectivity 10', ['A','B'], f.injectivity_10),
                ('Injectivity 15', ['A','B'], f.injectivity_15),
                ('Injectivity 20', ['A','B'], f.injectivity_20),
                ('Injectivity 25', ['A','B'], f.injectivity_25),
                ('Injectivity 30', ['A','B'], f.injectivity_30),
                ('Injectivity 35', ['A','B'], f.injectivity_35),
                ('Injectivity 40', ['A','B'], f.injectivity_40)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('injectivity', all_features, feature_names)
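
All seven examples share the same skeleton: cache check, feature list, read train and validation pairs, extract, merge, write. A possible refactor collapses everything except the feature list into a single driver:

def generate_feature_file(output_name, features, overwrite=False):
    # Generic driver for the pattern repeated in every example above;
    # each main() then reduces to a feature list plus one call, e.g.
    # generate_feature_file('injectivity', features).
    csv_path = os.path.join(data_io.get_paths()["real_feature_path"],
                            output_name + '.csv')
    if (not overwrite) and os.path.exists(csv_path):
        print('Feature file already exists - not overwriting')
        return
    feature_names = [name for (name, dummy1, dummy2) in features]
    print("Reading and featurizing the training data")
    all_features = f.apply_features(data_io.read_train_pairs(), features)
    print("Reading and featurizing the validation data")
    all_features.update(f.apply_features(data_io.read_valid_pairs(), features))
    print("Writing feature file")
    data_io.write_real_features(output_name, all_features, feature_names)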