Example #1
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'add.noise.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Additive noise model AB', ['A','B'], f.add_noise_model_AB),
                ('Additive noise model BA', ['A','B'], f.add_noise_model_BA)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('add_noise', all_features, feature_names)
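# Note: a hypothetical sketch of the (name, columns, extractor) convention the
# `features` list above relies on -- not the competition's actual f.apply_features,
# just one plausible reading: each extractor is applied to the selected columns
# of every pair, and the results are collected per row key.
def apply_features_sketch(pairs, features):
    output = {}
    for key, row in pairs.iterrows():
        values = []
        for name, columns, func in features:
            cols = columns if isinstance(columns, list) else [columns]
            values.append(func(*[row[c] for c in cols]))
        output[key] = values
    return output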
Example #2
def merge_data():
    # print train_data_dir + "/train_pair*"
    train_pairs = glob.glob(train_data_dir + "/*train_pairs*")
    print zip(train_pairs, range(len(train_pairs)))  # pair each file with its index

    for i, train_pair in enumerate(train_pairs):
        dir_name = ntpath.dirname(train_pair)
        pref = ntpath.basename(train_pair).split("train_pairs")[0]
        suffix = ntpath.basename(train_pair).split("train_pairs")[-1]
        # print pref, suffix
        info = dir_name + "/" + pref + "train_publicinfo" + suffix
        target = dir_name + "/" + pref + "train_target" + suffix
        print info, pref, suffix
        X = data_io.read_train_pairs(train_pair)
        y = data_io.read_train_target(target)
        inf_data = data_io.read_train_info(info)
        X, y, inf_data = process_indices(X, y, inf_data, i)
        if "X_merged" not in locals():
            X_merged = X
            y_merged = y
            info_merged = inf_data
        else:
            print "Shape before appending", X_merged.shape, y_merged.shape, X.shape, y.shape
            X_merged = X_merged.append(X)
            y_merged = y_merged.append(y)
            info_merged = info_merged.append(inf_data)
            print "Shape thus far", X_merged.shape, y_merged.shape

    return X_merged, y_merged, info_merged
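# pandas note: DataFrame.append (used above) was deprecated in pandas 1.4 and
# removed in 2.0, and appending inside a loop copies the frame on every pass.
# A sketch of the same merge that reuses the module's helpers (glob, ntpath,
# data_io, train_data_dir, process_indices) but concatenates once at the end:
import pandas as pd

def merge_data_concat():
    xs, ys, infos = [], [], []
    for i, train_pair in enumerate(glob.glob(train_data_dir + "/*train_pairs*")):
        dir_name = ntpath.dirname(train_pair)
        base = ntpath.basename(train_pair)
        pref, suffix = base.split("train_pairs")[0], base.split("train_pairs")[-1]
        X = data_io.read_train_pairs(train_pair)
        y = data_io.read_train_target(dir_name + "/" + pref + "train_target" + suffix)
        inf = data_io.read_train_info(dir_name + "/" + pref + "train_publicinfo" + suffix)
        X, y, inf = process_indices(X, y, inf, i)
        xs.append(X)
        ys.append(y)
        infos.append(inf)
    return pd.concat(xs), pd.concat(ys), pd.concat(infos)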
Example #3
def ext():
    # Read the pairs
    print "Read Pairs"
    print "Read Train"
    train = d.read_train_pairs()
    print "Read Valid"
    valid = d.read_valid_pairs()
    print "Read Sup1"
    sup1 = d.read_sup1_train_pairs()
    print "Read Sup2"
    sup2 = d.read_sup2_train_pairs()
    print "Read Sup3"
    sup3 = d.read_sup3_train_pairs()

    # Get the feature extractor
    combined = feat.feature_extractor()

    # Extract the features
    print 'Extract the features'
    print "Extract Train"
    train_att = combined.fit_transform(train)
    print "Extract Valid"
    valid_att = combined.fit_transform(valid)
    print "Extract Sup1"
    sup1_att = combined.fit_transform(sup1)
    print "Extract Sup2"
    sup2_att = combined.fit_transform(sup2)
    print "Extract Sup3"
    sup3_att = combined.fit_transform(sup3)

    print "Join"
    total_new_att = np.vstack((train_att, valid_att, sup1_att, sup2_att, sup3_att))

    # Save extracted data
    np.save('total_new_att.npy', total_new_att)
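# Note: calling fit_transform on each split refits the extractor five times; if
# any step in it learns state (scalers, vocabularies), the five feature matrices
# end up in inconsistent spaces. Assuming feat.feature_extractor() follows the
# scikit-learn fit/transform contract, a sketch that fits once and reuses it:
combined = feat.feature_extractor()
combined.fit(train)                    # learn any stateful parameters once
train_att = combined.transform(train)  # same mapping applied to every split
valid_att = combined.transform(valid)
sup1_att = combined.transform(sup1)
sup2_att = combined.transform(sup2)
sup3_att = combined.transform(sup3)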
Example #4
def main(argv):
    n = None
    settings = None
    try:
      opts, args = getopt.getopt(argv,"ht:s:",["train=", "settings="])
    except getopt.GetoptError:
      print 'test.py -t <train number> -s <settings file>'
      sys.exit(2)
    for opt, arg in opts:
      if opt == '-h':
        print 'test.py -t <train number>'
        sys.exit()
      elif opt in ("-t", "--train"):
        n = int(arg)
      elif opt in ("-s", "--settings"):
        settings = arg
    print("Reading in the training data")
    train = data_io.read_train_pairs(settings)
    target = data_io.read_train_target(settings)

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)

    print("Saving the classifier")
    data_io.save_model(classifier, settings)
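# The same option handling is shorter (and gets --help for free) with argparse;
# a sketch equivalent to the getopt loop above:
import argparse

def parse_args(argv):
    parser = argparse.ArgumentParser(description="Train the causality model.")
    parser.add_argument("-t", "--train", type=int, help="train number")
    parser.add_argument("-s", "--settings", required=True, help="settings file")
    return parser.parse_args(argv)

# usage: args = parse_args(sys.argv[1:]); n, settings = args.train, args.settings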
Example #5
def extrair_tudo():  # Portuguese: "extract everything"
    combined = new_features1()

    print "Train"
    train = d.read_train_pairs()
    train_att = combined.fit_transform(train)
    np.save("train_att.npy", train_att)  # np.save: filename first, then array

    print "Valid"
    valid = d.read_valid_pairs()
    valid_att = combined.fit_transform(valid)
    np.save("valid_att.npy", valid_att)

    print "Sup1"
    sup1 = d.read_sup1_train_pairs()
    sup1_att = combined.fit_transform(sup1)
    np.save("sup1_att.npy", sup1_att)

    print "Sup2"
    sup2 = d.read_sup2_train_pairs()
    sup2_att = combined.fit_transform(sup2)
    np.save("sup2_att.npy", sup2_att)

    print "Sup3"
    sup3 = d.read_sup3_train_pairs()
    sup3_att = combined.fit_transform(sup3)
    np.save("sup3_att.npy", sup3_att)
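# numpy note: np.save takes the file (or filename) first and the array second;
# an ndarray is not a valid file target the other way round. A quick round trip:
import numpy as np

arr = np.arange(6).reshape(2, 3)
np.save("example.npy", arr)                  # filename first, array second
assert (np.load("example.npy") == arr).all()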
Example #6
def extract_train_features():

    start = time.time()
    features = feature_extractor()
    header = []
    for h in features.features:
        header.append(h[0])

    print("Reading in the training data")

    X = data_io.read_train_pairs()
    y = data_io.read_train_target()

    #X = X.iloc[1:7]
    #y = y.iloc[1:7]
    print("Extracting features: " + str(X.shape))

    extracted = features.fit_transform(X,
                                       y,
                                       type_map=data_io.read_train_info())

    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")

    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_train_features(X, y.Target)

    return X
Example #7
File: fe.py Project: diogo149/causality
def extract_train_features():

    start = time.time()
    features = feature_extractor()
    header = []
    for h in features.features:
        header.append(h[0])

    print("Reading in the training data")

    X = data_io.read_train_pairs()
    y = data_io.read_train_target()

    #X = X.iloc[1:7]
    #y = y.iloc[1:7]
    print("Extracting features: " + str(X.shape))

    extracted = features.fit_transform(X, y, type_map=data_io.read_train_info())

    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")

    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_train_features(X, y.Target)

    return X
Example #8
def merge_data():
    #print train_data_dir + "/train_pair*"
    train_pairs = glob.glob(train_data_dir + "/*train_pairs*")
    print zip(train_pairs, range(len(train_pairs)))  # pair each file with its index

    for i, train_pair in enumerate(train_pairs):
        dir_name = ntpath.dirname(train_pair)
        pref = ntpath.basename(train_pair).split("train_pairs")[0]
        suffix = ntpath.basename(train_pair).split("train_pairs")[-1]
        #print pref, suffix
        info = dir_name + "/" + pref + "train_publicinfo" + suffix
        target = dir_name + "/" + pref + "train_target" + suffix
        print info, pref, suffix
        X = data_io.read_train_pairs(train_pair)
        y = data_io.read_train_target(target)
        inf_data = data_io.read_train_info(info)
        X, y, inf_data = process_indices(X, y, inf_data, i)
        if 'X_merged' not in locals():
            X_merged = X
            y_merged = y
            info_merged = inf_data
        else:
            print "Shape before appending", X_merged.shape, y_merged.shape, X.shape, y.shape
            X_merged = X_merged.append(X)
            y_merged = y_merged.append(y)
            info_merged = info_merged.append(inf_data)
            print "Shape thus far", X_merged.shape, y_merged.shape

    return X_merged, y_merged, info_merged
Example #9
def getTrainingDataset(self):
    print "Reading in the training data"
    train = data_io.read_train_pairs()
    print "Reading the information about the training data"
    train2 = data_io.read_train_info()
    train["A type"] = train2["A type"]
    train["B type"] = train2["B type"]
    return train
def getDataset(self):
    if self.getTrain:
        readData = data_io.read_train_pairs()
        readData2 = data_io.read_train_info()
    else:
        readData = data_io.read_valid_pairs()
        readData2 = data_io.read_valid_info()
    readData["A type"] = readData2["A type"]
    readData["B type"] = readData2["B type"]
    return readData
def main():
    train = data_io.read_train_pairs()
    target = data_io.read_train_target()
    
    ## for row in train.iloc[0:4].iterrows():
    for ind in train.index[0:20]:
        pl.scatter( train['A'][ind],
                    train['B'][ind],
                    marker=".")
        pl.show()
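# Plotting note: pl.show() inside the loop blocks on each of the 20 figures in
# turn; a sketch that draws the same scatter plots on one subplot grid instead:
def plot_pairs(train, n=20, cols=5):
    rows = (n + cols - 1) // cols
    fig, axes = pl.subplots(rows, cols, figsize=(3 * cols, 3 * rows))
    for ax, ind in zip(axes.ravel(), train.index[:n]):
        ax.scatter(train['A'][ind], train['B'][ind], marker='.')
        ax.set_title(str(ind), fontsize=8)
    fig.tight_layout()
    pl.show()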
Example #14
def main():
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    target = data_io.read_train_target()

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)

    print("Saving the classifier")
    data_io.save_model(classifier)
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'reasonable_features.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('A: Normalized Entropy', 'A', f.normalized_entropy),
                ('B: Normalized Entropy', 'B', f.normalized_entropy),
                ('Pearson R', ['A','B'], f.correlation),
                ('Pearson R Magnitude', 'derived', 'abs(output[key][2])'),  # Apologies for this weird feature definition mechanism - it is a quick hack to prevent duplicated computation
                ('Entropy Difference', 'derived', 'output[key][0] - output[key][1]'),
                ('Entropy Ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Spearman rank correlation', ['A','B'], f.rcorrelation),
                ('Spearman rank magnitude', 'derived', 'abs(output[key][6])'),
                ('Kurtosis A', 'A', f.fkurtosis),
                ('Kurtosis B', 'B', f.fkurtosis),
                ('Kurtosis difference', 'derived', 'output[key][8] - output[key][9]'),
                ('Kurtosis ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Unique ratio A', 'A', f.unique_ratio),
                ('Unique ratio B', 'B', f.unique_ratio),
                ('Skew A', 'A', f.fskew),
                ('Skew B', 'B', f.fskew),
                ('Skew difference', 'derived', 'output[key][14] - output[key][15]'),
                ('Skew ratio', 'derived', 'output[key][14] / output[key][15] if not output[key][15] == 0 else output[key][14] / 0.000001'),
                ('Pearson - Spearman', 'derived', 'output[key][2] - output[key][6]'),
                ('Abs Pearson - Spearman', 'derived', 'output[key][3] - output[key][7]'),
                ('Pearson / Spearman', 'derived', 'output[key][2] / output[key][6] if not output[key][6] == 0 else output[key][2] / 0.000001')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('reasonable_features', all_features, feature_names)
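# How the 'derived' strings above are consumed is not shown in this snippet;
# presumably they are eval'd against the per-pair results accumulated so far, so
# 'abs(output[key][2])' reads the stored Pearson R back out instead of
# recomputing it. A hypothetical sketch (not the competition code):
def eval_derived(expression, output, key):
    # output[key] holds the earlier feature values for this pair, in list order
    return eval(expression, {"abs": abs}, {"output": output, "key": key})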
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'high_order_moments.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Moment 5 A', 'A', f.standard_moment_5),
                ('Moment 5 B', 'B', f.standard_moment_5),
                ('Moment 5 diff', 'derived', 'output[key][0] - output[key][1]'),
                ('Moment 5 ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Moment 6 A', 'A', f.standard_moment_6),
                ('Moment 6 B', 'B', f.standard_moment_6),
                ('Moment 6 diff', 'derived', 'output[key][4] - output[key][5]'),
                ('Moment 6 ratio', 'derived', 'output[key][4] / output[key][5] if not output[key][5] == 0 else output[key][4] / 0.000001'),
                ('Moment 7 A', 'A', f.standard_moment_7),
                ('Moment 7 B', 'B', f.standard_moment_7),
                ('Moment 7 diff', 'derived', 'output[key][8] - output[key][9]'),
                ('Moment 7 ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Moment 8 A', 'A', f.standard_moment_8),
                ('Moment 8 B', 'B', f.standard_moment_8),
                ('Moment 8 diff', 'derived', 'output[key][12] - output[key][13]'),
                ('Moment 8 ratio', 'derived', 'output[key][12] / output[key][13] if not output[key][13] == 0 else output[key][12] / 0.000001'),
                ('Moment 9 A', 'A', f.standard_moment_9),
                ('Moment 9 B', 'B', f.standard_moment_9),
                ('Moment 9 diff', 'derived', 'output[key][16] - output[key][17]'),
                ('Moment 9 ratio', 'derived', 'output[key][16] / output[key][17] if not output[key][17] == 0 else output[key][16] / 0.000001')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('high_order_moments', all_features, feature_names)
Example #17
def useEMMeans():
  data = data_io.read_train_pairs()
  output = data_io.read_train_target()
  y = np.array(output)
  n_components=5
  covariance_type='full' #alternatives 'diag','spherical'
  num_datas = len(data)
  #means = np.zeros((num_datas,n_components,2))
  #for i in range(num_datas):
  #  X = np.array([data.A[i],data.B[i]]).T
  #  g = GMM(n_components=n_components)
  #  g.fit(X)
  #  means[i,:,:] = g.means_

  means = np.load('gmm_means.npy')
  X = means[:,:,1]
  y = y[:,0]
  from sklearn.linear_model import LinearRegression, Perceptron
  from sklearn.tree import DecisionTreeClassifier
  from sklearn.neighbors import KNeighborsClassifier

  k = 20
  n = range(0,len(X)/2+k)
  npp = range(len(X)/2+k,len(X))

  def dostuff(csfr,X,y,n,npp):
    csfr.fit(X[n,:],y[n])
    yhat = csfr.predict(X[npp,:])
    print 1.0*sum(yhat==y[npp])/len(yhat)

  linreg = LinearRegression(normalize=True)
  dostuff(linreg,X,y,n,npp)

  p = Perceptron()
  dostuff(p,X,y,n,npp)

  dt = DecisionTreeClassifier()
  dostuff(dt,X,y,n,npp)

  knn = KNeighborsClassifier(n_neighbors=2)
  dostuff(knn,X,y,n,npp)

  r = np.random.randint(-1,3,len(y[npp]))
  r[r==2] = 0
  print 1.0*sum(r==y[npp])/len(r)
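# The manual split above takes the first half of the rows (plus k) as training
# data, so any ordering in the dataset leaks into the evaluation. A sketch of
# the same comparison with a shuffled split via scikit-learn:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                    random_state=0)
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
print(knn.score(X_test, y_test))  # mean accuracy on the held-out half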
Example #18
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()
    print train

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)
    
    features = [x[0] for x in classifier.steps[0][1].features ]

    csv_fea = csv.writer(open('features.csv','wb'))
    imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    for fea in imp:
        print fea[0], fea[1]
        csv_fea.writerow([fea[0],fea[1]])

    
    oob_score = classifier.steps[1][1].oob_score_  # a scalar for sklearn ensembles
    print "oob score:", oob_score
    logger = open("run_log.txt", "a")
    logger.write("\n" + str(oob_score) + "\n")
    logger.close()

    print("Saving the classifier")
    data_io.save_model(classifier)

    print("Predicting the train set")
    train_predict = classifier.predict(train).flatten()
    data_io.write_submission(train_predict, 'train_set', run='train')

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
Example #19
def main():

    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    info = data_io.read_train_info()

    X, y, info = exploit_symmetries(X, y, info)
    print X.shape, y.shape
    print "-1", len(y[y['Target'] == -1])
    print "0", len(y[y['Target'] == 0])
    print "1", len(y[y['Target'] == 1])

    # X = X.iloc[:10]
    # y = y.iloc[:10]
    # info = info.iloc[:10]

    data_io.save_train_data(X, "./Competition/CEfinal_train_pairs-sym.csv")
    data_io.save(y, "./Competition/CEfinal_train_target-sym.csv")
    data_io.save(info, "./Competition/CEfinal_train_publicinfo-sym.csv")
    print "finished"
Example #20
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'icgi.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('ICGI entropy AB', ['A','B'], f.icgi_entropy_AB),
                ('ICGI entropy BA', ['A','B'], f.icgi_entropy_BA),
                ('ICGI entropy diff', 'derived', 'output[key][0] - output[key][1]'),
                ('ICGI slope AB', ['A','B'], f.icgi_slope_AB),
                ('ICGI slope BA', ['A','B'], f.icgi_slope_BA),
                ('ICGI slope diff', 'derived', 'output[key][3] - output[key][4]')]#,
                #('ICGI entropy AB PIT', ['A','B'], f.icgi_entropy_AB_PIT),
                #('ICGI entropy BA PIT', ['A','B'], f.icgi_entropy_BA_PIT),
                #('ICGI entropy diff PIT', 'derived', 'output[key][6] - output[key][7]'),
                #('ICGI slope AB PIT', ['A','B'], f.icgi_slope_AB_PIT),
                #('ICGI slope BA PIT', ['A','B'], f.icgi_slope_BA_PIT),
                #('ICGI slope diff PIT', 'derived', 'output[key][9] - output[key][10]')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('icgi', all_features, feature_names)
Example #21
def main():

    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    info = data_io.read_train_info()


    X,y, info = exploit_symmetries(X,y, info)
    print X.shape, y.shape
    print "-1", len(y[y['Target']==-1])
    print "0", len(y[y['Target']==0])
    print "1", len(y[y['Target']==1])

    # X = X.iloc[:10]
    # y = y.iloc[:10]
    # info = info.iloc[:10]

    data_io.save_train_data(X, "./Competition/CEfinal_train_pairs-sym.csv")
    data_io.save(y, "./Competition/CEfinal_train_target-sym.csv")
    data_io.save(info, "./Competition/CEfinal_train_publicinfo-sym.csv")
    print "finished"
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'unreasonable_features.csv')):
        print 'Feature file already exists - not overwriting'
        return
        
    features = [('Number of Samples', 'A', len),
                ('Max A', 'A', max),
                ('Max B', 'B', max),
                ('Min A', 'A', min),
                ('Min B', 'B', min),
                ('Mean A', 'A', f.mean),
                ('Mean B', 'B', f.mean),
                ('Median A', 'A', f.median),
                ('Median B', 'B', f.median),
                ('Sd A', 'A', f.sd),
                ('Sd B', 'B', f.sd)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('unreasonable_features', all_features, feature_names)
Example #23
def main():

    numRows = 10
    targetVal = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], "n:t:h")
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(2)

    clamp = lambda n, minn, maxn: max(min(maxn, n), minn)
    for o, a in opts:
        if o == "-n":
            numRows = clamp(int(a), 1, 4050)
        elif o == "-t":
            targetVal = int(a)
        else:
            print "try help: python train.py -h"
            sys.exit(2)

    print "Reading " + str(numRows) + " rows in the training data"
    train = data_io.read_train_pairs(numRows)
    target = data_io.read_train_target(numRows)

    train.A = train.A.div(train.A.apply(max) - train.A.apply(min))
    train.B = train.B.div(train.B.apply(max) - train.B.apply(min))

    train = train.convert_objects(convert_numeric=True)
    # modern pandas equivalent: train = train.apply(pd.to_numeric, errors='coerce')

    for i in range(1, numRows):
        if target.Target[i] == targetVal:
            A = train.iloc[i, :].A
            B = train.iloc[i, :].B

            plt.figure(i)
            plt.plot(range(len(A)), A)
            plt.plot(range(len(B)), B)
            plt.savefig('plots/' + str(targetVal) + '_' + str(i) + '.png')
def main():
    extractor = feature_extractor()
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = extractor.fit_transform(train[:])
    
    print("Reading in the ensemble training data")
    ensemble_train = data_io.read_ensemble_train_pairs()

    print("Extracting features from ensemble training data")
    ensemble_train_features = extractor.fit_transform(ensemble_train[:])
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = extractor.fit_transform(valid[:])
    
    all_features = np.concatenate((train_features, ensemble_train_features, valid_features))
    
    print("Concatenating names")
    train_names = list(train.index)
    ensemble_train_names = list(ensemble_train.index)
    valid_names = list(valid.index)
    all_names = train_names + ensemble_train_names + valid_names
    
    print("Writing feature file")
    feature_names = ['Number of Samples',
                     'A: Number of Unique Samples',
                     'B: Number of Unique Samples',
                     'A: Normalized Entropy',
                     'B: Normalized Entropy',
                     'Pearson R',
                     'Pearson R Magnitude',
                     'Entropy Difference']
    data_io.write_real_features('benchmark_features', all_names, all_features, feature_names)
Example #25
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'corrs.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Kendall tau', ['A','B'], f.kendall),
                ('Kendall tau p', ['A','B'], f.kendall_p),
                ('Mann Whitney', ['A','B'], f.mannwhitney),
                ('Mann Whitney p', ['A','B'], f.mannwhitney_p),
                #('Wilcoxon', ['A','B'], f.wilcoxon),
                #('Wilcoxon p', ['A','B'], f.wilcoxon_p),
                ('Kruskal', ['A','B'], f.kruskal),
                ('Kruskal p', ['A','B'], f.kruskal_p),
                ]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('corrs', all_features, feature_names)
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'injectivity.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Injectivity 10', ['A','B'], f.injectivity_10),
                ('Injectivity 15', ['A','B'], f.injectivity_15),
                ('Injectivity 20', ['A','B'], f.injectivity_20),
                ('Injectivity 25', ['A','B'], f.injectivity_25),
                ('Injectivity 30', ['A','B'], f.injectivity_30),
                ('Injectivity 35', ['A','B'], f.injectivity_35),
                ('Injectivity 40', ['A','B'], f.injectivity_40)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('injectivity', all_features, feature_names)
Example #27
    proba_nz = proba[np.nonzero(proba)]
    probb_nz = probb[np.nonzero(probb)]

    jointp = np.outer(proba_nz, probb_nz)
    hpos = np.sum(np.log(jointp) * jointp)
    return -hpos
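# The fragment above is the tail of a function whose header was lost in
# extraction; on its own it computes the entropy of the independent joint
# distribution p(a)p(b), skipping zero bins. A self-contained reconstruction
# (the name is a guess):
import numpy as np

def joint_entropy_from_marginals(proba, probb):
    proba_nz = proba[np.nonzero(proba)]      # drop zero-probability bins
    probb_nz = probb[np.nonzero(probb)]
    jointp = np.outer(proba_nz, probb_nz)    # independent joint p(a)p(b)
    return -np.sum(np.log(jointp) * jointp)  # entropy in nats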


if __name__ == "__main__":

    print "Reading in {} data...".format(DATA)

    if DATA == "train":
        info = data_io.read_train_info()
        train = data_io.read_train_pairs()
    elif DATA == "valid":
        info = data_io.read_valid_info()
        train = data_io.read_valid_pairs()
    else:
        raise ValueError

    print "Saving coded info matrix..."
    codes = np.zeros(info.values.shape)
    lookup = {"Numerical": 1, "Categorical": 2, "Binary": 3}
    for i, t in enumerate(info.values):
        a, b = t
        codes[i, :] = [lookup[a], lookup[b]]

    savemat("matlab/{}info.mat".format(DATA), {"codes": codes}, oned_as="column")
Example #28
def get_pipeline():
    features = feature_extractor()
    steps = [("extract_features", features),
             ("classify",
              RandomForestRegressor(n_estimators=50,
                                    verbose=2,
                                    n_jobs=2,
                                    min_samples_split=10,
                                    random_state=1,
                                    compute_importances=True))]
    return Pipeline(steps)


if __name__ == "__main__":
    print("Reading in the training data")
    train_raw = data_io.read_train_pairs()
    target = data_io.read_train_target()
    info = data_io.read_train_info()
    info['iindex'] = range(4050)

    train = train_raw.join(info)

    classifier = get_pipeline()

    ### FOLDS CODE
    #    folds = cval.KFold(len(train), n_folds=2, indices=False)
    #
    #    results = []
    #    for i, fold in enumerate(folds):
    #        print("Extracting features and training model for fold " + str(i))
    #        traincv, testcv = fold
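# The commented FOLDS CODE above trails off before it evaluates anything; a
# sketch of how the loop could be finished with scikit-learn's helpers, assuming
# the pipeline scores with its default score method (modern module paths):
from sklearn.model_selection import KFold, cross_val_score

scores = cross_val_score(get_pipeline(), train, target.Target,
                         cv=KFold(n_splits=2))
print("fold scores:", scores)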
Example #29
    return combined

def get_pipeline():
    features = feature_extractor()
    steps = [("extract_features", features),
             ("classify", RandomForestRegressor(n_estimators=50, 
                                                verbose=2,
                                                n_jobs=2,
                                                min_samples_split=10,
                                                random_state=1,
                                                compute_importances=True))]
    return Pipeline(steps)

if __name__=="__main__":
    print("Reading in the training data")
    train_raw = data_io.read_train_pairs()
    target = data_io.read_train_target()
    info = data_io.read_train_info()
    info['iindex'] = range(4050)

    train = train_raw.join(info)

    classifier = get_pipeline()

### FOLDS CODE
#    folds = cval.KFold(len(train), n_folds=2, indices=False)
#   
#    results = []
#    for i, fold in enumerate(folds):
#        print("Extracting features and training model for fold " + str(i))
#        traincv, testcv = fold
Example #30
def main():
    global cf

    start = time.clock()

    numRows = None
    cv = False
    nfold = 10
    clf_keys = ["rfg"]

    try:
        opts, args = getopt.getopt(sys.argv[1:], "n:c:m:h")
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(2)

    clamp = lambda n, minn, maxn: max(min(maxn, n), minn)
    for o, a in opts:
        if o == "-n":
            numRows = clamp(int(a), 1, 4050)
        elif o == "-c":
            cv = True
            nfold = int(a)
        elif o == "-m":
            if a == "all":
                clf_keys = []
                for clf_key in cf.get_all_keys():
                    clf_keys.append(clf_key)
            elif cf.is_valid_key(a):
                clf_keys = [a]
            else:
                print "ERROR: wrong classifier name: " + a
        elif o == "-h":
            print 'options:'
            print "\t -n [number of rows]"
            print "\t -c [number of folds]"
            print "\t -m [classifier key | all]"
            sys.exit(0)
        else:
            print "try help: python train.py -h"
            sys.exit(2)

    print("Reading in the training data")
    train = data_io.read_train_pairs(numRows)
    trainInfo = data_io.read_train_info(numRows)
    train['A type'] = trainInfo['A type']
    train['B type'] = trainInfo['B type']
    target = data_io.read_train_target(numRows)

    if cv:
        data = {}
        data['train'] = train
        data['target'] = target

        for clf_key in clf_keys:
            print "Initiating " + str(nfold) + " fold cross validation with classifier " + cf.get_classifier_name(clf_key)
            crossvalidate(data, nfold, clf_key)
    else:
        for clf_key in clf_keys:
            start_train = time.clock()
            print("Extracting features and training model")
            classifier = get_pipeline(clf_key)
            classifier.fit(train, target.Target)

            print("Saving the classifier")
            data_io.save_model(classifier, clf_key)
            end_train = time.clock()
            print 'time taken:', end_train - start_train, 'seconds'

    end = time.clock()

    print 'Execution time:', round(end - start, 2), 'seconds'
Example #31
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()

    print "Reading SUP data..."
    for i in range(1,4):
      print "SUP", str(i)
      sup = data_io.read_sup_pairs(i)
      sup_info = data_io.read_sup_info(i)
      sup = combine_types(sup, sup_info)
      sup = get_types(sup)
      sup_target = data_io.read_sup_target(i)
      train = train.append(sup)
      target = target.append(sup_target)

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    (feature_trans, classifier) = get_pipeline()
    orig_train = feature_trans.fit_transform(train)
    orig_train = numpy.nan_to_num(orig_train) 

    print("Train-test split")
    trainX, testX, trainY, testY = train_test_split(orig_train, target.Target, random_state = 1)
    print "TrainX size = ", str(trainX.shape)
    print "TestX size = ", str(testX.shape)

    print("Saving features")
    data_io.save_features(orig_train)

    classifier.fit(trainX, trainY)
    print("Saving the classifier")
    data_io.save_model(classifier)
 
    testX = numpy.nan_to_num(testX)
    print "Score on held-out test data ->", classifier.score(testX, testY)
    
    #features = [x[0] for x in classifier.steps[0][1].features ]

    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])


    feature_importance = classifier.feature_importances_
    logger = open("feature_importance.csv", "a")
    for fi in feature_importance:
      logger.write(str(fi))
      logger.write("\n")
    logger.close()

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()

    print "Reading SUP data..."
    for i in range(1,4):
      print "SUP", str(i)
      sup = data_io.read_sup_pairs(i)
      sup_info = data_io.read_sup_info(i)
      sup = combine_types(sup, sup_info)
      sup = get_types(sup)
      sup_target = data_io.read_sup_target(i)
      train_info = train_info.append(sup_info)
      train = train.append(sup)
      target = target.append(sup_target)

    # Old train
    print "Reading old train data..."
    old_train = data_io.read_old_train_pairs()
    old_train_info = data_io.read_old_train_info()
    old_train = combine_types(old_train, old_train_info)
    old_train = get_types(old_train)
    old_target = data_io.read_old_train_target()

    train = train.append(old_train)
    target = target.append(old_target)
    # End old train

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    feature_trans = fe.feature_extractor()
    orig_train = feature_trans.fit_transform(train)
    orig_train = numpy.nan_to_num(orig_train) 

    classifier = classify_catagory(orig_train, target.Target)
    #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info)

    print("Saving features")
    data_io.save_features(orig_train)

    print("Saving the classifier")
    #data_io.save_model( (both_classifier, A_classifier, B_classifier, none_classifier) )
    data_io.save_model(classifier) 
 
    #features = [x[0] for x in classifier.steps[0][1].features ]

    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])


    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()

    print "Reading SUP data..."
    for i in range(1, 4):
        print "SUP", str(i)
        sup = data_io.read_sup_pairs(i)
        sup_info = data_io.read_sup_info(i)
        sup = combine_types(sup, sup_info)
        sup = get_types(sup)
        sup_target = data_io.read_sup_target(i)
        train_info = train_info.append(sup_info)
        train = train.append(sup)
        target = target.append(sup_target)

    # Old train
    print "Reading old train data..."
    old_train = data_io.read_old_train_pairs()
    old_train_info = data_io.read_old_train_info()
    old_train = combine_types(old_train, old_train_info)
    old_train = get_types(old_train)
    old_target = data_io.read_old_train_target()

    train = train.append(old_train)
    target = target.append(old_target)
    # End old train

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    feature_trans = fe.feature_extractor()
    orig_train = feature_trans.fit_transform(train)
    orig_train = numpy.nan_to_num(orig_train)

    classifier = classify_catagory(orig_train, target.Target)
    #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info)

    print("Saving features")
    data_io.save_features(orig_train)

    print("Saving the classifier")
    #data_io.save_model( (both_classifier, A_classifier, B_classifier, none_classifier) )
    data_io.save_model(classifier)

    #features = [x[0] for x in classifier.steps[0][1].features ]

    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff / 60, 1)
Example #34
    proba_nz = proba[np.nonzero(proba)]
    probb_nz = probb[np.nonzero(probb)]

    jointp = np.outer(proba_nz, probb_nz)
    hpos = np.sum(np.log(jointp) * jointp)
    return -hpos


if __name__ == '__main__':

    print 'Reading in {} data...'.format(DATA)

    if DATA == 'train':
        info = data_io.read_train_info()
        train = data_io.read_train_pairs()
    elif DATA == 'valid':
        info = data_io.read_valid_info()
        train = data_io.read_valid_pairs()
    else:
        raise ValueError

    print 'Saving coded info matrix...'
    codes = np.zeros(info.values.shape)
    lookup = {'Numerical': 1, 'Categorical': 2, 'Binary': 3}
    for i, t in enumerate(info.values):
        a, b = t
        codes[i, :] = [lookup[a], lookup[b]]

    savemat('matlab/{}info.mat'.format(DATA), {'codes': codes},
            oned_as='column')