Пример #1
0
def main(argv):
    n=None
    try:
      opts, args = getopt.getopt(argv,"ht:s:",["train=", "settings="])
    except getopt.GetoptError:
      print 'test.py -t <train number> -s <settings file>'
      sys.exit(2)
    for opt, arg in opts:
      if opt == '-h':
        print 'test.py -t <train number>'
        sys.exit()
      elif opt in ("-t", "--train"):
        n = int(arg)
      elif opt in ("-s", "--settings"):
        settings = arg
    print("Reading in the training data")
    train = data_io.read_train_pairs(settings)
    target = data_io.read_train_target(settings)

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)

    print("Saving the classifier")
    data_io.save_model(classifier, settings)
Пример #2
0
def extract_train_features():
    """Extract features from the training pairs and persist them.

    Returns the feature matrix as a DataFrame indexed like the raw
    training pairs, with one named column per extracted feature.
    """
    t0 = time.time()
    extractor = feature_extractor()
    # Feature names are the first element of each feature spec tuple.
    column_names = [spec[0] for spec in extractor.features]

    print("Reading in the training data")
    X = data_io.read_train_pairs()
    y = data_io.read_train_target()

    print("Extracting features: " + str(X.shape))
    extracted = extractor.fit_transform(X, y,
                                        type_map=data_io.read_train_info())

    elapsed = float(time.time() - t0)
    print("Features extracted in " + str(elapsed/60.0) + " Minutes")

    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = column_names
    data_io.save_train_features(X, y.Target)

    return X
Пример #3
0
def merge_data():
    # print train_data_dir + "/train_pair*"
    train_pairs = glob.glob(train_data_dir + "/*train_pairs*")
    print list(zip(train_pairs, list(xrange(0, 4))))

    for i, train_pair in enumerate(train_pairs):
        dir_name = ntpath.dirname(train_pair)
        pref = ntpath.basename(train_pair).split("train_pairs")[0]
        suffix = ntpath.basename(train_pair).split("train_pairs")[-1]
        # print pref, suffix
        info = dir_name + "/" + pref + "train_publicinfo" + suffix
        target = dir_name + "/" + pref + "train_target" + suffix
        print info, pref, suffix
        X = data_io.read_train_pairs(train_pair)
        y = data_io.read_train_target(target)
        inf_data = data_io.read_train_info(info)
        X, y, inf_data = process_indices(X, y, inf_data, i)
        if "X_merged" not in locals():
            X_merged = X
            y_merged = y
            info_merged = inf_data
        else:
            print "Shape before appending", X_merged.shape, y_merged.shape, X.shape, y.shape
            X_merged = X_merged.append(X)
            y_merged = y_merged.append(y)
            info_merged = info_merged.append(inf_data)
            print "Shape thus far", X_merged.shape, y_merged.shape

    return X_merged, y_merged, info_merged
Пример #4
0
def merge_data():
    """Read every train_pairs chunk under train_data_dir and merge them.

    For each ``*train_pairs*`` file the sibling publicinfo and target
    filenames are derived from its name, each chunk is reindexed via
    process_indices, and everything is appended together.

    Returns (X_merged, y_merged, info_merged). NOTE(review): raises
    NameError if the glob matches nothing (accumulators never created).
    """
    #print train_data_dir + "/train_pair*"
    train_pairs = glob.glob(train_data_dir + "/*train_pairs*")
    print list(zip(train_pairs, list(xrange(0, 4))))

    for i, train_pair in enumerate(train_pairs):
        dir_name = ntpath.dirname(train_pair)
        # Filename pattern: <pref>train_pairs<suffix>; reuse both halves
        # to locate the matching publicinfo and target files.
        pref = ntpath.basename(train_pair).split("train_pairs")[0]
        suffix = ntpath.basename(train_pair).split("train_pairs")[-1]
        #print pref, suffix
        info = dir_name + "/" + pref + "train_publicinfo" + suffix
        target = dir_name + "/" + pref + "train_target" + suffix
        print info, pref, suffix
        X = data_io.read_train_pairs(train_pair)
        y = data_io.read_train_target(target)
        inf_data = data_io.read_train_info(info)
        X, y, inf_data = process_indices(X, y, inf_data, i)
        # First chunk seeds the accumulators; later chunks are appended.
        if 'X_merged' not in locals():
            X_merged = X
            y_merged = y
            info_merged = inf_data
        else:
            print "Shape before appending", X_merged.shape, y_merged.shape, X.shape, y.shape
            X_merged = X_merged.append(X)
            y_merged = y_merged.append(y)
            info_merged = info_merged.append(inf_data)
            print "Shape thus far", X_merged.shape, y_merged.shape

    return X_merged, y_merged, info_merged
Пример #5
0
def extract_train_features():
    """Extract features from the training pairs and persist them.

    Returns the feature matrix as a DataFrame indexed like the raw
    training pairs, with one named column per extracted feature.
    """
    start = time.time()
    features = feature_extractor()
    # Feature names are the first element of each feature spec tuple.
    header = []
    for h in features.features:
        header.append(h[0])

    print("Reading in the training data")

    X = data_io.read_train_pairs()
    y = data_io.read_train_target()

    #X = X.iloc[1:7]
    #y = y.iloc[1:7]
    print("Extracting features: " + str(X.shape))

    extracted = features.fit_transform(X,
                                       y,
                                       type_map=data_io.read_train_info())

    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")

    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_train_features(X, y.Target)

    return X
Пример #6
0
def load_train_set():
    """Load the MATLAB-extracted training features and their targets.

    Exits the process when no feature file is found; NaNs in the
    feature matrix are replaced with zeros before returning (X, y).
    """
    features = data_io.load_matlab_train_features()
    if features is None:
        print("No feature file found!")
        exit(1)
    targets = data_io.read_train_target()
    return features.fillna(0), targets
Пример #7
0
def getdata():
    """Load the flattened feature matrix and the stacked target vector.

    Reads float64 values from 'extracted/total.np' and reshapes them
    into rows of 24 features; targets from the main and the three SUP
    training sets are concatenated in the same row order.
    """
    total = np.fromfile('extracted/total.np')
    # Floor division keeps the row count an int on both Python 2 and 3;
    # the original `/` yields a float and breaks reshape on Python 3.
    total = total.reshape(total.shape[0] // 24, 24)
    # (also fixes the 'traget' typo in the local names)
    train_target = data_io.read_train_target()
    sup1_target = data_io.read_sup1_train_target()
    sup2_target = data_io.read_sup2_train_target()
    sup3_target = data_io.read_sup3_train_target()
    total_target = np.hstack((train_target.Target, sup1_target.Target,
                              sup2_target.Target, sup3_target.Target))
    return total, total_target
def main():
    """Scatter-plot A vs. B for each of the first 20 training pairs."""
    pairs = data_io.read_train_pairs()
    target = data_io.read_train_target()

    # One figure per sample; pl.show() blocks until the window closes.
    for idx in pairs.index[:20]:
        pl.scatter(pairs['A'][idx], pairs['B'][idx], marker=".")
        pl.show()
Пример #9
0
def main():
    """Fit the feature-extraction pipeline on the training pairs and save it."""
    print("Reading in the training data")
    pairs = data_io.read_train_pairs()
    labels = data_io.read_train_target()

    print("Extracting features and training model")
    model = get_pipeline()
    model.fit(pairs, labels.Target)

    print("Saving the classifier")
    data_io.save_model(model)
Пример #10
0
def main():
    """Print the top 20 most important features of the saved feature set."""
    X = data_io.load_train_features()
    # `X is None` replaces the roundabout `type(X) == type(None)` check.
    if X is None:
        print("No feature file found!")
        exit(1)
    y = data_io.read_train_target()

    get_top_features(X, y.Target, 20)
Пример #11
0
def main():
    """Print the top 20 most important features of the saved feature set."""
    X = data_io.load_train_features()
    # NOTE(review): `X is None` would be the idiomatic form of this check.
    if(type(X) == type(None)):
        print("No feature file found!")
        exit(1)
    y = data_io.read_train_target()


    #min_max_scaler = preprocessing.MinMaxScaler()


    #X = min_max_scaler.fit_transform(X)
    get_top_features(X,y.Target,20)
Пример #12
0
def grid_search():

    X = data_io.load_train_features()
    if (type(X) == type(None)):
        print("No feature file found!")
        exit(1)
    y = data_io.read_train_target()

    tree_depth = [5, 7, 9, 10, 12, 14]
    learning_rate = [0.01, 0.05, 0.1, 0.2]
    scorer = Scorer(X, y)
    for d in tree_depth:
        for l in learning_rate:
            r = scorer.score([d, l])
            print "Score", d, l, r
Пример #13
0
def grid_search():
    """Score every (tree depth, learning rate) combination on the
    saved training features and print each result."""
    X = data_io.load_train_features()
    # NOTE(review): `X is None` would be the idiomatic form of this check.
    if(type(X) == type(None)):
        print("No feature file found!")
        exit(1)
    y = data_io.read_train_target()    
    
    tree_depth = [5, 7, 9 , 10, 12, 14]
    learning_rate = [0.01, 0.05, 0.1, 0.2]
    scorer = Scorer(X,y)
    for d in tree_depth:
        for l in learning_rate:
            r = scorer.score([d,l])
            print "Score", d,l,r
Пример #14
0
def useEMMeans():
  """Classify causal direction from precomputed GMM component means.

  Loads per-pair GMM means from 'gmm_means.npy' (the commented block
  shows how they were originally fitted), splits the rows into a
  train/test partition, and prints the test accuracy of several
  classifiers plus a random baseline.
  """
  data = data_io.read_train_pairs()
  output = data_io.read_train_target()
  y = np.array(output)
  n_components=5
  covariance_type='full' #alternatives 'diag','spherical'
  num_datas = len(data)
  #means = np.zeros((num_datas,n_components,2))
  #for i in range(num_datas):
  #  X = np.array([data.A[i],data.B[i]]).T
  #  g = GMM(n_components=n_components)
  #  g.fit(X)
  #  means[i,:,:] = g.means_

  # Precomputed means; only the second coordinate of each component is
  # used as the feature vector.
  means = np.load('gmm_means.npy')
  X = means[:,:,1]
  y = y[:,0]
  from sklearn.linear_model import LinearRegression, Perceptron
  from sklearn.tree import DecisionTreeClassifier
  from sklearn.neighbors import KNeighborsClassifier

  # Train on the first half plus k rows; test on the rest.
  # NOTE(review): len(X)/2 relies on Python 2 integer division.
  k = 20
  n = range(0,len(X)/2+k)
  npp = range(len(X)/2+k,len(X))

  # Fit csfr on rows n, predict rows npp, print test accuracy.
  def dostuff(csfr,X,y,n,npp):
    csfr.fit(X[n,:],y[n])
    yhat = csfr.predict(X[npp,:])
    print 1.0*sum(yhat==y[npp])/len(yhat)

  linreg = LinearRegression(normalize=True)
  dostuff(linreg,X,y,n,npp)

  p = Perceptron()
  dostuff(p,X,y,n,npp)

  dt = DecisionTreeClassifier()
  dostuff(dt,X,y,n,npp)

  knn = KNeighborsClassifier(n_neighbors=2)
  dostuff(knn,X,y,n,npp)

  # Random baseline: draw from {-1, 0, 1} (2 is remapped to 0 so that
  # 0 is drawn with double weight).
  r = np.random.randint(-1,3,len(y[npp]))
  r[r==2] = 0
  print 1.0*sum(r==y[npp])/len(r)
Пример #15
0
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()
    print train

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)
    
    features = [x[0] for x in classifier.steps[0][1].features ]

    csv_fea = csv.writer(open('features.csv','wb'))
    imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    for fea in imp:
        print fea[0], fea[1]
        csv_fea.writerow([fea[0],fea[1]])

    
    oob_score =  classifier.steps[1][1].oob_score_
    print "oob score:", oob_score
    logger = open("run_log.txt","a")
    if len(oob_score) == 1: logger.write("\n" +str( oob_score) + "\n")
    else:logger.write("\n" + str(oob_score[0]) + "\n")

    print("Saving the classifier")
    data_io.save_model(classifier)
   
    print("Predicting the train set")
    train_predict = classifier.predict(train)
    trian_predict = train_predict.flatten()
    data_io.write_submission(train_predict, 'train_set', run = 'train')

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
Пример #16
0
def main():

    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    info = data_io.read_train_info()

    X, y, info = exploit_symmetries(X, y, info)
    print X.shape, y.shape
    print "-1", len(y[y['Target'] == -1])
    print "0", len(y[y['Target'] == 0])
    print "1", len(y[y['Target'] == 1])

    # X = X.iloc[:10]
    # y = y.iloc[:10]
    # info = info.iloc[:10]

    data_io.save_train_data(X, "./Competition/CEfinal_train_pairs-sym.csv")
    data_io.save(y, "./Competition/CEfinal_train_target-sym.csv")
    data_io.save(info, "./Competition/CEfinal_train_publicinfo-sym.csv")
    print "finished"
Пример #17
0
def main():

    y = data_io.read_train_target()
    X = data_io.load_train_features()
    if(type(X) == type(None)):
        print("No feature file found!")
        exit(1)
    
    X_old = data_io.load_features("./Models/old_csv/features_train_en_python.csv")
    print X.shape
    X = X_old.join(X)
    print X.shape
    #print X
    data_io.save_train_features(X,y)
    
    X = data_io.load_valid_features()
    X_old = data_io.load_features("./Models/old_csv/features_valid_en_python.csv")
    print X.shape
    X = X_old.join(X)
    print X.shape
    data_io.save_valid_features(X)
Пример #18
0
def main():
    """Symmetrize the training set and save the augmented files."""
    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    info = data_io.read_train_info()


    X,y, info = exploit_symmetries(X,y, info)
    print X.shape, y.shape
    # Class balance after symmetrization.
    print "-1", len(y[y['Target']==-1])
    print "0", len(y[y['Target']==0])
    print "1", len(y[y['Target']==1])

    # X = X.iloc[:10]
    # y = y.iloc[:10]
    # info = info.iloc[:10]

    data_io.save_train_data(X, "./Competition/CEfinal_train_pairs-sym.csv")
    data_io.save(y, "./Competition/CEfinal_train_target-sym.csv")
    data_io.save(info, "./Competition/CEfinal_train_publicinfo-sym.csv")
    print "finished"
Пример #19
0
def main():
    """Plot training pairs whose target matches a chosen value.

    CLI: -n <rows> limits how many training rows to read (clamped to
    [1, 4050]); -t <target> selects which Target value to plot. Each
    matching row's A and B series are drawn into a figure saved as
    plots/<target>_<row>.png.
    """
    numRows = 10
    targetVal = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], "n:t:h")
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(2)

    clamp = lambda n, minn, maxn: max(min(maxn, n), minn)
    for o, a in opts:
        if o == "-n":
            numRows = clamp(int(a), 1, 4050)
        elif o == "-t":
            targetVal = int(a)
        else:
            print "try help: python train.py -h"
            sys.exit(2)

    print "Reading " + str(numRows) + " rows in the training data"
    train = data_io.read_train_pairs(numRows)
    target = data_io.read_train_target(numRows)

    # Normalize each series by its range (max - min).
    train.A = train.A.div(train.A.apply(max) - train.A.apply(min))
    train.B = train.B.div(train.B.apply(max) - train.B.apply(min))

    # NOTE(review): convert_objects is deprecated in modern pandas;
    # pd.to_numeric is the replacement.
    train = train.convert_objects(convert_numeric=True)
    # train = train.to_numeric()

    # NOTE(review): starts at 1, so row 0 is never plotted — confirm
    # whether that is intentional.
    for i in range(1, numRows):
        if target.Target[i] == targetVal:
            A = train.iloc[i, :].A
            B = train.iloc[i, :].B

            plt.figure(i)
            plt.plot(range(len(A)), A)
            plt.plot(range(len(B)), B)
            plt.savefig('plots/' + str(targetVal) + '_' + str(i) + '.png')
Пример #20
0
def main():
    """Join the old hand-crafted features onto the current train and
    validation feature sets and save the combined matrices."""
    y = data_io.read_train_target()
    X = data_io.load_train_features()
    # NOTE(review): `X is None` would be the idiomatic form of this check.
    if (type(X) == type(None)):
        print("No feature file found!")
        exit(1)

    X_old = data_io.load_features(
        "./Models/old_csv/features_train_en_python.csv")
    print X.shape
    X = X_old.join(X)
    print X.shape
    #print X
    data_io.save_train_features(X, y)

    X = data_io.load_valid_features()
    X_old = data_io.load_features(
        "./Models/old_csv/features_valid_en_python.csv")
    print X.shape
    X = X_old.join(X)
    print X.shape
    data_io.save_valid_features(X)
 def run(self):
     """Train this direction's classifier and save it.

     Precomputed feature columns (f.preprocessedFeatures) are copied
     from the intermediate file into the training frame, and their
     feature specs are rewritten to an identity transform so the
     pipeline reuses them instead of recomputing. Targets are remapped
     to a one-vs-rest encoding: x*(x+1)/2 keeps +1 for the forward
     direction, -x*(x-1)/2 keeps -1 (as +1) for the reverse one.
     """
     features = f.features
     train = self.getTrainingDataset()
     print "Reading preprocessed features"
     if f.preprocessedFeatures != []:
         intermediate = data_io.read_intermediate_train()
         for i in f.preprocessedFeatures:
             train[i] = intermediate[i]
         # Mutate the spec in place: source column = feature name,
         # transform = identity (value is already computed).
         for i in features:
             if i[0] in f.preprocessedFeatures:
                 i[1] = i[0]
                 i[2] = f.SimpleTransform(transformer=f.ff.identity)
     print "Reading targets"
     target = data_io.read_train_target()
     print "Extracting features and training model"
     classifier = self.getPipeline(features)
     if self.directionForward:
         finalTarget = [x * (x + 1) / 2 for x in target.Target]
     else:
         finalTarget = [-x * (x - 1) / 2 for x in target.Target]
     classifier.fit(train, finalTarget)
     print classifier.steps[-1][1].feature_importances_
     print "Saving the classifier"
     data_io.save_model(classifier)
Пример #22
0
 def run(self):
     """Train this direction's classifier and save it.

     Precomputed feature columns (f.preprocessedFeatures) are copied
     from the intermediate file into the training frame, and their
     feature specs are rewritten to an identity transform so the
     pipeline reuses them instead of recomputing. Targets are remapped
     to a one-vs-rest encoding: x*(x+1)/2 keeps +1 for the forward
     direction, -x*(x-1)/2 keeps -1 (as +1) for the reverse one.
     """
     features = f.features
     train = self.getTrainingDataset()
     print "Reading preprocessed features"
     if f.preprocessedFeatures != []:
         intermediate = data_io.read_intermediate_train()
         for i in f.preprocessedFeatures:
             train[i] = intermediate[i]
         # Mutate the spec in place: source column = feature name,
         # transform = identity (value is already computed).
         for i in features:
             if i[0] in f.preprocessedFeatures:
                 i[1] = i[0]
                 i[2] = f.SimpleTransform(transformer=f.ff.identity)
     print "Reading targets"
     target = data_io.read_train_target()
     print "Extracting features and training model"
     classifier = self.getPipeline(features)
     if self.directionForward:
         finalTarget = [x * (x + 1) / 2 for x in target.Target]
     else:
         finalTarget = [-x * (x - 1) / 2 for x in target.Target]
     classifier.fit(train, finalTarget)
     print classifier.steps[-1][1].feature_importances_
     print "Saving the classifier"
     data_io.save_model(classifier)
Пример #23
0
def main():
    """Train on the main + SUP training sets with a held-out split.

    Reads and types the main training data, appends the three SUP
    sets, extracts features, splits train/test, saves features and the
    fitted classifier, prints the held-out score, and appends feature
    importances to feature_importance.csv.
    """
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()

    # Append the three supplementary training sets.
    print "Reading SUP data..."
    for i in range(1,4):
      print "SUP", str(i)
      sup = data_io.read_sup_pairs(i)
      sup_info = data_io.read_sup_info(i)
      sup = combine_types(sup, sup_info)
      sup = get_types(sup)
      sup_target = data_io.read_sup_target(i)
      train = train.append(sup)
      target = target.append(sup_target)

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    (feature_trans, classifier) = get_pipeline()
    orig_train = feature_trans.fit_transform(train)
    # Extracted features may contain NaNs; zero them before fitting.
    orig_train = numpy.nan_to_num(orig_train)

    print("Train-test split")
    trainX, testX, trainY, testY = train_test_split(orig_train, target.Target, random_state = 1)
    print "TrainX size = ", str(trainX.shape)
    print "TestX size = ", str(testX.shape)

    print("Saving features")
    data_io.save_features(orig_train)

    classifier.fit(trainX, trainY)
    print("Saving the classifier")
    data_io.save_model(classifier)

    testX = numpy.nan_to_num(testX)
    print "Score on held-out test data ->", classifier.score(testX, testY)

    #features = [x[0] for x in classifier.steps[0][1].features ]

    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])


    # Append one importance value per line.
    # NOTE(review): `feature_importrance` is a typo and the handle is
    # never closed.
    feature_importrance = classifier.feature_importances_
    logger = open("feature_importance.csv","a")
    for fi in feature_importrance:
      logger.write(str(fi))
      logger.write("\n")

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
Пример #24
0
def get_pipeline():
    """Build the feature-extraction + random-forest regression pipeline."""
    # NOTE: compute_importances was removed in later scikit-learn
    # releases; this matches the version the project pins.
    forest = RandomForestRegressor(n_estimators=50,
                                   verbose=2,
                                   n_jobs=2,
                                   min_samples_split=10,
                                   random_state=1,
                                   compute_importances=True)
    return Pipeline([("extract_features", feature_extractor()),
                     ("classify", forest)])

if __name__=="__main__":
    # Script entry: read the data, attach public info (plus a running
    # row index) to the pairs, and build the pipeline.
    print("Reading in the training data")
    train_raw = data_io.read_train_pairs()
    target = data_io.read_train_target()
    info = data_io.read_train_info()
    # 4050 rows in the training set; iindex gives each row its ordinal.
    info['iindex'] = range(4050)

    train = train_raw.join(info)

    classifier = get_pipeline()

### FOLDS CODE
#    folds = cval.KFold(len(train), n_folds=2, indices=False)
#   
#    results = []
#    for i, fold in enumerate(folds):
#        print("Extracting features and training model for fold " + str(i))
#        traincv, testcv = fold
#        classifier.fit(train[traincv], target[traincv])
Пример #25
0
def main():
    """Train (or cross-validate) one or more classifiers on the pairs.

    CLI: -n <rows> limits training rows (clamped to [1, 4050]);
    -c <folds> switches to cross-validation with that many folds;
    -m <key|all> selects which classifier(s) from cf to run;
    -h prints usage.
    """
    global cf

    start = time.clock()

    numRows = None
    cv = False
    nfold = 10
    clf_keys = ["rfg"]

    try:
        opts, args = getopt.getopt(sys.argv[1:], "n:c:m:h")
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(2)

    clamp = lambda n, minn, maxn: max(min(maxn, n), minn)
    for o, a in opts:
        if o == "-n":
            numRows = clamp(int(a), 1, 4050)
        elif o == "-c":
            cv = True
            nfold = int(a)
        elif o == "-m":
            # "all" expands to every registered classifier key.
            if a == "all":
                clf_keys = []
                for clf_key in cf.get_all_keys():
                    clf_keys.append(clf_key)
            elif cf.is_valid_key(a):
                clf_keys = [a]
            else:
                print "ERROR: wrong classifier name: " + a
        elif o == "-h":
            print 'options:'
            print "\t -n [number of rows]"
            print "\t -c [number of folds]"
            print "\t -m [classifier key | all]"
            sys.exit(0)
        else:
            print "try help: python train.py -h"
            sys.exit(2)

    print("Reading in the training data")
    train = data_io.read_train_pairs(numRows)
    trainInfo = data_io.read_train_info(numRows)
    # Variable types from the public info become extra columns.
    train['A type'] = trainInfo['A type']
    train['B type'] = trainInfo['B type']
    target = data_io.read_train_target(numRows)

    if cv:
        data = {}
        data['train'] = train
        data['target'] = target

        for clf_key in clf_keys:
            print "Initiating " + str(nfold) + " fold cross validation with classifier " + cf.get_classifier_name(clf_key)
            crossvalidate(data, nfold, clf_key)
    else:
        # Plain fit-and-save per selected classifier, with timing.
        for clf_key in clf_keys:
            start_train = time.clock()
            print("Extracting features and training model")
            classifier = get_pipeline(clf_key)
            classifier.fit(train, target.Target)

            print("Saving the classifier")
            data_io.save_model(classifier, clf_key)
            end_train = time.clock()
            print 'time taken:', end_train - start_train, 'seconds'

    end = time.clock()

    print 'Execution time:', round(end - start, 2)
Пример #26
0

def reverse_auc(labels, predictions):
    """AUC for the reverse direction: -1 labels scored against the
    negated predictions."""
    flipped_truth = [int(label == -1) for label in labels]
    flipped_preds = [-p for p in predictions]
    return metrics.auc(flipped_truth, flipped_preds)


def bidirectional_auc(labels, predictions):
    """Mean of the forward and reverse AUC scores."""
    forward = forward_auc(labels, predictions)
    reverse = reverse_auc(labels, predictions)
    return (forward + reverse) / 2.0


if __name__ == "__main__":
    # Score the saved train-set predictions against the true targets
    # with forward, reverse, and bidirectional AUC.
    import data_io

    solution = data_io.read_train_target()
    submission = data_io.read_train_predictions()

    score_forward = forward_auc(solution.Target, submission.Target)
    print("Forward Auc: %0.6f" % score_forward)

    score_reverse = reverse_auc(solution.Target, submission.Target)
    print("Reverse Auc: %0.6f" % score_reverse)

    score = bidirectional_auc(solution.Target, submission.Target)
    print("Bidirectional AUC: %0.6f" % score)
Пример #27
0
def main():
    """Train a category classifier on main + SUP + old training data.

    Reads and types the main set, appends the three SUP sets and the
    old competition set, extracts features, fits classify_catagory on
    everything, and saves both features and model.
    """
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()

    # Append the three supplementary training sets.
    print "Reading SUP data..."
    for i in range(1,4):
      print "SUP", str(i)
      sup = data_io.read_sup_pairs(i)
      sup_info = data_io.read_sup_info(i)
      sup = combine_types(sup, sup_info)
      sup = get_types(sup)
      sup_target = data_io.read_sup_target(i)
      train_info = train_info.append(sup_info)
      train = train.append(sup)
      target = target.append(sup_target)

    # Old train
    print "Reading old train data..."
    old_train = data_io.read_old_train_pairs()
    old_train_info = data_io.read_old_train_info()
    old_train = combine_types(old_train, old_train_info)
    old_train = get_types(old_train)
    old_target = data_io.read_old_train_target()

    train = train.append(old_train)
    target = target.append(old_target)
    # End old train

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    feature_trans = fe.feature_extractor()
    orig_train = feature_trans.fit_transform(train)
    # Extracted features may contain NaNs; zero them before fitting.
    orig_train = numpy.nan_to_num(orig_train)

    classifier = classify_catagory(orig_train, target.Target)
    #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info)

    print("Saving features")
    data_io.save_features(orig_train)

    print("Saving the classifier")
    #data_io.save_model( (both_classifier, A_classifier, B_classifier, none_classifier) )
    data_io.save_model(classifier)

    #features = [x[0] for x in classifier.steps[0][1].features ]

    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])


    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
Пример #28
0
def main():
    """Train a category classifier on main + SUP + old training data.

    Reads and types the main set, appends the three SUP sets and the
    old competition set, extracts features, fits classify_catagory on
    everything, and saves both features and model.
    """
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()

    # Append the three supplementary training sets.
    print "Reading SUP data..."
    for i in range(1, 4):
        print "SUP", str(i)
        sup = data_io.read_sup_pairs(i)
        sup_info = data_io.read_sup_info(i)
        sup = combine_types(sup, sup_info)
        sup = get_types(sup)
        sup_target = data_io.read_sup_target(i)
        train_info = train_info.append(sup_info)
        train = train.append(sup)
        target = target.append(sup_target)

    # Old train
    print "Reading old train data..."
    old_train = data_io.read_old_train_pairs()
    old_train_info = data_io.read_old_train_info()
    old_train = combine_types(old_train, old_train_info)
    old_train = get_types(old_train)
    old_target = data_io.read_old_train_target()

    train = train.append(old_train)
    target = target.append(old_target)
    # End old train

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    feature_trans = fe.feature_extractor()
    orig_train = feature_trans.fit_transform(train)
    # Extracted features may contain NaNs; zero them before fitting.
    orig_train = numpy.nan_to_num(orig_train)

    classifier = classify_catagory(orig_train, target.Target)
    #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info)

    print("Saving features")
    data_io.save_features(orig_train)

    print("Saving the classifier")
    #data_io.save_model( (both_classifier, A_classifier, B_classifier, none_classifier) )
    data_io.save_model(classifier)

    #features = [x[0] for x in classifier.steps[0][1].features ]

    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff / 60, 1)
Пример #29
0
    features = feature_extractor()
    steps = [("extract_features", features),
             ("classify",
              RandomForestRegressor(n_estimators=50,
                                    verbose=2,
                                    n_jobs=2,
                                    min_samples_split=10,
                                    random_state=1,
                                    compute_importances=True))]
    return Pipeline(steps)


if __name__ == "__main__":
    # Script entry: read the data, attach public info (plus a running
    # row index) to the pairs, and build the pipeline.
    print("Reading in the training data")
    train_raw = data_io.read_train_pairs()
    target = data_io.read_train_target()
    info = data_io.read_train_info()
    # 4050 rows in the training set; iindex gives each row its ordinal.
    info['iindex'] = range(4050)

    train = train_raw.join(info)

    classifier = get_pipeline()

    ### FOLDS CODE
    #    folds = cval.KFold(len(train), n_folds=2, indices=False)
    #
    #    results = []
    #    for i, fold in enumerate(folds):
    #        print("Extracting features and training model for fold " + str(i))
    #        traincv, testcv = fold
    #        classifier.fit(train[traincv], target[traincv])