示例#1
0
def merge_data():
    # print train_data_dir + "/train_pair*"
    train_pairs = glob.glob(train_data_dir + "/*train_pairs*")
    print list(zip(train_pairs, list(xrange(0, 4))))

    for i, train_pair in enumerate(train_pairs):
        dir_name = ntpath.dirname(train_pair)
        pref = ntpath.basename(train_pair).split("train_pairs")[0]
        suffix = ntpath.basename(train_pair).split("train_pairs")[-1]
        # print pref, suffix
        info = dir_name + "/" + pref + "train_publicinfo" + suffix
        target = dir_name + "/" + pref + "train_target" + suffix
        print info, pref, suffix
        X = data_io.read_train_pairs(train_pair)
        y = data_io.read_train_target(target)
        inf_data = data_io.read_train_info(info)
        X, y, inf_data = process_indices(X, y, inf_data, i)
        if "X_merged" not in locals():
            X_merged = X
            y_merged = y
            info_merged = inf_data
        else:
            print "Shape before appending", X_merged.shape, y_merged.shape, X.shape, y.shape
            X_merged = X_merged.append(X)
            y_merged = y_merged.append(y)
            info_merged = info_merged.append(inf_data)
            print "Shape thus far", X_merged.shape, y_merged.shape

    return X_merged, y_merged, info_merged
示例#2
0
def merge_data():
    """Concatenate every train_pairs file in train_data_dir (plus the
    matching train_target and train_publicinfo files) into single merged
    frames.

    Returns (X_merged, y_merged, info_merged).
    """
    #print train_data_dir + "/train_pair*"
    train_pairs = glob.glob(train_data_dir + "/*train_pairs*")
    print list(zip(train_pairs, list(xrange(0, 4))))

    for i, train_pair in enumerate(train_pairs):
        dir_name = ntpath.dirname(train_pair)
        # Filenames look like <pref>train_pairs<suffix>; the sibling info
        # and target files share the same prefix and suffix.
        pref = ntpath.basename(train_pair).split("train_pairs")[0]
        suffix = ntpath.basename(train_pair).split("train_pairs")[-1]
        #print pref, suffix
        info = dir_name + "/" + pref + "train_publicinfo" + suffix
        target = dir_name + "/" + pref + "train_target" + suffix
        print info, pref, suffix
        X = data_io.read_train_pairs(train_pair)
        y = data_io.read_train_target(target)
        inf_data = data_io.read_train_info(info)
        X, y, inf_data = process_indices(X, y, inf_data, i)
        # First iteration seeds the accumulators; the locals() probe means
        # the return below raises NameError if the glob matched nothing.
        if 'X_merged' not in locals():
            X_merged = X
            y_merged = y
            info_merged = inf_data
        else:
            print "Shape before appending", X_merged.shape, y_merged.shape, X.shape, y.shape
            X_merged = X_merged.append(X)
            y_merged = y_merged.append(y)
            info_merged = info_merged.append(inf_data)
            print "Shape thus far", X_merged.shape, y_merged.shape

    return X_merged, y_merged, info_merged
示例#3
0
File: fe.py  Project: diogo149/causality
def extract_train_features():
    """Extract features for every training pair and persist them.

    Reads the training pairs, targets and type info via data_io, runs the
    feature extractor, and saves the resulting feature matrix (indexed and
    named like the input pairs) together with the targets.

    Returns the feature DataFrame.
    """
    start = time.time()
    features = feature_extractor()
    # Each entry of features.features is (name, ...) — collect the names
    # to use as column headers for the output frame.
    header = [h[0] for h in features.features]

    print("Reading in the training data")
    X = data_io.read_train_pairs()
    y = data_io.read_train_target()

    print("Extracting features: " + str(X.shape))
    extracted = features.fit_transform(X, y, type_map=data_io.read_train_info())

    elapsed = float(time.time() - start)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")

    print("Saving features")
    X = pd.DataFrame(extracted, index=X.index)
    X.columns = header
    data_io.save_train_features(X, y.Target)

    return X
示例#4
0
def extract_train_features():
    """Read the training pairs, run the feature extractor over them, and
    save the resulting feature matrix alongside the training targets.

    Returns the feature DataFrame.
    """
    t0 = time.time()
    extractor = feature_extractor()
    # Column headers come from the extractor's (name, ...) feature list.
    header = [feat[0] for feat in extractor.features]

    print("Reading in the training data")

    pairs = data_io.read_train_pairs()
    target = data_io.read_train_target()

    print("Extracting features: " + str(pairs.shape))

    matrix = extractor.fit_transform(pairs,
                                     target,
                                     type_map=data_io.read_train_info())

    elapsed = float(time.time() - t0)
    print("Features extracted in " + str(elapsed / 60.0) + " Minutes")

    print("Saving features")
    frame = pd.DataFrame(matrix, index=pairs.index)
    frame.columns = header
    data_io.save_train_features(frame, target.Target)

    return frame
示例#5
0
def selectAllMixed(X, y):
    """Keep only the pairs where at least one side (A or B) is not Numerical.

    Filters X and y down to the rows whose train-info marks either variable
    as Categorical/Binary.  Uses .loc instead of the deprecated (and removed)
    .ix accessor; equivalent here because the filtered info.index labels come
    from the same training index as X -- TODO confirm against data_io.
    """
    info = data_io.read_train_info()
    # Rows where A or B is non-Numerical, i.e. a "mixed" pair.
    info = info[(info['A type'] != "Numerical") | (info['B type'] != "Numerical")]

    X = X.loc[info.index]
    y = y.loc[X.index]
    return X, y
示例#6
0
 def getTrainingDataset(self):
     """Return the training pairs with the A/B variable-type columns joined on."""
     print "Reading in the training data"
     train = data_io.read_train_pairs()
     print "Reading the information about the training data"
     train2 = data_io.read_train_info()
     # Attach the variable-type metadata so downstream code can branch on it.
     train["A type"] = train2["A type"]
     train["B type"] = train2["B type"]
     return train
 def getTrainingDataset(self):
     """Return the training pairs with the A/B variable-type columns joined on.

     NOTE(review): duplicate of the previous getTrainingDataset example.
     """
     print "Reading in the training data"
     train = data_io.read_train_pairs()
     print "Reading the information about the training data"
     train2 = data_io.read_train_info()
     # Attach the variable-type metadata so downstream code can branch on it.
     train["A type"] = train2["A type"]
     train["B type"] = train2["B type"]
     return train
 def getDataset(self):
     """Return train or validation pairs (per self.getTrain) with the A/B
     variable-type columns joined on."""
     if self.getTrain:
         readData = data_io.read_train_pairs()
         readData2 = data_io.read_train_info()
     else:
         readData = data_io.read_valid_pairs()
         readData2 = data_io.read_valid_info()
     # Copy the A/B variable types over from the matching info frame.
     readData["A type"] = readData2["A type"]
     readData["B type"] = readData2["B type"]
     return readData
 def getDataset(self):
     """Fetch the train or validation pairs (per self.getTrain), tagging
     each row with the A/B variable types from the matching info table."""
     if self.getTrain:
         pairs = data_io.read_train_pairs()
         meta = data_io.read_train_info()
     else:
         pairs = data_io.read_valid_pairs()
         meta = data_io.read_valid_info()
     for col in ("A type", "B type"):
         pairs[col] = meta[col]
     return pairs
示例#10
0
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()
    print train

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)
    
    features = [x[0] for x in classifier.steps[0][1].features ]

    csv_fea = csv.writer(open('features.csv','wb'))
    imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    for fea in imp:
        print fea[0], fea[1]
        csv_fea.writerow([fea[0],fea[1]])

    
    oob_score =  classifier.steps[1][1].oob_score_
    print "oob score:", oob_score
    logger = open("run_log.txt","a")
    if len(oob_score) == 1: logger.write("\n" +str( oob_score) + "\n")
    else:logger.write("\n" + str(oob_score[0]) + "\n")

    print("Saving the classifier")
    data_io.save_model(classifier)
   
    print("Predicting the train set")
    train_predict = classifier.predict(train)
    trian_predict = train_predict.flatten()
    data_io.write_submission(train_predict, 'train_set', run = 'train')

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
示例#11
0
def main():
    """Augment the training set with symmetric pairs and save the result
    as new -sym CSV files under ./Competition."""

    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    info = data_io.read_train_info()

    X, y, info = exploit_symmetries(X, y, info)
    # Report the class balance after symmetrisation.
    print X.shape, y.shape
    print "-1", len(y[y['Target'] == -1])
    print "0", len(y[y['Target'] == 0])
    print "1", len(y[y['Target'] == 1])

    # X = X.iloc[:10]
    # y = y.iloc[:10]
    # info = info.iloc[:10]

    data_io.save_train_data(X, "./Competition/CEfinal_train_pairs-sym.csv")
    data_io.save(y, "./Competition/CEfinal_train_target-sym.csv")
    data_io.save(info, "./Competition/CEfinal_train_publicinfo-sym.csv")
    print "finished"
示例#12
0
def main():

    X = data_io.read_train_pairs()
    y = data_io.read_train_target()
    info = data_io.read_train_info()


    X,y, info = exploit_symmetries(X,y, info)
    print X.shape, y.shape
    print "-1", len(y[y['Target']==-1])
    print "0", len(y[y['Target']==0])
    print "1", len(y[y['Target']==1])

    # X = X.iloc[:10]
    # y = y.iloc[:10]
    # info = info.iloc[:10]

    data_io.save_train_data(X, "./Competition/CEfinal_train_pairs-sym.csv")
    data_io.save(y, "./Competition/CEfinal_train_target-sym.csv")
    data_io.save(info, "./Competition/CEfinal_train_publicinfo-sym.csv")
    print "finished"
示例#13
0
    probb = np.bincount(b.astype('int32')).astype('float64') / len(b)

    proba_nz = proba[np.nonzero(proba)]
    probb_nz = probb[np.nonzero(probb)]

    jointp = np.outer(proba_nz, probb_nz)
    hpos = np.sum(np.log(jointp) * jointp)
    return -hpos


if __name__ == '__main__':

    print 'Reading in {} data...'.format(DATA)

    if DATA == 'train':
        info = data_io.read_train_info()
        train = data_io.read_train_pairs()
    elif DATA == 'valid':
        info = data_io.read_valid_info()
        train = data_io.read_valid_pairs()
    else:
        raise ValueError

    print 'Saving coded info matrix...'
    codes = np.zeros(info.values.shape)
    lookup = {'Numerical': 1, 'Categorical': 2, 'Binary': 3}
    for i, t in enumerate(info.values):
        a, b = t
        codes[i, :] = [lookup[a], lookup[b]]

    savemat('matlab/{}info.mat'.format(DATA), {'codes': codes},
示例#14
0
def get_pipeline():
    """Build the two-step pipeline: feature extraction followed by a
    random-forest regressor (fixed seed, 50 trees)."""
    extractor = feature_extractor()
    forest = RandomForestRegressor(n_estimators=50,
                                   verbose=2,
                                   n_jobs=2,
                                   min_samples_split=10,
                                   random_state=1,
                                   compute_importances=True)
    return Pipeline([("extract_features", extractor),
                     ("classify", forest)])

if __name__=="__main__":
    # Script entry point: read the training data, join the type info onto
    # the pairs, and build the training pipeline.
    print("Reading in the training data")
    train_raw = data_io.read_train_pairs()
    target = data_io.read_train_target()
    info = data_io.read_train_info()
    # Sequential row id (the training set has 4050 pairs) carried along
    # with the info columns -- presumably so features can recover the
    # original row order; TODO confirm downstream use.
    info['iindex'] = range(4050)

    train = train_raw.join(info)

    classifier = get_pipeline()

### FOLDS CODE
#    folds = cval.KFold(len(train), n_folds=2, indices=False)
#   
#    results = []
#    for i, fold in enumerate(folds):
#        print("Extracting features and training model for fold " + str(i))
#        traincv, testcv = fold
#        classifier.fit(train[traincv], target[traincv])
#        results.append(classifier.score(train[testcv], target[testcv]))
示例#15
0
def main():
    """Train on the base plus SUP datasets, hold out a test split, save the
    extracted features and the fitted classifier, and append the feature
    importances to feature_importance.csv."""
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()

    # Append the three supplementary (SUP) datasets to the training set.
    print "Reading SUP data..."
    for i in range(1,4):
      print "SUP", str(i)
      sup = data_io.read_sup_pairs(i)
      sup_info = data_io.read_sup_info(i)
      sup = combine_types(sup, sup_info)
      sup = get_types(sup)
      sup_target = data_io.read_sup_target(i)
      train = train.append(sup)
      target = target.append(sup_target)

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    (feature_trans, classifier) = get_pipeline()
    orig_train = feature_trans.fit_transform(train)
    # Feature extraction can yield NaNs that the classifier cannot handle.
    orig_train = numpy.nan_to_num(orig_train)

    print("Train-test split")
    trainX, testX, trainY, testY = train_test_split(orig_train, target.Target, random_state = 1)
    print "TrainX size = ", str(trainX.shape)
    print "TestX size = ", str(testX.shape)

    print("Saving features")
    data_io.save_features(orig_train)

    classifier.fit(trainX, trainY)
    print("Saving the classifier")
    data_io.save_model(classifier)

    testX = numpy.nan_to_num(testX)
    print "Score on held-out test data ->", classifier.score(testX, testY)

    #features = [x[0] for x in classifier.steps[0][1].features ]

    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])

    # Append each feature importance on its own line for later inspection.
    feature_importrance = classifier.feature_importances_
    logger = open("feature_importance.csv","a")
    for fi in feature_importrance:
      logger.write(str(fi))
      logger.write("\n")

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
示例#16
0
def main():
    """Command-line entry point: parse options, read the training data,
    and either cross-validate or train-and-save the selected classifier(s).

    Options:
      -n  number of rows to read (clamped to [1, 4050])
      -c  run cross-validation with the given number of folds
      -m  classifier key registered in cf, or "all" for every classifier
      -h  print usage and exit
    """
    global cf

    start = time.clock()

    numRows = None
    cv = False
    nfold = 10
    clf_keys = ["rfg"]

    try:
        opts, args = getopt.getopt(sys.argv[1:], "n:c:m:h")
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(2)

    # Clamp n into the inclusive range [minn, maxn].
    clamp = lambda n, minn, maxn: max(min(maxn, n), minn)
    for o, a in opts:
        if o == "-n":
            numRows = clamp(int(a), 1, 4050)
        elif o == "-c":
            cv = True
            nfold = int(a)
        elif o == "-m":
            if a == "all":
                clf_keys = []
                for clf_key in cf.get_all_keys():
                    clf_keys.append(clf_key)
            elif cf.is_valid_key(a):
                clf_keys = [a]
            else:
                print "ERROR: wrong classifier name: " + a
        elif o == "-h":
            print 'options:'
            print "\t -n [number of rows]"
            print "\t -c [number of folds]"
            print "\t -m [classifier key | all]"
            sys.exit(0)
        else:
            print "try help: python train.py -h"
            sys.exit(2)

    print("Reading in the training data")
    train = data_io.read_train_pairs(numRows)
    trainInfo = data_io.read_train_info(numRows)
    # Join the A/B variable types onto the pairs.
    train['A type'] = trainInfo['A type']
    train['B type'] = trainInfo['B type']
    target = data_io.read_train_target(numRows)

    if cv:
        data = {}
        data['train'] = train
        data['target'] = target

        for clf_key in clf_keys:
            print "Initiating " + str(nfold) + " fold cross validation with classifier " + cf.get_classifier_name(clf_key)
            crossvalidate(data, nfold, clf_key)
    else:
        for clf_key in clf_keys:
            start_train = time.clock()
            print("Extracting features and training model")
            classifier = get_pipeline(clf_key)
            classifier.fit(train, target.Target)

            print("Saving the classifier")
            data_io.save_model(classifier, clf_key)
            end_train = time.clock()
            print 'time taken:', end_train - start_train, 'seconds'

    end = time.clock()

    print 'Execution time:', round(end - start, 2)
def main():
    """Train on the base + SUP + old-challenge datasets, extract features,
    fit the category classifier, and save both features and model."""
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()

    # Append the three supplementary (SUP) datasets.
    print "Reading SUP data..."
    for i in range(1,4):
      print "SUP", str(i)
      sup = data_io.read_sup_pairs(i)
      sup_info = data_io.read_sup_info(i)
      sup = combine_types(sup, sup_info)
      sup = get_types(sup)
      sup_target = data_io.read_sup_target(i)
      train_info = train_info.append(sup_info)
      train = train.append(sup)
      target = target.append(sup_target)

    # Old train
    print "Reading old train data..."
    old_train = data_io.read_old_train_pairs()
    old_train_info = data_io.read_old_train_info()
    old_train = combine_types(old_train, old_train_info)
    old_train = get_types(old_train)
    old_target = data_io.read_old_train_target()

    train = train.append(old_train)
    target = target.append(old_target)
    # End old train

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    feature_trans = fe.feature_extractor()
    orig_train = feature_trans.fit_transform(train)
    # Feature extraction can yield NaNs that the classifier cannot handle.
    orig_train = numpy.nan_to_num(orig_train)

    classifier = classify_catagory(orig_train, target.Target)
    #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info)

    print("Saving features")
    data_io.save_features(orig_train)

    print("Saving the classifier")
    #data_io.save_model( (both_classifier, A_classifier, B_classifier, none_classifier) )
    data_io.save_model(classifier)

    #features = [x[0] for x in classifier.steps[0][1].features ]

    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])


    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
def main():
    """Train on the base + SUP + old-challenge datasets, extract features,
    fit the category classifier, and save both features and model.

    NOTE(review): formatted duplicate of the previous main() example.
    """
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()

    # Append the three supplementary (SUP) datasets.
    print "Reading SUP data..."
    for i in range(1, 4):
        print "SUP", str(i)
        sup = data_io.read_sup_pairs(i)
        sup_info = data_io.read_sup_info(i)
        sup = combine_types(sup, sup_info)
        sup = get_types(sup)
        sup_target = data_io.read_sup_target(i)
        train_info = train_info.append(sup_info)
        train = train.append(sup)
        target = target.append(sup_target)

    # Old train
    print "Reading old train data..."
    old_train = data_io.read_old_train_pairs()
    old_train_info = data_io.read_old_train_info()
    old_train = combine_types(old_train, old_train_info)
    old_train = get_types(old_train)
    old_target = data_io.read_old_train_target()

    train = train.append(old_train)
    target = target.append(old_target)
    # End old train

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    feature_trans = fe.feature_extractor()
    orig_train = feature_trans.fit_transform(train)
    # Feature extraction can yield NaNs that the classifier cannot handle.
    orig_train = numpy.nan_to_num(orig_train)

    classifier = classify_catagory(orig_train, target.Target)
    #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info)

    print("Saving features")
    data_io.save_features(orig_train)

    print("Saving the classifier")
    #data_io.save_model( (both_classifier, A_classifier, B_classifier, none_classifier) )
    data_io.save_model(classifier)

    #features = [x[0] for x in classifier.steps[0][1].features ]

    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff / 60, 1)