Example no. 1
 def _variance_threshhold(self, variance):
     '''Remove columns that do not meet the variance threshold'''
     logging.info('Removing data that has variance less than %f.' %(variance))
     vt = VarianceThreshold(variance)
     vt.fit(self.X) # XXX: Because idx should have high variance we pass all of X
     self.X = vt.transform(self.X)
     self.X_submit = vt.transform(self.X_submit)
     
     # Repeat this process for X_submit # XXX: This might not be kosher outside of competition
     vt.fit(self.X_submit)
     self.X = vt.transform(self.X)
     self.X_submit = vt.transform(self.X_submit)
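For comparison, a minimal sketch of the more common pattern (fit the selector on the training matrix only and reuse the same mask for the held-out/submission matrix); the arrays below are stand-ins, not the attributes used above:

import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = np.array([[0.0, 1.0, 5.0],
              [0.0, 2.0, 5.1],
              [0.0, 3.0, 4.9]])          # training features
X_submit = np.array([[7.0, 4.0, 5.0]])   # held-out / submission features

vt = VarianceThreshold(threshold=0.01)
X_reduced = vt.fit_transform(X)            # fit the variance mask on the training data only
X_submit_reduced = vt.transform(X_submit)  # reuse the same mask; no refit on submission data
print(X_reduced.shape, X_submit_reduced.shape)   # (3, 1) (1, 1)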
Example no. 2
def main():
    args = getOptions()
    print args

    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    train_x_clean, contentdict = cityclean(train_x_new)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    test_x_clean, contentdict = cityclean(test_x_new, contentdict)
    del contentdict
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_clean)
    test_x_uniq = sel.transform(test_x_clean)
    
    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)
    
    #feature selection and modeling
    print "feature selection and modeling"
    exclusivefs(train_x_nor, train_y, test_x_nor, test_y)
Example no. 3
def main():
    args = getOptions()
    print args

    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel="linear")
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x_nor, train_y)
    
    print("Optimal number of features : %d" % rfecv.n_features_)
Example no. 4
    def vectorize_EX(self, columns, variance_thresh=0, train_only=False):

        print('Start vectorizing')
        start_time = time.time()
        hasher = CountVectorizer(binary=True, tokenizer=LemmaTokenizer(), stop_words='english')

        train_dtm = hasher.fit_transform(
            self.ga_bm_train[columns].apply(lambda x: ','.join(x), axis=1))
        print(hasher.get_feature_names())
        print('dtm train shape: ', train_dtm.shape)

        selector = VarianceThreshold(variance_thresh)
        train_dtm = selector.fit_transform(train_dtm)
        print('dtm train shape after variance thresh: ', train_dtm.shape)

        if not train_only:
            test_dtm = hasher.transform(
                self.ga_bm_test[columns].apply(lambda x: ','.join(x), axis=1))

            print('dtm test shape: ', test_dtm.shape)
            test_dtm = selector.transform(test_dtm)
            print('dtm test shape after variance thresh: ', test_dtm.shape)

        print("Time: ", round(((time.time() - start_time)/60), 2))
        print('Complete vectorizing')
        if train_only:
            return train_dtm
        else:
            return (train_dtm, test_dtm)
Example no. 5
def main():
    args = getOptions()
    print args
    fn = "destreeSub.csv"
    print fn
    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
#     indices = [i for i in range(len(train_x[0]))]
#     frqIndex = trimfrq(train_x)
#     for i in frqIndex:
#         indices.remove(i)
#     train_x_uniq = indexTodata(train_x, indices)
#     test_x_uniq = indexTodata(test_x, indices)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    if args.fts == 'cor':
        train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
    elif args.fts == 'extraTrees':
        train_x_sel, test_x_sel = ExtraTreesSelect(train_x_nor, train_y, test_x_nor)
    else:
        train_x_sel = copy.deepcopy(train_x_nor)
        test_x_sel = copy.deepcopy(test_x_nor)
    del train_x_nor, test_x_nor, train_x_uniq, test_x_uniq
    print "modeling"
    clf = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0, class_weight='auto')
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_sel)
#     MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) 
#     print "MCC, Acc_p , Acc_n, Acc_all(test): "
#     print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))   
    
    fout=open(fn,'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index][1])))
    fout.close()
Example no. 6
def main():
    args = getOptions()
    fn = ("submission_cor_%s_%s_%s.csv" % (str(args.lrate).replace('.','dian'),str(args.nest),str(args.maxdepth)))
    print fn
    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
#     indices = [i for i in range(len(train_x[0]))]
#     frqIndex = trimfrq(train_x)
#     for i in frqIndex:
#         indices.remove(i)
#     train_x_uniq = indexTodata(train_x, indices)
#     test_x_uniq = indexTodata(test_x, indices)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
#     ftsel = correlationSel()
#     ftsel.dosel(train_x_nor,train_y)
#     train_x_sel = ftsel.transform(train_x_nor)
#     test_x_sel = ftsel.transform(test_x_nor)
    print "modeling"
    clf = GradientBoostingClassifier(loss='deviance', 
                                     learning_rate=args.lrate,
                                     n_estimators=args.nest,
                                     max_depth=args.maxdepth,
                                     verbose=1)
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_sel)
#     MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) 
#     print "MCC, Acc_p , Acc_n, Acc_all(test): "
#     print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))   
    
    fout=open(fn,'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index][1])))
    fout.close()
Example no. 7
def featureSelection(X_train,X_test,X_val,y_train,log,tech,C):
    if (tech == 'VarTh'):
        sel = VarianceThreshold(threshold=0.01)
        X_train_new = sel.fit_transform(X_train.todense())
        X_test_new = sel.transform(X_test.todense())
        X_val_new = sel.transform(X_val.todense())
        if (log):
            X_train_new = np.log(X_train_new+1)
            X_test_new = np.log(X_test_new+1)
            X_val_new = np.log(X_val_new+1)
    
    if (tech == 'LinearSVC'):
        mod = LinearSVC(C=C, penalty="l1", dual=False)
        X_train_new = mod.fit_transform(X_train.todense(), y_train)
        X_test_new = mod.transform(X_test.todense())
        X_val_new = mod.transform(X_val.todense())
        if (log):
            X_train_new = np.log(X_train_new+1)
            X_test_new = np.log(X_test_new+1)
            X_val_new = np.log(X_val_new+1)
    return X_train_new, X_test_new , X_val_new
Example no. 8
class VarianceThresholdStep(SklearnStep):
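    # Pipeline step wrapping sklearn's VarianceThreshold over svmlight files on disk:
    # fit_transform fits/applies the selector to the training file and saves the result;
    # transform applies the fitted selector to the test file or to an in-memory matrix.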
    def __init__(self, threshold):
        super(VarianceThresholdStep, self).__init__()
        self._threshold = threshold

    def fit_transform(self):
        self._model = VarianceThreshold(threshold=self._threshold)
        x, y = load_svmlight(self.input_path)
        x = self._model.fit_transform(x, y)
        save_svmlight(x, y, self._output_path)

    def transform(self, x=None):
        if x is None:
            x, y = load_svmlight(self._test_input_path)
            x = self._model.transform(x)
            save_svmlight(x, y, self._test_output_path)
        else:
            transformed_x = self._model.transform(x)
            return transformed_x

    def get_param(self):
        return {'threshold': self._threshold}
Example no. 9
def main():
    args = getOptions()
    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)

    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)
    
    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)
    
    #feature selection
    print "feature selection"
    ftsel = ExtraTreesClassifier()
    ftsel.fit(train_x_nor, train_y)
#     importances = ftsel.feature_importances_
#     indices_test = np.argsort(importances)[::-1]
#     indices_test = indices_test.tolist()
    train_x_trans = ftsel.transform(train_x_nor)
    test_x_trans = ftsel.transform(test_x_nor)
    
    #modeling
    print "modeling"
    train = xgb.DMatrix(train_x_trans,label=train_y)
    test = xgb.DMatrix(test_x_trans,label=test_y)
    gbm = xgb.train({'max_depth':3, 'n_estimators':1500, 'learning_rate':0.1 ,'objective':'binary:logistic','eval_metric':'auc'},train)
    train_pdt = gbm.predict(train)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    test_pdt = gbm.predict(test)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(test): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))   
    
    fout=open("submission_xgbtrain.csv",'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index])))
    fout.close()
Example no. 10
def chooseFeatures(train_x,train_y,test_x,kB):
    
    sel             = VarianceThreshold()
    trainingX       = sel.fit_transform(train_x)
    testingExamples = sel.transform(test_x)
    

    if kB > trainingX.shape[1]:
        kB  = trainingX.shape[1]
    
    kBest   = SelectKBest(chi2,k=kB)
    train_x = kBest.fit_transform(train_x,train_y)
    test_x  = kBest.transform(test_x)

    return train_x,test_x
Example no. 11
def runRF(directory, name):
  print(time.time(), time.clock())
  test = directory + "transformed.test.SquibDWTFFT.npy"
  train = directory + "transformed.train.SquibDWTFFT.npy"
  data = loadTransformed(train)
  X,Y = splitData(data)
  y = Y.flatten()
  ocp = np.sum(y == 1)
  oci = np.sum(y == 0)
  ratio = oci.astype(float) / ocp.astype(float) 
  nest = int(300 * ratio.round())
  print ratio.round()
  threshold = 0.8
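  # For boolean features, variance is p*(1-p); with threshold = 0.8 this drops features
  # that take the same value in at least 80% of the samples.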
  sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
  Xr =  sel.fit_transform(X)
  #clf = RandomForestClassifier(n_estimators=10,  criterion='entropy',max_depth=None,min_samples_split=1, random_state=0)
  print "Fit Random forest on " + name
  clf = RandomForestClassifier(n_estimators=nest , criterion = "entropy", max_features="auto", min_samples_split=1, bootstrap=False, n_jobs=8, random_state=0)
  clf.fit(Xr, y, sample_weight= np.array([ratio.round() if i == 0 else 1 for i in y]))
  print(time.time(), time.clock())
  print "Predict " + name
  dtest = loadTransformed(test)
  XT,YT = splitData(dtest)
  yt = YT.flatten()
  XTr =  sel.transform(XT)
  predictions = clf.predict_proba(XTr)
  filename = name + "_predictions.csv"
  print(time.time(), time.clock())
  print "Writing out results ... "
  finalFile = open(filename, "w")
  count = 1
  for pre in predictions:
    pr = '%.6f' %  pre[1]
    count4 = "%04d" % (count,)
    print name + " " + count4 + " " + pr
    finalFile.write(name + "_test_segment_" + count4 + ".mat," + str(pr) + "\n")
    count += 1
  finalFile.close()
  print(time.time(), time.clock())
Example no. 12
def get_rmvar(train_x, test_x, threshold=20):
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(train_x)
    train_var = selector.transform(train_x)
    test_var = selector.transform(test_x)
    return train_var, test_var
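A quick toy check of get_rmvar; the arrays below are made up for illustration, and the sklearn import used by the snippet above is assumed to be in scope:

import numpy as np

tr = np.array([[1.0, 100.0], [1.1, 0.0], [0.9, 50.0]])
te = np.array([[1.0, 10.0]])
tr_var, te_var = get_rmvar(tr, te, threshold=5)
print(tr_var.shape, te_var.shape)   # (3, 1) (1, 1): only the high-variance second column survives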
Example no. 13
def main():
    args = getOptions()
    print args
    if args.model == 'gBoosting':
        fn = ("submissionv4_%s_gBoosting_%s_%s_%s_%s_%s.csv" %
              (args.fts, args.loss, str(args.minsamplessplit), str(
                  args.lrate).replace('.', 'dian'), str(
                      args.nest), str(args.maxdepth)))
    elif args.model == 'randomForest':
        fn = ("submissionv4_%s_randomForest_%s.csv" % (args.fts, args.nest))
    print fn
    print "train file read"
    train_x, train_y = readfile_noid(args.train, 'train')
    train_x_new, id = extractID(train_x)
    train_x_clean, contentdict = cityclean(train_x_new)

    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, id = extractID(test_x)
    test_x_clean, contentdict = cityclean(test_x_new, contentdict)
    del contentdict
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"

    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_clean)
    test_x_uniq = sel.transform(test_x_clean)
    #     indices = [i for i in range(len(train_x[0]))]
    #     frqIndex = trimfrq(train_x)
    #     for i in frqIndex:
    #         indices.remove(i)
    #     train_x_uniq = indexTodata(train_x, indices)
    #     test_x_uniq = indexTodata(test_x, indices)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    if args.fts == 'cor':
        train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y,
                                                    test_x_nor)
    elif args.fts == 'extraTrees':
        train_x_sel, test_x_sel = ExtraTreesSelect(train_x_nor, train_y,
                                                   test_x_nor)
    elif args.fts == 'randomTree':
        train_x_sel, test_x_sel = randomTreesSelect(train_x_nor, train_y,
                                                    test_x_nor)
    else:
        train_x_sel = copy.deepcopy(train_x_nor)
        test_x_sel = copy.deepcopy(test_x_nor)
    print len(train_x_nor[0])
    print len(train_x_sel[0])

    del train_x_nor, test_x_nor, train_x_uniq, test_x_uniq
    print "modeling"
    if args.model == 'gBoosting':
        clf = GradientBoostingClassifier(
            loss=args.loss,
            learning_rate=args.lrate,
            n_estimators=args.nest,
            max_depth=args.maxdepth,
            min_samples_split=args.minsamplessplit,
            verbose=1)
    elif args.model == 'randomForest':
        clf = RandomForestClassifier(n_estimators=args.nest,
                                     class_weight='auto')
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p, Acc_n, Acc_all = get_Accs(train_y, train_pdt)
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p), str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_sel)
    #     MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt)
    #     print "MCC, Acc_p , Acc_n, Acc_all(test): "
    #     print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))

    fout = open(fn, 'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])), str(test_pdt[index][1])))
    fout.close()
Example no. 14
# -*- coding: utf-8 -*-
"""
Created on Tue May 29 18:16:50 2018

@author: juanferna.perez
"""

from pandas import DataFrame
from sklearn.feature_selection import VarianceThreshold
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
Xdf= DataFrame(X)
print(Xdf.describe())
print(Xdf.var(ddof=0))
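# VarianceThreshold compares population variances (ddof=0, as printed above); the selector
# below keeps only the iris columns whose variance exceeds 0.8*0.8 = 0.64.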

selector = VarianceThreshold(threshold=(0.8*0.8 ))
selector.fit(X)
print(selector.get_support())

Xbar = selector.transform(X)
print(Xbar)



Example no. 15
def remove_low_var_features(xtrain, xtest):
    selector = VarianceThreshold()
    xtrain = selector.fit_transform(xtrain)
    xtest = selector.transform(xtest)
    return xtrain, xtest, selector.get_support(indices=True)
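For reference, a minimal made-up call (assumes numpy and the sklearn import used above are available):

import numpy as np

xtrain = np.array([[0.0, 1.0], [0.0, 2.0], [0.0, 3.0]])  # first column is constant
xtest = np.array([[9.0, 4.0]])
xtr, xte, kept = remove_low_var_features(xtrain, xtest)
print(kept)   # [1]: only the varying column is retained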
Example no. 16
def sample_data(X, Y, value):  # signature inferred from the call to sample_data(...) below
    XX = []
    for i in xrange(len(Y)):
        if Y[i] == value:
            XX.append(X[i])
    return XX


out = open(sys.argv[1], "r")
model = svm.OneClassSVM(kernel='rbf')
X, Y = read_fea(sys.argv[1])
sel = VarianceThreshold(threshold=0)
model.fit(sample_data(sel.fit_transform(X), Y, 1))
warning("useful features dim: " + str(len(sel.get_support(True))))
if hasattr(model, 'score'):
    warning("accuracy on training set: " +
            str(model.score(sel.transform(X), Y)))
    if len(sys.argv) > 2:
        X, Y = read_fea(sys.argv[2])
        warning("accuracy on cv set: " + str(model.score(sel.transform(X), Y)))

    if len(sys.argv) > 3:
        X, Y = read_fea(sys.argv[3])
        warning("accuracy on dev set: " +
                str(model.score(sel.transform(X), Y)))

if len(sys.argv) > 4:
    ref = model.decision_function(sel.transform(X))
    X, Y = read_fea(sys.argv[4], True)
    Z = model.decision_function(sel.transform(X)).tolist()
    Z = (Z - ref.mean()) / ref.std()
    for i in xrange(len(Y)):
Example no. 17
def compute(train, test):

  #Train data
  train_X              = [];
  train_restaurant_ids = [];
  test_X               = [];
  test_restaurant_ids  = [];
  train_Y              = [];

  #Common feature values in train/test
  train_feature_val    = {};
  test_feature_val     = {};

  build_FeatureVal(train, train_feature_val);
  build_FeatureVal(test, test_feature_val);
 
  buildFeatures(train, train_feature_val, test_feature_val, train_X, train_Y, train_restaurant_ids, "train");
  buildFeatures(test, train_feature_val, test_feature_val, test_X, None, test_restaurant_ids, "test");


  train_Y = np.array(train_Y);

  enc = OneHotEncoder(categorical_features=np.array([3,4,5,32,33,34,35,36,37,38,39,40,41,42]), sparse=False, n_values=100);

  enc.fit(test_X);

  train_X = enc.transform(train_X);
  test_X  = enc.transform(test_X);

  print("No of train features " +  str(len(train_X[0])));
  print("No of test features " +  str(len(test_X[0])));

  #Remove features with similar values
  selector = VarianceThreshold();
  selector.fit(train_X);
  train_X = selector.transform(train_X);
  test_X = selector.transform(test_X);

  print("No of train features " +  str(len(train_X[0])));
  print("No of test features " +  str(len(test_X[0])));

  
  parameters_to_try = generateParams();
  print("No of Paramters to test " + str(len(parameters_to_try)));

  #Construct parameters as a list
  models_to_try     = [ (copy.copy(train_X), copy.copy(train_Y), parameters_to_try[i] ) for i in range(0, len(parameters_to_try)) ];

  #Create a process pool.
  pool              = Pool(8);
  results           = pool.map( train_model_wrapper, models_to_try );

  pool.close();
  pool.join();


  best_params       = None;
  best_rmse         = sys.float_info.max;
  for i in range(0, len(results)):
    if results[i][1] < best_rmse:
        best_rmse   = results[i][1];
        best_params = results[i][0];

  print("Best Params : " + str(best_params));
  print("Best RMSE :   " + str(best_rmse));

  #estimator               = SVR(**params)
  #estimator               = RandomForestRegressor(**best_params)
  estimator                = GradientBoostingRegressor(**best_params)


  estimator.fit(train_X, train_Y);

  print("Writing Output");
  predict_and_save(estimator, test_X, test_restaurant_ids);
Example no. 18
    def preprocess(self):
        print 'Preprocess...'
        print 'Start: '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')        
        data = self.data.copy()
        label = self.label.copy()
        
        m = data.shape[0]
        print data['MarriageStatus'].dtype
        
        #fillna
        for i in data.columns:
            if i!='AppId' and i!='InstallmentStartedOn':
                if data[i].hasnans:
                    t0=pd.DataFrame(np.ones((data.shape[0],1),dtype=np.int),columns=[i+'_Ex'],index=data.index)
                    ind0=data[data[i].isnull()].index
                    t0.ix[ind0]=0
                    data[i+'_Ex']=t0
                    
                    if data[i].dtype==np.object:
                        if data[i].value_counts().sort_values().shape[0]>0:
                            data[i].fillna(data[i].value_counts().sort_values().index[-1],inplace=True,downcast='infer')
                        else:
                            data[i].fillna('0',inplace=True,downcast='infer')
                    else:
                        if np.isnan(data[i].mean())==False:
                            data[i].fillna(data[i].mean(),inplace=True,downcast='infer')
                        else:
                            data[i].fillna(0,inplace=True,downcast='infer')
                            
        train,train_label,test,test_label=self.split(data,label)

        self.raw_train=train.copy()
        self.raw_train_label=train_label.copy()
        self.raw_test=test.copy()
        self.raw_test_label=test_label.copy()
        
        #delete AppId and InstallmentStartedOn
        data.drop(['AppId','InstallmentStartedOn'],axis=1,inplace=True)
        train.drop(['AppId','InstallmentStartedOn'],axis=1,inplace=True)
        test.drop(['AppId','InstallmentStartedOn'],axis=1,inplace=True)
        
        data.reset_index(inplace=True,drop=True)
        train.reset_index(inplace=True,drop=True)
        test.reset_index(inplace=True,drop=True)
        
        #preprocess 
        enc0=LabelEncoder()
        enc1 = OneHotEncoder()
        scaler = MinMaxScaler()
        
        for i in train.columns:
            if train[i].dtype==np.object:
                t0=enc0.fit_transform(train[i].values.reshape(-1,1))
                t1=enc1.fit_transform(t0.reshape(-1,1)).toarray()
                tf=pd.DataFrame(t1,index=train.index)
                tf.rename(columns=lambda x: i+'_'+str(x)+'_E', inplace=True)
                train.drop(i,inplace=True,axis=1)
                train=train.join(tf,how='inner')

                clas = enc0.classes_
                if test[i][~test[i].isin(clas)].size != 0:
                    ind = test[i][~test[i].isin(clas)].index
                    test[i].iloc[ind] = clas[0]
                    
                t0=enc0.transform(test[i].values.reshape(-1,1))
                t1=enc1.transform(t0.reshape(-1,1)).toarray()
                tf=pd.DataFrame(t1,index=test.index)
                tf.rename(columns=lambda x: i+'_'+str(x)+'_E', inplace=True)
                test.drop(i,inplace=True,axis=1)
                test=test.join(tf,how='inner')              
            else:
                tt0=train[i].values.reshape(-1,1)
                tt0_s=scaler.fit_transform(tt0)
                train[i+'_S']=tt0_s
                train.drop(i,inplace=True,axis=1)               
               
                tt2=test[i].values.reshape(-1,1)
                tt2_s=scaler.transform(tt2)      
                test[i+'_S']=tt2_s
                test.drop(i,inplace=True,axis=1)
        
        #feature selection
        sel = VarianceThreshold(threshold=0.0002)
        train_new=sel.fit_transform(train)
        sup=sel.get_support()
        features=train.columns.tolist()
        for i in xrange(train.shape[1]):
            if sup[i]==False:
                features.remove(train.columns[i])
        
        train=pd.DataFrame(train_new,columns=features)
        
        test_new=sel.transform(test)
        test=pd.DataFrame(test_new,columns=features)
        
        self.train=train.copy()
        self.train_label=train_label.copy()
        self.test=test.copy()
        self.test_label=test_label.copy()
        
        print 'End: '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')        
        return train,train_label,test,test_label
Example no. 19
selector = VarianceThreshold()
feature_train = selector.fit_transform(feature_train)
scaler = preprocessing.StandardScaler().fit(feature_train)
feature_train = scaler.transform(feature_train)

feature_ER = np.zeros((length_ER, num_feature))
textfile = open(data_path + "ER_" + feature_type)
count = 0
while length_ER > count:
    x = textfile.readline()
    x = x.strip()
    result = np.array([list(map(float, x.split()))])
    feature_ER[count, ] = result
    count = count + 1

feature_ER = selector.transform(feature_ER)
feature_ER = scaler.transform(feature_ER)

feature_GPCR = np.zeros((length_GPCR, num_feature))
textfile = open(data_path + "GPCR_" + feature_type)
count = 0
while length_GPCR > count:
    x = textfile.readline()
    x = x.strip()
    result = np.array([list(map(float, x.split()))])
    feature_GPCR[count, ] = result
    count = count + 1

feature_GPCR = selector.transform(feature_GPCR)
feature_GPCR = scaler.transform(feature_GPCR)
Example no. 20
# Import data-sets
train_exp, train_cnv, train_ess, leader_exp, leader_cnv, prioritized_genes = read_data_sets()

# Setup
genes = train_ess.axes[1]
samples = leader_exp.axes[0]
predictions = DataFrame(None, index=genes, columns=samples)
spearman = make_scorer(spearm_cor_func, greater_is_better=True)

X_train_pre = train_exp
X_test_pre = leader_exp

# Filter by coefficient of variation
var_thres = VarianceThreshold(best_var).fit(X_train_pre)
X_train_pre = var_thres.transform(X_train_pre)
X_test_pre = var_thres.transform(X_test_pre)

for gene in genes:
    # Assemble prediction variables
    X_train = X_train_pre
    y_train = train_ess.ix[:, gene]
    X_test = X_test_pre

    # Feature selection
    fs = SelectKBest(f_regression, k=best_k).fit(X_train, y_train)
    X_train = fs.transform(X_train)
    X_test = fs.transform(X_test)

    # Estimation
    clf = PassiveAggressiveRegressor(epsilon=best_epsilon, n_iter=best_n_iter).fit(X_train, y_train)
Example no. 21
def get_low_variance_columns(dframe=None, columns=None, skip_columns=None, thresh=0.0, autoremove=False):
    """
    Wrapper for sklearn VarianceThreshold for use on pandas dataframes.
    """
    print("Finding low-variance features.")
    try:
        # get list of all the original df columns
        all_columns = dframe.columns

        # remove `skip_columns`
        remaining_columns = all_columns.drop(skip_columns)

        # get length of new index
        max_index = len(remaining_columns) - 1

        # get indices for `skip_columns`
        skipped_idx = [all_columns.get_loc(column) for column in skip_columns]

        # adjust insert location by the number of columns removed
        # (for non-zero insertion locations) to keep relative
        # locations intact
        for idx, item in enumerate(skipped_idx):
            if item > max_index:
                diff = item - max_index
                skipped_idx[idx] -= diff
            if item == max_index:
                diff = item - len(skip_columns)
                skipped_idx[idx] -= diff
            if idx == 0:
                skipped_idx[idx] = item

        # get values of `skip_columns`
        skipped_values = dframe.iloc[:, skipped_idx].values

        # get dataframe values
        X = dframe.loc[:, remaining_columns].values

        # instantiate VarianceThreshold object
        vt = VarianceThreshold(threshold=thresh)

        # fit vt to data
        vt.fit(X)

        # get the indices of the features that are being kept
        feature_indices = vt.get_support(indices=True)

        # remove low-variance columns from index
        feature_names = [remaining_columns[idx] for idx, _ in enumerate(remaining_columns) if idx in feature_indices]

        # get the columns to be removed
        removed_features = list(np.setdiff1d(remaining_columns, feature_names))
        print("Found {0} low-variance columns.".format(len(removed_features)))

        # remove the columns
        if autoremove:
            print("Removing low-variance features.")
            # remove the low-variance columns
            X_removed = vt.transform(X)

            print("Reassembling the dataframe (with low-variance " "features removed).")
            # re-assemble the dataframe
            dframe = pd.DataFrame(data=X_removed, columns=feature_names)

            # add back the `skip_columns`
            for idx, index in enumerate(skipped_idx):
                dframe.insert(loc=index, column=skip_columns[idx], value=skipped_values[:, idx])
            print("Successfully removed low-variance columns.")

        # do not remove columns
        else:
            print("No changes have been made to the dataframe.")

    except Exception as e:
        print(e)
        print("Could not remove low-variance features. Something " "went wrong.")
        pass

    return dframe
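A hedged usage sketch for the wrapper above; the toy DataFrame and column names are invented for illustration, and pandas/numpy are assumed to be imported as in the snippet:

import pandas as pd

df = pd.DataFrame({
    'id': [1, 2, 3, 4],               # kept aside via skip_columns
    'constant': [7, 7, 7, 7],         # zero variance, should be removed
    'varying': [0.1, 2.3, 4.5, 6.7],
})
cleaned = get_low_variance_columns(dframe=df, skip_columns=['id'],
                                   thresh=0.0, autoremove=True)
print(cleaned.columns.tolist())       # expected: ['id', 'varying']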
Example no. 22
features_cols = train.columns[1:]  # assumption: the original definition was cut off; label is column 0, features are the rest
labels_cols = train.columns[0]

features_train = train[features_cols]
labels_train = train[labels_cols]


features_test = test

#Create cross-validation set
train_X, test_X, train_y, test_y = cross_validation.train_test_split(features_train, labels_train, test_size = 0.2, random_state=0)


#Feature selection
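#For boolean features, .8*(1-.8) = 0.16 is the Bernoulli variance at p = 0.8, so this drops features that take one value in at least 80% of samples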
sel = VarianceThreshold(threshold=(.8*(1-.8)))
sel.fit(train_X[:5000])
train_X = sel.transform(train_X)
test_X = sel.transform(test_X)

#Create and train classifier
clf = GaussianNB()

clf.fit(train_X, train_y)

#Get accuracy score
pred_train = clf.predict(train_X[:5000])
pred_test = clf.predict(test_X[:5000])

accuracy_train = accuracy_score(train_y[:5000], pred_train)
accuracy_test = accuracy_score(test_y[:5000], pred_test)

print('Accuracy score on training data is: ' + str(accuracy_train))
print('Accuracy score on testing data is: ' + str(accuracy_test))
Example no. 23
	tmpList, par = mult_clean_list(el[i],t)
	cln.extend(tmpList)
	t += 1
print "HERE"

X = []
print sz

for j in range(0,sz):
	tmp=[]
	for i in range(0,len(cln)):
		if(parList[i] not in blockList):
			tmp.append(cln[i][j])
	X.append(tmp)

X = sel.transform(X)

for j in np.array(X[:3]):
	print j

py = clf.predict(X)

ans = []

for i in range(0,len(ids)):
	ans.append([ids[i],py[i]])

ans.sort(key=lambda x: x[0])

with open('results.csv', 'wb') as testfile:
    csv_writer = csv.writer(testfile)
Example no. 24
def remove_features_with_low_variance(x_data):
    variance = VarianceThreshold(threshold=1.4)
    print ('before transform', len(x_data[4]), x_data[4])
    variance.fit(x_data)
    transformed_x = variance.transform(x_data)
    print ('after transform', len(transformed_x[4]), transformed_x[4])
    return transformed_x
Example no. 25
    X_train, y_train = rus.fit_sample(X_train, y_train)
    radioFeat_train = copy.deepcopy(X_train[:, :1692])
    clinical_semanticFeat_train = copy.deepcopy(X_train[:, 1692:])
    radioFeat_test = copy.deepcopy(X_test.iloc[:, :1692])
    clinical_semanticFeat_test = copy.deepcopy(X_test.iloc[:, 1692:])
    print('------------------ Start feature selection ---------------------')
    print('Number of original radiomics features: {}'.format(radioFeat_train.shape[1]))
    print('Number of original clinical_semantic features: {}'.format(
        clinical_semanticFeat_train.shape[1]))

    ################## Variance-based feature selection ################
    from sklearn.feature_selection import VarianceThreshold  # import the relevant sklearn module
    vad = VarianceThreshold(
        threshold=0.01)  # removes features whose variance is below the threshold (Removing features with low variance)
    radioFeat_train = vad.fit_transform(radioFeat_train)  # returns the selected feature matrix
    radioFeat_test = vad.transform(radioFeat_test)
    print('train_test_split_seed={} number of radiomics features kept by variance selection: {}'.format(
        seeds, radioFeat_train.shape[1]))

    ###################### Scale features to [-1, 1] #####################
    # max_abs_scaler = preprocessing.MaxAbsScaler()
    # max_abs_scaler.fit(xmantrain)
    # xabstrain = max_abs_scaler.transform(xmantrain)
    # xabstest = max_abs_scaler.transform(xmantest)
    ################## Variance-based feature selection ################
    # from sklearn.feature_selection import VarianceThreshold  # import the relevant sklearn module
    # sel = VarianceThreshold(threshold=0.01)  # removes features whose variance is below the threshold (Removing features with low variance)
    # ss = sel.fit(xmantrain)  # returns the fitted selector
    # xvartrain = sel.transform(xmantrain)
    # xvartest = sel.transform(xmantest)
    # print("Number of features after variance-based selection:", xvartest.shape[1])
Example no. 26
#############################################################################
#
# Feature Selection
#
##########################################

#Low Variance Filter
if lv_filter == 1:
    print('--LOW VARIANCE FILTER ON--', '\n')

    #LV Threshold
    sel = VarianceThreshold(
        threshold=0.5)  #Removes any feature with variance below 0.5
    fit_mod = sel.fit(data_np)
    fitted = sel.transform(data_np)
    sel_idx = fit_mod.get_support()

    #Get lists of selected and non-selected features (names and indexes)
    temp = []
    temp_idx = []
    temp_del = []
    for i in range(len(data_np[0])):
        if sel_idx[i] == 1:  #Selected Features get added to temp header
            temp.append(header[i + feat_start])
            temp_idx.append(i)
        else:  #Indexes of non-selected features get added to delete array
            temp_del.append(i)

    print('Selected', temp)
    print('Features (total, selected):', len(data_np[0]), len(temp))
Example no. 27
	def fit(self, x_or, y, w=None):
		""" Fits upper and lower bounds on p(y|x) """

		if self.standardize:
			xselector = VarianceThreshold(threshold=.1).fit(x_or)
			temp_x = xselector.transform(x_or)
			xscaler = StandardScaler().fit(temp_x)
			self.xscaler = lambda x: xscaler.transform(xselector.transform(x))
			x = self.xscaler(x_or)
		else:
			x = x_or.copy()

		if self.kernel == 'linear':
			self.kernel_fit = lambda x: x
			x = self.kernel_fit(x)

		elif self.kernel == 'poly':
			if self.p is None:
				raise ValueError('Need polynomial value')

			self.kernel_fit = lambda x: np.hstack([x**i for i in range(1, self.p + 1)])
			x = self.kernel_fit(x)

		elif self.kernel == 'rbf':
			if self.sig is None:
				raise ValueError('Need Length scale value')
			self.x_tr = x.copy()
			self.kernel_fit = lambda x_ts: RBF(length_scale=self.sig).__call__(x_ts,
				self.x_tr)
			x = self.kernel_fit(x)

		elif self.kernel == 'rbf_approx':
			if self.sig is None:
				raise ValueError('Need Length scale value')

			rbf_fit = RBFSampler(gamma=1 / self.sig, n_components=50).fit(x.copy())
			self.kernel_fit = lambda x_ts: rbf_fit.transform(x_ts)
			x = self.kernel_fit(x)

		n, d = x.shape[0], x.shape[1]
		mdl = grb.Model("qp")
		mdl.ModelSense = 1
		mdl.setParam('OutputFlag', False)
		mdl.reset()

		L = 1e5
		us = [mdl.addVar(name="u%d" % i, lb=-L, ub=L) for i in range(n)]
		ls = [mdl.addVar(name="l%d" % i, lb=-L, ub=L) for i in range(n)]
		bsU = [mdl.addVar(name="bu%d" % i, lb=-L, ub=L) for i in range(d + 1)]
		bsL = [mdl.addVar(name="bl%d" % i, lb=-L, ub=L) for i in range(d + 1)]
		rUs = [mdl.addVar(name="ru%d" % i, lb=0, ub=L) for i in range(n)]
		rLs = [mdl.addVar(name="rl%d" % i, lb=0, ub=L) for i in range(n)]

		slackU = 0
		slackL = 0

		if w is None:
			w = np.ones(n) / n

		obj_terms = []
		for i in range(n):
			mdl.addConstr(us[i] >= ls[i])

			mdl.addConstr(us[i] == np.dot(x[i, ], bsU[:d]) + bsU[-1])
			mdl.addConstr(ls[i] == np.dot(x[i, ], bsL[:d]) + bsL[-1])

			mdl.addConstr(rUs[i] >= y[i] - us[i])
			mdl.addConstr(rLs[i] >= ls[i] - y[i])

			slackU += w[i] * rUs[i]
			slackL += w[i] * rLs[i]

			if self.loss == 'square':
				obj_terms.append(w[i] * (us[i] - ls[i]) * (us[i] - ls[i]))
			elif self.loss == 'linear':
				if self.agg == 'max':
					obj_terms.append((us[i] - ls[i]))
				else:
					obj_terms.append(w[i] * (us[i] - ls[i]))

			else:
				raise Exception('Unrecognized loss: %s' % self.loss)

		if self.agg == 'max':
			o = mdl.addVar(name="o", lb=-L, ub=L)
			os = []
			for i in range(n):
				oi = mdl.addVar(name="o%d" % i, lb=-L, ub=L)
				mdl.addConstr(oi == obj_terms[i])
				os += [oi]
			mdl.addConstr(o == grb.max_(os))
			obj = o
		else:
			obj = grb.quicksum(obj_terms)

		# ----add the values of the objectives
		obj_reg_u, obj_reg_l = 0, 0
		for k in range(d):
			obj_reg_u += bsU[k] * bsU[k]
			obj_reg_l += bsL[k] * bsL[k]

		obj_reg = self.alphau * obj_reg_u + self.alphal * obj_reg_l

		mdl.addConstr(slackU <= self.lamdau)
		mdl.addConstr(slackL <= self.lamdal)
		obj_f = obj + obj_reg

		mdl.setObjective(obj_f)
		mdl.optimize()

		self.bu = np.array([bsU[j].x for j in range(d + 1)])
		self.bl = np.array([bsL[j].x for j in range(d + 1)])

		# print(obj.getValue(), obj_slack.getValue())

		return self
Example no. 28
__author__ = 'pierregagliardi'

import numpy as np
import pickle
from sklearn.feature_selection import VarianceThreshold

from projet_sentiment_analysis.code.utilities import extract_data

if __name__ == "__main__":

    general_path = '/Users/pierregagliardi/DossierTravail/Programmation/PythonPath/projet_sentiment_analysis/'
    path_to_training_set = general_path + 'training_set_60000/training_set_unigram_all_features/'
    path_to_pickle = general_path + 'pickle_hyper_parameters/'

    (X_train, y_train, X_test, y_test, number_training,
     number_testing) = extract_data.extract_training_and_testing_set(
         path_to_training_set + 'metrics_training_set_7000.data',
         path_to_training_set + 'metrics_testing_set_7000.data')
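    # The .999*(1-.999) threshold below drops features that take a single value in at
    # least 99.9% of the samples (Bernoulli variance at p = 0.999).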

    sel = VarianceThreshold(threshold=(.999 * (1 - .999)))
    X_train = sel.fit_transform(X_train)
    X_test = sel.transform(X_test)

    with open(path_to_pickle + 'metrics_60000_all_features_7000.pkl',
              'wb') as fid:
        pickle.dump((X_train, y_train, X_test, y_test), fid)
Example no. 29
def main():
    train_data = pd.read_csv(train_path, index_col = 'Id')
    kaggl_data = pd.read_csv(kaggl_path,  index_col = 'Id')

    # Train/Test Split

    X = train_data.drop('SalePrice', axis=1)
    y = train_data['SalePrice']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

    print('Training data has {} rows.'.format(X_train.shape[0]))
    print('Testing data has {} rows.'.format(X_test.shape[0]))
    print('Kaggle data has {} rows.'.format(kaggl_data.shape[0]))

    # Manual Feature Engineering
    print('Manual Feature Engineering...')

    # Create an 'EDA' dataframe we'll use to do some exploring
    EDA = X_train.copy()
    EDA['SalePrice'] = y_train

    # There are 27 neighborhoods. Let's put them into groups of 9:
    neighborhood_ranks = EDA.groupby('Neighborhood')['SalePrice'].mean().sort_values().index

    low_neigh  = neighborhood_ranks[:9]
    mid_neigh  = neighborhood_ranks[9:18]
    high_neigh = neighborhood_ranks[18:]

    def manual_feature_eng(data):
        '''Some basic manual feature engineering based on EDA of X_train'''
        eng_data = data.copy()
        # Years info:
        eng_data['Years_Old'] = 2018 - eng_data['Year Built']
        eng_data['Garage Age'] = 2018 - eng_data['Garage Yr Blt']
        eng_data['Years Since Sale'] = 2018 - eng_data['Yr Sold']
        eng_data['Years Since Remodel'] = 2018 - eng_data['Year Remod/Add']
        eng_data.drop(['Year Built','Garage Yr Blt','Yr Sold','Year Remod/Add'],
                    axis=1, inplace=True)
        # Neighborhood info:
        eng_data['High_Neigh'] = [1 if x in high_neigh else 0 for x in eng_data['Neighborhood']]
        eng_data['Mid_Neigh'] = [1 if x in mid_neigh else 0 for x in eng_data['Neighborhood']]
        eng_data['Low_Neigh'] = [1 if x in low_neigh else 0 for x in eng_data['Neighborhood']]
        eng_data.drop('Neighborhood', axis=1, inplace=True)
        
        # Is there miscellaneous furniture?
        eng_data['MiscFurn'] = eng_data['Misc Val'] > 0
        return eng_data

    X_train = manual_feature_eng(X_train)
    X_test = manual_feature_eng(X_test)
    kaggl_data = manual_feature_eng(kaggl_data)

    # Data Preprocessing: Categorical Data
    print('Processing Categorical Data...')

    # Before we begin, let's check to see if there are any columns in the Kaggle 
    # set that aren't in the training set:

    assert [col for col in kaggl_data.columns if col not in X_train.columns] == []

    # And vice versa:

    assert [col for col in X_train.columns if col not in kaggl_data.columns] == []

    # All of our preprocessing will ultimately go here:
    def preprocessing(data):
        try:
            cleaned_data = data.drop('PID', axis=1)
        except:
            cleaned_data = data
        fillna_dict = {
            'Pool QC':'No Pool',
            'Alley':'No Alley',
            # Let's let the get_dummies drop 'Misc Features' if NA
            'Fence':'No Fence',
            'Fireplace Qu':'No Fireplace',
            # Lot frontage can be mean imputed
            'Garage Finish': 'No Garage',
            'Garage Qual': 'No Garage',
            'Garage Cond': 'No Garage',
            'Garage Type': 'No Garage',
            'Bsmt Exposure':'No Basement',
            'BsmtFin Type 2':'No Basement',
            'BsmtFin Type 1':'No Basement',
            'Bsmt Cond':'No Basement',
            'Bsmt Qual':'No Basement',
            'Mas Vnr Type':'No Mas Vnr'        
        }
        
        cleaned_data = cleaned_data.fillna(fillna_dict)
        
        return(cleaned_data)
        
    X_train = preprocessing(X_train)
    X_test  = preprocessing(X_test)
    kaggl_data = preprocessing(kaggl_data)

    # Grab the string columns:
    string_cols = X_train.select_dtypes(exclude=[np.number]).columns

    # Get some dummies:
    X_train = pd.get_dummies(X_train, columns=string_cols)
    X_test = pd.get_dummies(X_test, columns=string_cols)
    kaggl_data = pd.get_dummies(kaggl_data, columns=string_cols)

    # Addressing Column Mismatch After Dummifying
    print('Addressing column mismatch...')

    # Add columns of zeros to test and kaggle sets for columns that *do* appear in
    # the training set.

    model_cols = X_train.columns

    def add_model_cols(data, model_cols):
        new_data = data.copy()
        for missing_col in [col for col in model_cols if col not in data.columns]:
            new_data[missing_col] = 0
        return new_data

    X_test = add_model_cols(X_test, model_cols=model_cols)
    kaggl_data = add_model_cols(kaggl_data, model_cols=model_cols)

    # Now, let's only consider columns in X_test and kaggl_data that appear in
    # the training set. We'll call these 'model columns':

    kaggl_data = kaggl_data[model_cols]
    X_test     = X_test[model_cols]

    # Make sure we've done this correctly:
    assert X_train.shape[1] == X_test.shape[1] == kaggl_data.shape[1]
    assert X_train.columns.all() == X_test.columns.all() == kaggl_data.columns.all()

    # Imputing Numerical Missing Data: Handling Numerical Data
    print('Imputing missing numerical data...')

    imp = Imputer(strategy='mean')
    imp.fit(X_train)
    X_train = imp.transform(X_train)
    X_test  = imp.transform(X_test)
    kaggl_data = imp.transform(kaggl_data)

    def array_null_check(array):
        '''Turns an array into a dataframe so that we can check for null values'''
        return pd.DataFrame(array).isnull().sum().sum()

    assert array_null_check(X_train) == array_null_check(X_test) == array_null_check(kaggl_data)

    # Brute Force Feature Engineering

    if brute:
        print('Brute force feature engineering...')
        pf = PolynomialFeatures(interaction_only=interaction_only)
        X_train = pf.fit_transform(X_train)
        X_test  = pf.transform(X_test)
        kaggl_data = pf.transform(kaggl_data)

    # Maybe this is too many columns???
    print('X_train has:\n---{} rows\n---{} columns'.format(X_train.shape[0], X_train.shape[1]))

    # Scaling
    print('Scaling all columns...')

    ss = StandardScaler()
    X_train = ss.fit_transform(X_train)
    X_test  = ss.transform(X_test)
    kaggl_data = ss.transform(kaggl_data)

    # Feature Elimination


    if brute:
        print('Performing automatic feature elimination')
        # Only do feature elimination if feature engineering happened by brute force
        feature_variances = np.apply_along_axis(np.var, axis=0, arr= X_train)

        # Define a percentile threshold. Do I want the top 1% of features by variance?
        perc_thresh = np.percentile(feature_variances, 99)

        vt = VarianceThreshold(threshold=perc_thresh)
        X_train_reduced = vt.fit_transform(X_train)
        X_test_reduced  = vt.transform(X_test)
        kaggl_reduced   = vt.transform(kaggl_data)
        print('X_train now has:\n---{} rows\n---{} columns'.format(X_train_reduced.shape[0], X_train_reduced.shape[1]))
    else:
        X_train_reduced = X_train
        X_test_reduced  = X_test
        kaggl_reduced   = kaggl_data

    # Or do I want to select the top 1% of features according 
    # to the f_regression function?

    # sp = SelectPercentile(score_func=f_regression, percentile = 1)
    # X_train_reduced = sp.fit_transform(X_train, y_train)
    # X_test_reduced  = sp.transform(X_test)
    # kaggl_reduced   = sp.transform(kaggl_data)
    # print(X_train.shape[1])

    ## Modeling

    # Linear Regression

    if run_lin:
        lin = LinearRegression()
        lin.fit(X_train_reduced, y_train)
        cv_scores = cross_val_score(lin, X_train_reduced, y_train, cv=3).mean()

        print('{} model has average performance of {}'
            .format(str(lin).split('(')[0], cv_scores.mean()))

    # Ridge Regression

    if run_ridge:
        rid = RidgeCV()
        rid.fit(X_train_reduced, y_train)
        cv_scores = cross_val_score(rid, X_train_reduced, y_train, cv=3).mean()

        print('{} model has average performance of {}'
            .format(str(rid).split('(')[0], cv_scores.mean()))

    # Lasso Regression

    if run_las:
        # Define a reasonable range of alphas based on previous LASSO fits:
        alphas = np.logspace(2,4,20)
        las = LassoCV(alphas=alphas, n_jobs=-1)
        las.fit(X_train_reduced, y_train)
        cv_scores = cross_val_score(las, X_train_reduced, y_train, cv=3).mean()
        best_alpha = las.alpha_
        print('{} model has average performance of {}'
            .format(str(las).split('(')[0], cv_scores.mean()))

    las = Lasso(alpha=best_alpha, max_iter=2000)
    cv_scores = cross_val_score(las, X_train_reduced, y_train, cv=3).mean()
    las.fit(X_train_reduced, y_train)
    print('{} model has average performance of {}'
        .format(str(las).split('(')[0], cv_scores.mean()))

    # ElasticNet Regression

    if run_elnet:
        elnet = ElasticNetCV(n_alphas=10)
        elnet.fit(X_train_reduced, y_train)
        cv_scores = cross_val_score(elnet, X_train_reduced, y_train, cv=3).mean()

        print('{} model has average performance of {}'
            .format(str(elnet).split('(')[0], cv_scores.mean()))

    # Final Model Test

    models = {}

    try:
        lin_score = lin.score(X_test_reduced, y_test)
        models[lin_score] = lin
        print('Test set performance of {}: {}'.format(str(lin).split('(')[0],lin_score))
    except:
        pass    

    try:
        rid_score = rid.score(X_test_reduced, y_test)
        models[rid_score] = rid
        print('Test set performance of {}: {}'.format(str(rid).split('(')[0],rid_score))
    except:
        pass    

    try:
        las_score = las.score(X_test_reduced, y_test)
        models[las_score] = las
        print('Test set performance of {}: {}'.format(str(las).split('(')[0],las_score))
    except:
        pass          

    try:
        elnet_score = elnet.score(X_test_reduced, y_test)
        models[elnet_score] = elnet
        print('Test set performance of {}: {}'.format(str(elnet).split('(')[0],elnet_score))
    except:
        pass   

    high_score = max(models.keys())
    print('Best performing model was {},\nwith test set performance of {}'.format(
        str(models[high_score]).split('(')[0], round(high_score,5)))

    # Choosing a Model and Outputting Submission:

    # Choose a model based on test set performance:
    chosen_model = models[high_score]

    if submission_path:

        kaggl_preds = chosen_model.predict(kaggl_reduced)

        kaggl_id = pd.read_csv('data/test.csv')['Id']

        sample_submission = pd.read_csv('data/sample_submission.csv')
        submission_columns= sample_submission.columns

        submission = pd.DataFrame({submission_columns[0]:kaggl_id,
                                submission_columns[1]:kaggl_preds})

        submission.to_csv(submission_path, index=False) 
Example no. 30
def main():

    df = joblib.load('modelDataset.pkl')

    # Split dataframe into features and target
    y = df.iloc[:, 1]  # .as_matrix()
    X = df.iloc[:, 2:]  # .as_matrix()
    id = df.iloc[:, 0]

    # Scalings
    sc = StandardScaler()

    # Apply scaler
    colNames = X.columns
    X = sc.fit_transform(X)
    X = pd.DataFrame(X, columns=colNames)

    # Remove features with variance below 0.16 (a 0/1 feature constant in >80% of samples)
    colNames = X.columns
    sel = VarianceThreshold(threshold=0.16)
    X = sel.fit_transform(X)
    # Get column names back
    newCols = []
    for remain, col in zip(sel.get_support(), colNames):
        if remain == True:
            newCols.append(col)
    X = pd.DataFrame(X, columns=newCols)

    # Perform univariate feature selection (ANOVA F-values)
    colNames = X.columns
    selection_Percent = SelectPercentile(percentile=5)
    X = selection_Percent.fit_transform(X, y)
    # Get column names back
    newCols = []
    for remain, col in zip(selection_Percent.get_support(), colNames):
        if remain == True:
            newCols.append(col)
    X = pd.DataFrame(X, columns=newCols)

    # Perform tree-based feature selection
    clf = ExtraTreesRegressor()
    clf = clf.fit(X, y)
    colNames = X.columns
    sel = SelectFromModel(clf, prefit=True)
    X = sel.transform(X)
    newCols = []
    for remain, col in zip(sel.get_support(), colNames):
        if remain == True:
            newCols.append(col)
    X = pd.DataFrame(X, columns=newCols)

    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=1555)

    def testRegressor(clf):
        '''
        #RF grid
        param_grid = [{'n_estimators': range(320, 350, 10),
                       'min_samples_split': range(2, 20, 2),
                       'min_samples_leaf': range(2, 20, 2),
                       'max_leaf_nodes': range(140, 170, 5)
                       }]
        grid = GridSearchCV(clf, param_grid, cv=3, verbose=1, n_jobs=-1)
        fitted_classifier = grid.fit(X_train, y_train)
        print(grid.best_score_, grid.best_params_)
        predictions = fitted_classifier.predict(X_train)'''
        '''
        #XGB tuning - concept, not in use
        param_grid = [{'max_depth': range(2, 4, 1),
                       'min_child_weight': range(3, 6, 1),
                       'n_estimators': range(80, 110, 10),
                       'learning_rate': [0.1],
                       'gamma': [0],
                       'subsample': [0.9, 1],
                       'colsample_bytree': [0.7],
                       'reg_alpha': [15, 50, 100, 150, 200],
                       'reg_lambda': [15, 20, 25, 30, 40, 50]}]
        fit_params = {"early_stopping_rounds": 8,
                      "eval_metric": "mae",
                      "eval_set": [[X_test, y_test]],
                      "verbose": False}
        grid = GridSearchCV(clf, param_grid, fit_params=fit_params,
                            cv=3, verbose=1, n_jobs=-1)
        fitted_classifier = grid.fit(X_train, y_train)
        print(grid.best_score_, grid.best_params_)
        predictions = fitted_classifier.predict(X_train)
        '''

        fitted = clf.fit(X_train, y_train)
        scoresCV = cross_val_score(clf,
                                   X_train,
                                   y_train,
                                   cv=3,
                                   verbose=0,
                                   n_jobs=-1)
        trainPredictionsCV = cross_val_predict(clf,
                                               X_train,
                                               y_train,
                                               cv=3,
                                               verbose=0,
                                               n_jobs=-1)

        trainPredictions = clf.predict(X_train)
        testPredictions = clf.predict(X_test)

        score1 = metrics.explained_variance_score(y_test.values,
                                                  testPredictions)
        score2 = metrics.mean_absolute_error(y_test.values, testPredictions)
        score3 = metrics.mean_squared_error(y_test.values, testPredictions)
        score4 = metrics.r2_score(y_test.values, testPredictions)
        print('Train score: ',
              metrics.mean_absolute_error(y_train.values, trainPredictions))
        print('CV score: ', scoresCV)
        print('Explained Variance Score, MAE, MSE, R^2')
        print(score1, score2, score3, score4)

        tempIndex = range(0, len(y_test.values), 1)
        plt.scatter(tempIndex, y_test.values, color='black', s=20, alpha=0.8)
        plt.scatter(tempIndex, testPredictions, color='red', s=20, alpha=0.4)
        plt.show()
        #Results appear to be highly interesting
        #MSE (and thus penalising large errors more) suggests that the model does not deal well with
        #particular categories of retweets where there is a significant difference between true value and predicted
        #Data appears to have high bias in terms of selection, as if tweets were selected from specific pools
        #based on retweet value
        #While the random forest deals well with those particular types of tweets, more analysis is needed
        # Further steps would start by understanding the sampling procedure that produced these tweets
        # From there, features need to be relooked at, dimensionality reduction (such as PCA) might be needed
        # Simpler / more powerful models to then be appropriately applied
        #The target retweets actually seem to be created from a Decision Tree Model
        print('x')

    lr = LinearRegression()
    dt = DecisionTreeRegressor()
    rf = RandomForestRegressor()
    gb = xgboost.XGBRegressor()

    #print('LR')
    #testRegressor(lr)
    #print('DT')
    #testRegressor(dt)
    print('RF')
    testRegressor(rf)
Exemplo n.º 31
0
def learn(X: pd.DataFrame, y: pd.DataFrame, s: pd.DataFrame, outer_folds: list, 
          inner_folds: list) -> pd.DataFrame:
    """Apply the entire machine learning procedure.
    
    Arguments: 
    - X: An m*n dataframe containing the features that are used as input for
        the classifier
    - y: A boolean vector of length n, containing the targets
    - s: A boolean vector of length n, indicating whether a sample belongs to
        the sensitive group.
    - outer_folds, inner_folds: Result of src.get_folds.
        
    Returns a pd.DataFrame containing the performance over all folds.
    """
    assert all(X.index == y.index)
    assert all(X.index == s.index)
    
    # Convert X, y, s to np.arrays for compatibility reasons.
    X = np.ascontiguousarray(X.values)
    y = np.ascontiguousarray(y.values.ravel()) > 1
    s = np.ascontiguousarray(s.values.ravel())
    
    params = [
        (int(max_depth), int(n_bins), float(orthogonality))
        for n_bins in (2,)
        for max_depth in np.arange(1, 11) 
        for orthogonality in np.linspace(0, 1, 11)
    ]
    
    # Learn on every outer fold
    iterations = [
        (max_depth, n_bins, ortho, fold, trainval_idx, test_idx)
        for max_depth, n_bins, ortho in params
        for fold, (trainval_idx, test_idx) in outer_folds
        if not isfile(f'models/outer_folds/{max_depth}-{ortho:.2f}-{n_bins}-{fold}.pkl')
    ]
    
    for max_depth, n_bins, ortho, fold, trainval_idx, test_idx in tqdm(iterations):
        X_trainval = X[trainval_idx]
        y_trainval = y[trainval_idx]
        s_trainval = s[trainval_idx]
        
        vt = VarianceThreshold()
        vt.fit(X_trainval)
        X_trainval = vt.transform(X_trainval)
        
        clf = FairRandomForestClassifier(
            orthogonality=ortho, max_depth=max_depth, n_bins=n_bins)
        start_fit = time()
        clf.fit(X_trainval, y_trainval, s_trainval)
        clf.fit_time = time() - start_fit
        fp = f'models/outer_folds/{max_depth}-{ortho:.2f}-{n_bins}-{fold}.pkl'
        joblib.dump(clf, fp)
        
    # Learn on every inner fold
    iterations = [
        (max_depth, n_bins, ortho, outer_fold, inner_fold, train_idx, val_idx)
        for max_depth, n_bins, ortho in params
        for (outer_fold, inner_fold), (train_idx, val_idx) in inner_folds
        if not isfile(f'models/inner_folds/{max_depth}-{ortho:.2f}-{n_bins}-{outer_fold}-{inner_fold}.pkl')
    ]
    for max_depth, n_bins, ortho, outer_fold, inner_fold, train_idx, val_idx in tqdm(iterations):    
        X_train = X[train_idx]
        y_train = y[train_idx]
        s_train = s[train_idx]
        vt = VarianceThreshold()
        vt.fit(X_train)
        X_train = vt.transform(X_train)
        clf = FairRandomForestClassifier(
            orthogonality=ortho, max_depth=max_depth, n_bins=n_bins)
        start_fit = time()
        clf.fit(X_train, y_train, s_train)
        clf.fit_time = time() - start_fit
        fp = f'models/inner_folds/{max_depth}-{ortho:.2f}-{n_bins}-{outer_fold}-{inner_fold}.pkl'
        joblib.dump(clf, fp)
        
    # Predict on all outer folds
    iterations = [
        (max_depth, n_bins, ortho, fold, trainval_idx, test_idx)
        for max_depth, n_bins, ortho in params
        for fold, (trainval_idx, test_idx) in outer_folds
        if not isfile(f'models/outer_folds/{max_depth}-{ortho:.2f}-{n_bins}-{fold}.npy')
    ]

    for max_depth, n_bins, ortho, fold, trainval_idx, test_idx in tqdm(iterations):        
        X_trainval = X[trainval_idx]
        X_test = X[test_idx]
        
        vt = VarianceThreshold()
        vt.fit(X_trainval)
        X_trainval = vt.transform(X_trainval)
        X_test = vt.transform(X_test)
        
        fp = f'models/outer_folds/{max_depth}-{ortho:.2f}-{n_bins}-{fold}'
        clf = joblib.load(f'{fp}.pkl')
        y_score = clf.predict_proba(X_test)[:,1]
        np.save(f'{fp}.npy', y_score)
    
    # Predict on all inner folds
    iterations = [
        (max_depth, n_bins, ortho, outer_fold, inner_fold, 
        train_idx, val_idx)
        for max_depth, n_bins, ortho in params
        for (outer_fold, inner_fold), (train_idx, val_idx) in inner_folds
        if not isfile(f'models/inner_folds/{max_depth}-{ortho:.2f}-{n_bins}-{outer_fold}-{inner_fold}.npy')
    ]
    for max_depth, n_bins, ortho, outer_fold, inner_fold, train_idx, val_idx in tqdm(iterations):
        
        X_train = X[train_idx]
        X_val = X[val_idx]
        
        vt = VarianceThreshold()
        vt.fit(X_train)
        X_train = vt.transform(X_train)
        X_val = vt.transform(X_val)
        
        fp = f'models/inner_folds/{max_depth}-{ortho:.2f}-{n_bins}-{outer_fold}-{inner_fold}'
        clf = joblib.load(f'{fp}.pkl')
        y_score = clf.predict_proba(X_val)[:,1]
        np.save(f'{fp}.npy', y_score)
    
    # Measure performance for every inner fold
    iterations = [
        (max_depth, n_bins, orthogonality, outer_fold, inner_fold, 
        train_idx, val_idx)
        for max_depth, n_bins, orthogonality in params
        for (outer_fold, inner_fold), (train_idx, val_idx) in inner_folds
    ]

    performance_all_candidates = list()
    for max_depth, n_bins, ortho, outer_fold, inner_fold, train_idx, val_idx in tqdm(iterations):
        fp = f'models/inner_folds/{max_depth}-{ortho:.2f}-{n_bins}-{outer_fold}-{inner_fold}'
        y_score = np.load(f'{fp}.npy')
        
        y_val = y[val_idx]
        s_val = s[val_idx]
        auc_y = roc_auc_score(y_val, y_score)
        auc_s = roc_auc_score(s_val, y_score)
        auc_s = max(auc_s, 1-auc_s)
        
        performance_this_run = dict(
            max_depth=max_depth, n_bins=n_bins, orthogonality=ortho,
            outer_fold=outer_fold, inner_fold=inner_fold, auc_y=auc_y, 
            auc_s=auc_s)
        performance_all_candidates.append(performance_this_run)
    return pd.DataFrame(performance_all_candidates)
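
# A minimal usage sketch (hypothetical, not part of the original function; assumes X, y, s,
# outer_folds and inner_folds are defined as described in the docstring above): average the
# validation AUCs over the inner folds and pick the best-scoring setting per outer fold.
results = learn(X, y, s, outer_folds, inner_folds)
summary = (results
           .groupby(['outer_fold', 'max_depth', 'n_bins', 'orthogonality'])[['auc_y', 'auc_s']]
           .mean()
           .reset_index())
best_per_fold = summary.loc[summary.groupby('outer_fold')['auc_y'].idxmax()]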
Exemplo n.º 32
0
# then we know for real that those features are not helpful


xtrain_aud = sio.loadmat('xtrain_all_aud.mat')
xtrain_aud = xtrain_aud['xtrain']
ytrain_aud = sio.loadmat('ytrain_all_aud.mat')
ytrain_aud = ytrain_aud['ytrain']

# method 1: variance threshold

Var_selector = VarThresh(.5)
# without any parameters passed to VarThresh it defaults to removing only features that are exactly constant
# here we start with a threshold of .5
Var_selector.fit(xtrain_aud)
which_feats = Var_selector.get_support()
x_aud_fitted = Var_selector.transform(xtrain_aud)

print x_aud_fitted.shape


xtrunclength = sio.loadmat('xtrunclength.mat')
xtrunclength = xtrunclength['xtrunclength']

xtesting = sio.loadmat('xtesting.mat')
xtesting = xtesting['xtesting']

xtesting = xtesting[~np.isnan(xtesting).any(axis=1),:]
xtesting = xtesting[~np.isinf(xtesting).any(axis=1),:]

from CurrentThingsNeededtoRun import FinalClassifier
class TextClassifier(BaseEstimator):

    def __init__(self, base_classifiers = [SGDClassifier()]):
        """
        Parameters
        ----------
            base_classifiers: array, shape = [n_estimators], optional, default: [SGDClassifier()]
                estimator objects implementing fit and predict,
                used for classification; the best-performing one is chosen

        Attributes
        ----------
            multilabel_: boolean, optional, default: True
            with_titles_: boolean, optional, default: False

        """
        self.base_classifiers = base_classifiers

    def __feature_selection(self, text_data):
        """ 

        Parameters
        ----------
            text_data: array, shape = [n_samples]

        Returns
        -------
            sparse matrix of text features
        """
        X = self.count_vect_.fit_transform(text_data)
        X_tfidf = self.tfidf_transformer_.fit_transform(X)
        return X_tfidf

    def __transform_features(self, text_data):
        """
        Transform data by using tf-idf

        Parameters
        ----------

        Returns
        -------
        """
        X = self.count_vect_.transform(text_data)
        X_tfidf = self.tfidf_transformer_.transform(X)
        return X_tfidf

    def fit(self, X, y, titles = None, multilabel = True):
        """
        Fit base_classifiers, choose the best model

        Parameters
        ----------
            X: array, shape = [n_samples]
            y: array, shape = [n_samples]
            titles: array, shape = [n_samples], optional, default: None
            multilabel: boolean, optional, default:True

        Returns
        -------
        self
        """
        self.with_titles_ = (titles is not None)
        self.multilabel_ = multilabel
        self.tfidf_transformer_ = TfidfTransformer()
        self.count_vect_ = CountVectorizer(decode_error='ignore')
        self.best_classifier_ = self.base_classifiers[0]
        best_quality = 0.0
        if (self.with_titles_):
            X_train = [X[i] + ' ' + titles[i] for i in range(len(X))]
        else:
            X_train = X
        X_features = self.__feature_selection(X_train)
        if (self.multilabel_):
            """
            remove target features, that are equal in all objects
            """
            self.selector_ = VarianceThreshold()
            Y = self.selector_.fit_transform(y)
            self.best_classifier_ = OneVsRestClassifier(self.best_classifier_)
        else: 
            for classifier in self.base_classifiers:
                new_quality = np.mean(cross_val_score(classifier, X_features, np.array(y)))
                if (new_quality > best_quality):
                    best_quality = new_quality
                    self.best_classifier_ = classifier
            Y = y
        self.best_classifier_.fit(X_features, Y)
        return self

    def predict(self, X, titles = None):
        """
        Parameters
        ----------
            X: array, shape = [n_samples]
            titles: array, shape = [n_samples], optional, default: None

        Returns
        -------
            y_pred: array, shape = [n_samples]
        """
        self.with_titles_ = (titles is not None)
        if (self.with_titles_):
            X_train = [X[i] + ' ' + titles[i] for i in range(len(X))]
        else:
            X_train = X
        X_features = self.__transform_features(X_train)
        y_pred = self.best_classifier_.predict(X_features)
        return y_pred

    def predict_proba(self, X):
        """
        Compute probabilities of possible outcomes for samples in X.

        Parameters
        ----------
            X: array, shape = [n_samples]
        Returns
        -------
            Returns the probability of the sample for each class in the model. 
            The columns correspond to the classes in sorted order, as they appear in the attribute classes_.
        """
        X_features = self.__transform_features(X)
        return self.best_classifier_.predict_proba(X_features)

    def get_support(self):
        """
        Get a mask, or integer index, of the features selected

        Returns
        -------
            T: array, shape = [n_features]
            returns the mask of selected features
        """
        return self.selector_.get_support()    

    def score(self, X, y_true):
        """
        Parameters
        ----------
            X: array, shape = [n_samples]
            y_true: true labels for X
        
        Returns
        -------
            Mean accuracy of self.predict(X) wrt. y_true.
        """
        if (self.multilabel_):
            Y = self.selector_.transform(y_true)
            return np.mean(Y == self.predict(X))
        else:
            return accuracy_score(y_true, self.predict(X))

    def load(self, path):
        """ 
        Load model parameters from path

        Parameters
        ----------
            path: path to load from
        -------

        """
        file = open(path, 'rb')
        sys.modules['textclassifier'] = sys.modules[__name__]
        state = pickle.load(file)
        self.__dict__ = state.__dict__
        file.close()
Exemplo n.º 34
0
	def fit(self, x, y, w=None):
		""" Fits upper and lower bounds on p(y|x)
		Args:
			x, y are lists with control groups first
		"""

		# -----preprocessing
		if self.standardize:

			x0selector = VarianceThreshold(threshold=.1).fit(x[0])
			temp_x0 = x0selector.transform(x[0])
			x0scaler = StandardScaler().fit(temp_x0)
			self.x0scaler = lambda x: x0scaler.transform(x0selector.transform(x))

			x1selector = VarianceThreshold(threshold=.1).fit(x[1])
			temp_x1 = x1selector.transform(x[1])
			x1scaler = StandardScaler().fit(temp_x1)
			self.x1scaler = lambda x: x1scaler.transform(x1selector.transform(x))

			x00 = self.x0scaler(x[0])
			x01 = self.x1scaler(x[0])
			x11 = self.x1scaler(x[1])
			x10 = self.x0scaler(x[1])

		else:
			x00, x01 = x[0], x[0]
			x11, x10 = x[1], x[1]

		if self.kernel == 'linear':
			self.kernel_fit = lambda x: x
			x00 = self.kernel_fit(x00)
			x01 = self.kernel_fit(x01)
			x11 = self.kernel_fit(x11)
			x10 = self.kernel_fit(x10)

		elif self.kernel == 'poly':
			if self.p is None:
				raise ValueError('Need polynomial value')

			self.kernel_fit = lambda x: np.hstack([x**i for i in range(1, self.p + 1)])
			x00 = self.kernel_fit(x00)
			x01 = self.kernel_fit(x01)
			x11 = self.kernel_fit(x11)
			x10 = self.kernel_fit(x10)

		elif self.kernel == 'rbf':
			if self.sig is None:
				raise ValueError('Need Length scale value')
			self.x0_tr = x00.copy()
			self.x1_tr = x11.copy()

			self.kernel_fit = lambda x, tg: RBF(length_scale=self.sig).__call__(x,
				self.x1_tr) if tg == 1 else \
				RBF(length_scale=self.sig).__call__(x, self.x0_tr)

			x00 = self.kernel_fit(x00, tg=0)
			x01 = self.kernel_fit(x01, tg=1)
			x11 = self.kernel_fit(x11, tg=1)
			x10 = self.kernel_fit(x10, tg=0)

		elif self.kernel == 'rbf_approx':
			if self.sig is None:
				raise ValueError('Need Length scale value')

			self.x0_tr = x00.copy()
			self.x1_tr = x11.copy()

			self.rbf_approx1 = RBFSampler(gamma=1 / self.sig, n_components=100,
				random_state=0).fit(self.x1_tr)
			self.rbf_approx0 = RBFSampler(gamma=1 / self.sig, n_components=100,
				random_state=0).fit(self.x0_tr)

			self.kernel_fit = lambda x, tg: self.rbf_approx1.transform(x) if tg == 1 else \
				self.rbf_approx0.transform(x)

			x00 = self.kernel_fit(x00, tg=0)
			x01 = self.kernel_fit(x01, tg=1)
			x11 = self.kernel_fit(x11, tg=1)
			x10 = self.kernel_fit(x10, tg=0)

		n1, d1 = x11.shape[0], x11.shape[1]
		n0, d0 = x00.shape[0], x00.shape[1]
		y0 = y[0]
		y1 = y[1]

		n = n1 + n0

		mdl = grb.Model("cqp")
		mdl.ModelSense = 1
		mdl.setParam('OutputFlag', False)
		mdl.reset()
		L = 1e5

		u0 = [mdl.addVar(name="u0_%d" % i, lb=-L, ub=L) for i in range(n)]
		l0 = [mdl.addVar(name="l0_%d" % i, lb=-L, ub=L) for i in range(n)]

		u1 = [mdl.addVar(name="u1_%d" % i, lb=-L, ub=L) for i in range(n)]
		l1 = [mdl.addVar(name="l1_%d" % i, lb=-L, ub=L) for i in range(n)]

		bU0 = [mdl.addVar(name="bu0_%d" % i, lb=-L, ub=L) for i in range(d0 + 1)]
		bL0 = [mdl.addVar(name="bl0_%d" % i, lb=-L, ub=L) for i in range(d0 + 1)]

		bU1 = [mdl.addVar(name="bu1_%d" % i, lb=-L, ub=L) for i in range(d1 + 1)]
		bL1 = [mdl.addVar(name="bl1_%d" % i, lb=-L, ub=L) for i in range(d1 + 1)]

		rUs = [mdl.addVar(name="ru%d" % i, lb=0, ub=L) for i in range(n)]
		rLs = [mdl.addVar(name="rl%d" % i, lb=0, ub=L) for i in range(n)]

		slackU1 = 0
		slackL1 = 0

		slackU0 = 0
		slackL0 = 0

		if w is None:
			w0, w1= np.ones(n0) / n0, np.ones(n1) / n1
		else:
			w0 = w[0]
			w1 = w[1]

		obj_terms = []
		for i in range(n):
			mdl.addConstr(u1[i] >= l1[i])
			mdl.addConstr(u0[i] >= l0[i])

		for i in range(n0):

			mdl.addConstr(u1[i] == np.dot(x01[i, ], bU1[:d1]) + bU1[-1])
			mdl.addConstr(l1[i] == np.dot(x01[i, ], bL1[:d1]) + bL1[-1])

			mdl.addConstr(u0[i] == np.dot(x00[i, ], bU0[:d0]) + bU0[-1])
			mdl.addConstr(l0[i] == np.dot(x00[i, ], bL0[:d0]) + bL0[-1])

			mdl.addConstr(rUs[i] >= y0[i] - u0[i])
			mdl.addConstr(rLs[i] >= l0[i] - y0[i])

			slackU0 += w0[i] * rUs[i]
			slackL0 += w0[i] * rLs[i]

			if self.loss == 'square':
				obj_terms.append(w0[i] * ((u0[i] - l0[i]) * (u0[i] - l0[i]) + (u1[i] - l1[i]) * (u1[i] - l1[i])))
			elif self.loss == 'linear':
				if self.agg == "max":
					obj_terms.append(((u0[i] - l0[i]) + (u1[i] - l1[i])))
				else:
					obj_terms.append(w0[i]*((u0[i] - l0[i])+ (u1[i] - l1[i])))
			else:
				raise Exception('Unrecognized loss: %s' % self.loss)

		for i in range(n0, n1+n0):

			mdl.addConstr(u1[i] == np.dot(x11[i - n0, ], bU1[:d1]) + bU1[-1])
			mdl.addConstr(l1[i] == np.dot(x11[i - n0, ], bL1[:d1]) + bL1[-1])

			mdl.addConstr(u0[i] == np.dot(x10[i - n0, ], bU0[:d0]) + bU0[-1])
			mdl.addConstr(l0[i] == np.dot(x10[i - n0, ], bL0[:d0]) + bL0[-1])

			mdl.addConstr(rUs[i] >= y1[i - n0] - u1[i])
			mdl.addConstr(rLs[i] >= l1[i] - y1[i - n0])

			slackU1 += w1[i - n0] * rUs[i]
			slackL1 += w1[i - n0] * rLs[i]

			if self.loss == 'square':
				obj_terms.append(w1[i - n0] * ((u1[i] - l1[i]) * (u1[i] - l1[i])))

			elif self.loss == 'linear':
				if self.agg == "max":
					obj_terms.append(((u1[i] - l1[i]) + (u0[i] - l0[i])))
				else:
					obj_terms.append(w1[i - n0] * ((u1[i] - l1[i]) + (u0[i] - l0[i])))

			else:
				raise Exception('Unrecognized loss: %s' % self.loss)

		if self.agg == 'max':
			o = mdl.addVar(name="o", lb=-L, ub=L)
			os = []
			for i in range(n):
				oi = mdl.addVar(name="o%d" % i, lb=-L, ub=L)
				mdl.addConstr(oi == obj_terms[i])
				os += [oi]
			mdl.addConstr(o == grb.max_(os))
			obj = o# + .01*grb.quicksum(obj_terms)
		else:
			obj = grb.quicksum(obj_terms)

		obj_reg_u0, obj_reg_l0, obj_reg_u1, obj_reg_l1 = 0, 0, 0, 0

		for k in range(d1):
			obj_reg_u1 += bU1[k] * bU1[k]
			obj_reg_l1 += bL1[k] * bL1[k]

		for k in range(d0):
			obj_reg_u0 += bU0[k] * bU0[k]
			obj_reg_l0 += bL0[k] * bL0[k]

		obj_reg = self.alphau1 * obj_reg_u1 + self.alphal1 * obj_reg_l1 + \
			self.alphau0 * obj_reg_u0 + self.alphal0 * obj_reg_l0

		obj = obj + obj_reg

		mdl.addConstr((slackU0 <= self.lamdau0))
		mdl.addConstr((slackL0 <= self.lamdal0))

		mdl.addConstr((slackU1 <= self.lamdau1))
		mdl.addConstr((slackL1 <= self.lamdal1))

		mdl.setObjective(obj)
		mdl.optimize()

		self.bu0 = np.array([bU0[j].x for j in range(d0 + 1)])
		self.bl0 = np.array([bL0[j].x for j in range(d0 + 1)])

		self.bu1 = np.array([bU1[j].x for j in range(d1 + 1)])
		self.bl1 = np.array([bL1[j].x for j in range(d1 + 1)])

		return self
Exemplo n.º 35
0
def preProcessData(trainFeatureMatrix, testFeatureMatrix):
	totalFeatureNum = 52
	singleValueIndexList = [17, 19, 20, 23]
	categoricalAttriIndexList = [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 44, 45, 46]	
	categoricalFeatureValueNumList = [13, 112, 2, 13, 13, 112, 2, 13, 145, 4, 3031, 4, 138, 102, 102, 2090]
	cateNumericIndexList = [1, 6, 15, 16, 18,21,22,24,25,26,27,28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,49,50,51]

	numericAttriIndexList = [1, 6, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 47, 48, 49, 50, 51]
	
	# for i in range(len(trainFeatureSpace[0])):
	# 	if not i in categoricalAttriIndexList:
	# 		#print 'numerical', i, len(list(set(trainFeatureSpace[:,i])))
	# 		print '%s, numerical, train: %s, test:%s' % (i, len(list(set(trainFeatureMatrix[:,i]))), len(list(set(testFeatureMatrix[:,i]))))
	# 	else:
	# 		print '%s, categorical, train: %s, test:%s' % (i, len(list(set(trainFeatureMatrix[:,i]))), len(list(set(testFeatureMatrix[:,i]))))
	


	tempResultMatrix =  np.concatenate((trainFeatureMatrix, testFeatureMatrix), axis=0)

	# print len(trainFeatureMatrix), len(trainFeatureMatrix[0])
	# print len(testFeatureMatrix), len(testFeatureMatrix[0])
	# print len(tempResultMatrix), len(tempResultMatrix[0])

	# exit()

	# for i in range(len(trainFeatureMatrix)):
	# 	for j in range(len(trainFeatureMatrix[0])):
	# 		if j in cateNumericIndexList:
	# 			trainFeatureMatrix[i][j] = int(trainFeatureMatrix[i][j])

	# for i in range(len(testFeatureMatrix)):
	# 	for j in range(len(testFeatureMatrix[0])):
	# 		if j in cateNumericIndexList:
	# 			testFeatureMatrix[i][j] = int(testFeatureMatrix[i][j])

	#selectedFeatureList = []
	# for i in range(53):
	# 	if not i in singleValueIndexList:
	# 		selectedFeatureList.append(i)

	# trainFeatureMatrix = trainFeatureMatrix[ : , selectedFeatureList]
	# testFeatureMatrix = testFeatureMatrix[ : , selectedFeatureList]
	from sklearn.preprocessing import OneHotEncoder
	enc = OneHotEncoder(categorical_features=categoricalAttriIndexList + cateNumericIndexList)
	enc.fit(tempResultMatrix)
	trainFeatureMatrix = enc.transform(trainFeatureMatrix).toarray()
	testFeatureMatrix = enc.transform(testFeatureMatrix).toarray()

	print 'old feature num is ', len(trainFeatureMatrix[0]), len(testFeatureMatrix[0])


	#tempResultMatrix =  np.concatenate((trainFeatureMatrix, testFeatureMatrix), axis=0)


	sel = VarianceThreshold()
	sel.fit(trainFeatureMatrix)
	trainFeatureMatrix = sel.transform(trainFeatureMatrix)
	testFeatureMatrix = sel.transform(testFeatureMatrix)
	print 'new feature num is ', len(trainFeatureMatrix[0]), len(testFeatureMatrix[0])
	#exit()
	return trainFeatureMatrix, testFeatureMatrix
Exemplo n.º 36
0
    #vtFT = VarianceThreshold(threshold=(0.2))
    vtFT = VarianceThreshold(0.00025)

    print(trainFts.shape)

    trainFts = vtFT.fit_transform(trainFts)

    print(vtFT.variances_)
    print(min(vtFT.variances_))
    print(max(vtFT.variances_))

    _, ax = plt.subplots()
    bins = np.linspace(0.00030, 0.008, 1000)
    ax.hist(vtFT.variances_, bins)

    devFts = vtFT.transform(devFts)

    goldFts = vtFT.transform(goldFts)

    print(trainFts.shape)

if DR_SVD_ON:
    svd = TruncatedSVD(n_components=200, n_iter=7, random_state=42)
    svd.fit(trainFts)
    #print(svd.explained_variance_ratio_)

    print(trainFts.shape)
    #print(type(trainFts))
    trainFts = sp.csr_matrix(svd.transform(trainFts))
    devFts = sp.csr_matrix(svd.transform(devFts))
    goldFts = sp.csr_matrix(svd.transform(goldFts))
eliminated_feats = list(pool_features - selected_features)[:-1]

# In[ ]:

print("Elimintated features:", elimintated_feats)

# In[ ]:

len(eliminated_feats)

# We eliminated 14 features. Do the same with TEST subset:

# In[ ]:

X_high_variance_ts = featFilter.transform(X_ts)

# In[ ]:

X_high_variance_ts.shape

# In[ ]:

X_high_variance.shape

# In[ ]:

df_selVar = df[df.columns[featFilter.get_support(indices=True)]]

# Add Y:
Exemplo n.º 38
0
class Model:

    # model related attribute
    model = None
    feat_transformer =None
    label_encoder = None
    selector = None
    pipeline = None

    # other parameters
    classification = True
    binarize = False
    verbose = False

    # params for cloning
    params = dict()

    def __init__(self, model_type=None, \
                    model_params="", \
                    f_select=None, \
                    f_select_params="", \
                    sparse=True, \
                    n_features=100, \
                    n_components=10):
        self.params = {"model_type": model_type, "model_params": model_params, "f_select": f_select, "f_select_params": f_select_params, "sparse": sparse, "n_features": n_features, "n_components": n_components}

        if (model_type == None):
            return

        # initialize "default" values
        # models specific values should be set in model-specific if-else branch
        self.feat_transformer = DictVectorizer(sparse=sparse)
        self.label_encoder = LabelEncoder()

        # selector to remove zero-variance features
        self.var_selector = VarianceThreshold()

        #scaler = StandardScaler(with_mean=False)
        #selector = SelectKBest(chi2, k=n_features)
        #combined_features = FeatureUnion([('selector', selector)])
        #self.pipeline = Pipeline([('vectorizer', feat_vectorizer), ('features', combined_features), ('scaler', scaler), ('model', self.model)])

        # Choose model type
        if (model_type == "linear_svm"):
            self.model = eval("LinearSVC(" + model_params + ")")
        elif (model_type == "svm"):
            self.model = eval("SVC(" + model_params + ")")
        elif (model_type == "knn"):
            self.model = eval("KNeighborsClassifier(" + model_params + ")")
        elif (model_type == "ridge_classifier"):
            self.model = eval("RidgeClassifier(" + model_params + ")")
        elif (model_type == "ridge_regression"):
            self.classification = False
            self.model = eval("Ridge(" + model_params + ")")
        elif (model_type == "lasso"):
            self.classification = False
            self.model = eval("Lasso(" + model_params + ")")
        elif (model_type == "bayesian_ridge"):
            self.classification = False
            self.model = eval("BayesianRidge(" + model_params + ")")
        elif (model_type == "gaussian_bayes"):
            self.model = GaussianNB()
        elif (model_type == "decision_trees"):
            self.model = DecisionTreeClassifier(random_state=0)
        elif (model_type == "log_regression"):
            self.model = eval("LogisticRegression(" + model_params + ")")
        elif (model_type == "linear_regression"):
            self.classification = False
            self.model = eval("LinearRegression(" + model_params + ")")
        elif (model_type == "perceptron"):
            self.model = eval("Perceptron(" + model_params + ")")
        elif (model_type == "extra_trees"):
            self.feat_transformer = DictVectorizer(sparse=False)
            self.model = eval("ExtraTreesClassifier(" + model_params + ")")
        elif (model_type == "random_forest"):
            self.feat_transformer = DictVectorizer(sparse=False)
            self.model = eval("RandomForestClassifier(" + model_params + ")")
        elif (model_type == "ada_boost"):
#            self.feat_transformer = DictVectorizer(sparse=False)
            self.model = eval("AdaBoostClassifier(" + model_params + ")")
        elif (model_type == "sgd_classifier"):
            self.model = eval("SGDClassifier(" + model_params + ")")
        elif (model_type == "baseline"):
            self.model = eval("DummyClassifier(" + model_params + ")")
        else:
            print >> sys.stderr, "Model of type " + model_type + " is not supported."

        # Choose feature selector
        if (f_select == None):
            self.selector = EmptyModel()
        elif (f_select == "kbest"):
            # params: score_func, k
            self.selector = eval("SelectKBest(" + f_select_params + ")")
        elif (f_select == "percentile"):
            self.selector = eval("SelectPercentile(" + f_select_params + ")")
        elif (f_select == "kbest_anova"):
            #self.selector = SelectKBest(f_classif, k=n)
            self.selector = eval("SelectKBest(" + "f_classif," + f_select_params + ")")
        elif (f_select == "lassocv"):
            self.selector = eval("SelectFromModel(LassoCV()," + f_select_params + ")")
        elif (f_select == "rfe_svm"):
            self.selector = eval("RFE(LinearSVC()," + f_select_params + ")")
        elif (f_select == "rfecv"):
            self.selector = eval("RFECV(" + f_select_params + ")")
        elif (f_select == "rlregr"):
            self.selector = eval("RandomizedLogisticRegression(" + f_select_params + ")")
        elif (f_select == "svm"):
            print "SelectFromModel(LinearSVC(" + f_select_params + "))"
            self.selector = eval("SelectFromModel(LinearSVC(" + f_select_params + "))")
        elif (f_select == "extra_trees"):
            self.selector = eval("SelectFromModel(ExtraTreesClassifier(" + f_select_params + "))")
        elif (f_select == "random_forest"):
            self.selector = eval("SelectFromModel(RandomForestClassifier(" + f_select_params + "))")
        elif (f_select == "from_model"):
            self.selector = eval("SelectFromModel(" + f_select_params + ")")

    def fit(self, X, Y):
        Xtr = X
        if (self.classification):
            Xtr = [self.transform_features(i) for i in X]
            Xtr = self.feat_transformer.fit_transform(Xtr)
            Xtr = self.var_selector.fit_transform(Xtr)
            Y = self.label_encoder.fit_transform(Y)
            Xtr = self.selector.fit_transform(Xtr, Y)
        self.model.fit(Xtr,Y)

    def transform(self, Y):
        """ For evaluation, we want transformed predicted values. """
        return self.label_encoder.transform(Y)

    def predict(self, X):
        if not isinstance(X, list) and not isinstance(X, numpy.ndarray):
            sys.stderr.write("Warning: X is not a list (type=%s)\n" % (str(type(X))))
            #raise ValueError(X)
        if (self.classification):
            Xtr = [self.transform_features(i) for i in X]
            Xtr = self.feat_transformer.transform(Xtr)
            Xtr = self.var_selector.transform(Xtr)
            Xtr = self.selector.transform(Xtr)
            return self.label_encoder.inverse_transform(self.model.predict(Xtr))
        else:
            return self.model.predict(X)
    
    def predict_proba(self, X):
        if not isinstance(X, list) and not isinstance(X, numpy.ndarray):
            sys.stderr.write("Warning: X is not a list (type=%s)\n" % (str(type(X))))
            #raise ValueError(X)
        if (self.classification):
            Xtr = [self.transform_features(i) for i in X]
            Xtr = self.feat_transformer.transform(Xtr)
            Xtr = self.var_selector.transform(Xtr)
            Xtr = self.selector.transform(Xtr)
            return self.model.predict_proba(Xtr)
        else:
            return self.model.predict_proba(X)

    def score(self, X, Y):
        Xtr = X
        if (self.classification):
            Xtr = [self.transform_features(i) for i in X]
            Xtr = self.feat_transformer.transform(Xtr)
            Xtr = self.var_selector.transform(Xtr)
            Xtr = self.selector.transform(Xtr)
            Y = self.label_encoder.transform(Y)
        return self.model.score(Xtr, Y)

    def get_classes(self):
        return self.label_encoder.inverse_transform(self.model.classes_)

    def set_params(self, **params):
        self.model = self.model.set_params(**params)
        return self

    def get_params(self, deep=True):
        return self.params
 
    def print_params(self, file_path):
        f = open(file_path, "w")
        if (self.model.__class__.__name__ == "DecisionTreeClassifier"):
            f = tree.export_graphviz(self.model, out_file=f)
        f.close()

    def transform_features(self, features):
        """ Transform features with string values into new sets of features. """
        transformed = dict()
        if not self.binarize:
            return features
        for name, value in features.iteritems():
            if isinstance(value, basestring):
                name = "%s_%s" % (name,value)
                value = 1.
            transformed[name] = float(value)
        return transformed

    def setVerbose(self):
        self.verbose = True
Exemplo n.º 39
0
# In[13]:

constant_filter.get_support().sum()

# In[14]:

constant_list = [not temp for temp in constant_filter.get_support()]
constant_list

# In[15]:

x.columns[constant_list]

# In[16]:

x_train_filter = constant_filter.transform(x_train)
x_test_filter = constant_filter.transform(x_test)

# In[17]:

x_train_filter.shape, x_test_filter.shape, x_train.shape

# ## Quasi-Constant Feature Removal
# * These features take (almost) the same value for the large majority of samples
# * They add load to the machine learning model without adding information

# In[18]:

quasi_constant_filter = VarianceThreshold(threshold=0.01)
quasi_constant_filter.fit(x_train_filter)
Exemplo n.º 40
0
X_train_T = y.T
y_train = pd.DataFrame(X_train_T)
X_test_T = y1.T
y_test = pd.DataFrame(X_test_T)
#X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 0,stratify = y)
##constant feature removall

constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(X)

print(constant_filter.get_support().sum())

constant_list = [not temp for temp in constant_filter.get_support()]
print(constant_list)
print(X.columns[constant_list])
X_train_filter = constant_filter.transform(X)
X_test_filter = constant_filter.transform(X_test)
print(X_train_filter.shape)
print(X_test_filter.shape)
print(X.shape)

##Quasi constant feature removal
quasi_constant_filter = VarianceThreshold(threshold=0.01)
quasi_constant_filter.fit(X_train_filter)
print(quasi_constant_filter.get_support().sum())
X_train_quasi_filter = quasi_constant_filter.transform(X_train_filter)
X_test_quasi_filter = quasi_constant_filter.transform(X_test_filter)

print(X_train_quasi_filter.shape)
print(X_test_quasi_filter.shape)
sel.fit(x_train)
sum(sel.get_support())
#another way
len(x_train.columns[sel.get_support()])

print(
    len([
        x for x in x_train.columns
        if x not in x_train.columns[sel.get_support()]
    ]))

[x for x in x_train.columns if x not in x_train.columns[sel.get_support()]]

x_train['ind_var2_0'].unique()

x_train = sel.transform(x_train)
x_test = sel.transform(x_test)

# short and easy

constant_features = [
    feat for feat in x_train.columns if x_train[feat].std() == 0
]
len(constant_features)

x_train.drop(labels=constant_features, axis=1, inplace=True)

x_test.drop(labels=constant_features, axis=1, inplace=True)

#for categorical Variables
Exemplo n.º 42
0
def feature_selector(x_train, x_test):

	vt = VarianceThreshold(0.01)
	vt.fit(x_train)

	return vt.transform(x_train), vt.transform(x_test)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials
#lightGBM parameters

filename = "train_classify.csv"
data = pd.read_csv(filename, index_col=0, nrows=10)
float_cols = [c for c in data if data[c].dtype == "float64"]
float32_cols = {c: np.float32 for c in float_cols}
data = pd.read_csv(filename, index_col=0, dtype=float32_cols)
x = data.values[:, :-1]
y = data.label
print("pre", x.shape)
scaler = VarianceThreshold()
scaler.fit(x)
x = scaler.transform(x)

stdScaler = StandardScaler()
stdScaler.fit(x)
x = stdScaler.transform(x)

print("after", x.shape)

xtrain, xtest, ytrain, ytest = train_test_split(x,
                                                y,
                                                test_size=0.3,
                                                random_state=420)

lgb_reg_params = {
    'learning_rate': 0.2,
    'max_depth': 10,
def do_t_recur(t_data, filenames, mode):
	# FEATURE SELECTION
	# Scale, apply VarianceThreshold, and select features from a Random Forest model
	scaler = MinMaxScaler()
	thresholding = VarianceThreshold()
	fs_data = []
	for i, d in enumerate(t_data):
		print("\nFILENAME: {}".format(filenames[i]))
		t_rows = list(d.index)
		t_columns = d.columns[:-3]
		# Replace NaN values with the column mean
		t_data[i]['Recurrence'].fillna((t_data[i]['Recurrence'].mean()), inplace=True)
		# Scale
		fs_data.append(pd.DataFrame(scaler.fit_transform(t_data[i].iloc[:, :-3]), columns=t_columns, index=t_rows))
		if mode == 'show':
			print("Scaling data -\n", fs_data[i].head())
		# Variance Threshold
		selector = thresholding.fit(fs_data[i])
		t_columns = t_columns[thresholding.get_support()]
		fs_data[i] = pd.DataFrame(thresholding.transform(fs_data[i]), columns=t_columns, index=t_rows)
		if mode == 'show':
			print("After variance thresholding -\n", fs_data[i].head())
		# Select From RF
		classifier = RandomForestClassifier(n_estimators=1)
		classifier = classifier.fit(fs_data[i], d['Recurrence'])
		selector = SelectFromModel(classifier, prefit=True)
		t_columns = t_columns[selector.get_support()]
		fs_data[i] = pd.DataFrame(selector.transform(fs_data[i]), columns=t_columns, index=t_rows)
		fs_data[i]['Recurrence'] = d['Recurrence']
		if mode == 'show':
			print("Selecting data from RF model -\n", fs_data[i].head())
		print("Shape after feature selection: {}".format(fs_data[i].shape), end="\n\n")
	# RESAMPLING data - SMOTEENN
	balanced_data = [[] for _ in range(2)]
	for i, d in enumerate(fs_data):
		sme = SMOTEENN(random_state=42, smote=SMOTE(random_state=42, k_neighbors=2))
		x, y = sme.fit_resample(fs_data[i], t_data[i]['Recurrence'])
		# x are the features and y are the targets
		balanced_data[i].append(x)
		balanced_data[i].append(y)
		print("Upsampling the data... in {}".format(filenames[i]))
		if mode == 'show':
			print("FILENAME: {}".format(filenames[i]), Counter(balanced_data[i][1]))
	# DIMENSIONALITY REDUCTION
	# Kernel PCA (can be toggled on or off)
	pca = True
	pca_dim = 20
	dr_data = []
	if pca:
		for i in range(len(filenames)):
			print("\nFILENAME: {}".format(filenames[i]))
			decomposer = KernelPCA(n_components=pca_dim, kernel='rbf', gamma=0.5, degree=7)
			dr_data.append(decomposer.fit_transform(balanced_data[i][0]))
			print("Shape and type after PCA: ", dr_data[i].shape, type(dr_data[i]))
	else:
		dr_data.append(balanced_data[0][0])
		dr_data.append(balanced_data[1][0])
	# CLASSIFICATION
	splits = 10
	seed = 7
	kfold = KFold(n_splits=splits, random_state=seed, shuffle=True)
	results = {'SVM': [],
				'RF': [],
				'KNN': [],
				'NB': []
				}
	for i, d in enumerate(dr_data):
		# SVM
		res = []
		classifier = SVC(gamma='auto')
		results['SVM'].append(cross_val_score(classifier, pd.DataFrame(dr_data[i]), balanced_data[i][1], cv=kfold))
		results['SVM'][i] = results['SVM'][i].mean()
		# RF
		# rf = RandomForestClassifier(n_estimators=100,n_jobs=-1,max_depth=10,max_features='auto')
		classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=7, max_features='auto', criterion='gini', n_jobs=-1)
		results['RF'].append(cross_val_score(classifier, pd.DataFrame(dr_data[i]), balanced_data[i][1], cv=kfold))
		results['RF'][i] = results['RF'][i].mean()
		# KNN
		k_scores = []
		for n in range(1, 16):
			knn = KNeighborsClassifier(n_neighbors=n)
			scores = (cross_val_score(knn, pd.DataFrame(dr_data[i]), balanced_data[i][1], cv=kfold))
			k_scores.append(scores.mean())
		results['KNN'].append(max(k_scores))
		# NB
		nb = GaussianNB()
		results['NB'].append(cross_val_score(nb, pd.DataFrame(dr_data[i]), balanced_data[i][1], cv=kfold))
		results['NB'][i] = results['NB'][i].mean()
	print("\nFinal Results for datasets: {0}, {1} -".format(filenames[0], filenames[1]))
	pprint(results)
	# PLOTTING
	# PCA
	pca = PCA(n_components = 3)
	x_pca = pca.fit_transform(balanced_data[0][0])
	fig = plt.figure(figsize=(13, 7))
	plt.suptitle("3-D plot for resampled data using dimensionality reduction (Biomedical Recurrence)\n\n")
	ax = fig.add_subplot(111, projection='3d')
	ax.set_title("PCA\n\n")
	ax.view_init(elev=177,azim=-96)
	for i in range(len(balanced_data[0][1])):
		if balanced_data[0][1][i] == 0:
			false = ax.scatter(x_pca[i][0], x_pca[i][1], x_pca[i][2], c='y', label=balanced_data[0][1][i])
		elif balanced_data[0][1][i] == 1:
			true = ax.scatter(x_pca[i][0], x_pca[i][1], x_pca[i][2], c='g', label=balanced_data[0][1][i])
	plt.legend((false, true),
		("Didn't recur", "Recurred"),
		scatterpoints=1,
		loc='upper right',
		ncol=1,
		fontsize=10)
	#plt.show()
	return results
Exemplo n.º 45
0
def trainclassifier(feat_m1,
                    patientinfo,
                    config,
                    parameter_file,
                    output_svm,
                    output_json,
                    feat_m2=None,
                    feat_m3=None,
                    verbose=True):
    # Load variables from the config file
    config = config_io.load_config(config)
    if type(parameter_file) is list:
        parameter_file = ''.join(parameter_file)

    if type(patientinfo) is list:
        patientinfo = ''.join(patientinfo)

    if type(config) is list:
        config = ''.join(config)

    with open(parameter_file) as data_file:
        parameters = json.load(data_file)

    label_type = config['Genetics']['mutation_type']

    # Read the features and classification data
    image_features_select, labels, label_data =\
        readdata(feat_m1, feat_m2, feat_m3, patientinfo,
                 label_type, parameters)

    # Delete features which are the same in more than 99% of patients
    # TODO: Separate this into a different tool
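    # For a binary feature that takes its most common value in a fraction p of the patients,
    # the variance is p * (1 - p); with p = 0.99 this gives 0.99 * 0.01 = 0.0099, so the
    # threshold below removes features that are (nearly) constant in >= 99% of patients.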
    sel = VarianceThreshold(threshold=0.99 * (1 - 0.99))
    sel = sel.fit(image_features_select)
    image_features_select = sel.transform(image_features_select)
    labels = sel.transform(labels).tolist()[0]

    # If we have too few features left, don't proceed
    if len(image_features_select[1]) > 7:

        # Create tempdir name from parameter file name
        basename = os.path.basename(parameter_file)
        filename, _ = os.path.splitext(basename)
        path = parameter_file
        for i in range(4):
            # Use temp dir: result -> sample# -> parameters - > temppath
            path = os.path.dirname(path)

        _, path = os.path.split(path)
        path = os.path.join(path, 'trainclassifier', filename)

        # Construct the required classifier
        classifier, param_grid =\
            cc.construct_classifier(config,
                                    image_features_select[0])

        # For N_iter, perform k-fold crossvalidation
        if config['Classification']['fastr']:
            trained_classifier = cv.crossvalfastr(config, label_data,
                                                  image_features_select,
                                                  classifier, param_grid, path)
        else:
            trained_classifier = cv.crossval(config, label_data,
                                             image_features_select, classifier,
                                             param_grid, path)
        # Add labels to dataframe
        # TODO: Works only if single mutation is present
        labels_pd =\
            pd.Series([labels],
                      index=[trained_classifier.keys()[0]],
                      name='feature_labels')
        classifier = trained_classifier.append(labels_pd)

        # Calculate statistics of performance
        statistics = plot_single_SVM(classifier, label_data)

    else:
        statistics = "None"

        labels = ["Too Few Features."]
        feat = ["None"]

        panda_dict = dict(zip(labels, feat))

        classifier = pd.Series(panda_dict)

    # Save output
    savedict = dict()
    savedict["Parameters"] = parameters
    savedict["Statistics"] = statistics

    print("Saving data!")
    if type(output_svm) is list:
        output_svm = ''.join(output_svm)

    if type(output_json) is list:
        output_json = ''.join(output_json)

    # TODO: output_svm/json are list objects!
    classifier.to_hdf(output_svm, 'SVMdata')
    with open(output_json, 'w') as fp:
        json.dump(savedict, fp, indent=4)
    def perform_variance_threshold(self, v_threshold):
        selector = VarianceThreshold(v_threshold)
        self.train_x = selector.fit_transform(self.train_x, self.train_y)

        self.test_x = selector.transform(self.test_x)
Exemplo n.º 47
0
def get_mapper(dataframe):
    beta = 0.0
    opt = Nadam(lr=0.001)
    print(dataframe.head(10))
    x_train, x_test = train_test_split(dataframe,
                                       random_state=6,
                                       test_size=0.2)
    scaler = MinMaxScaler()
    var_thresh = VarianceThreshold()
    var_thresh = var_thresh.fit(x_train)
    x_train = var_thresh.transform(x_train)
    x_test = var_thresh.transform(x_test)
    scaler = scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    input = Input(x_train.shape[1:])

    batch_norm_1 = BatchNormalization()
    batch_norm_2 = BatchNormalization()
    batch_norm_3 = BatchNormalization()
    batch_norm_4 = BatchNormalization()
    batch_norm_5 = BatchNormalization()
    batch_norm_6 = BatchNormalization()
    batch_norm_7 = BatchNormalization()
    batch_norm_8 = BatchNormalization()
    batch_norm_9 = BatchNormalization()
    batch_norm_10 = BatchNormalization()
    batch_norm_11 = BatchNormalization()
    batch_norm_12 = BatchNormalization()
    batch_norm_neck = BatchNormalization()

    dense_input = Dense(x_train.shape[1:][0], kernel_regularizer=l2(beta))
    dense_1 = Dense(int(x_train.shape[1:][0] / 2), kernel_regularizer=l2(beta))
    dense_2 = Dense(int(x_train.shape[1:][0] / 4), kernel_regularizer=l2(beta))
    dense_3 = Dense(256, kernel_regularizer=l2(beta))
    dense_4 = Dense(128, kernel_regularizer=l2(beta))
    dense_5 = Dense(64, kernel_regularizer=l2(beta))
    dense_6 = Dense(64, kernel_regularizer=l2(beta))
    dense_7 = Dense(128, kernel_regularizer=l2(beta))
    dense_8 = Dense(256, kernel_regularizer=l2(beta))
    dense_9 = Dense(int(x_train.shape[1:][0] / 4), kernel_regularizer=l2(beta))
    dense_10 = Dense(int(x_train.shape[1:][0] / 2),
                     kernel_regularizer=l2(beta))
    dense_11 = Dense(x_train.shape[1:][0], kernel_regularizer=l2(beta))
    desc_decoder = Dense(x_train.shape[1:][0],
                         activation="linear",
                         kernel_regularizer=l2(beta))
    neck = Dense(3, kernel_regularizer=l2(beta))
    p_relu = PReLU()
    p_relu2 = PReLU()
    p_relu3 = PReLU()
    p_relu4 = PReLU()
    p_relu5 = PReLU()
    p_relu6 = PReLU()
    p_relu7 = PReLU()
    p_relu8 = PReLU()
    p_relu9 = PReLU()
    p_relu10 = PReLU()
    p_relu11 = PReLU()
    p_relu12 = PReLU()
    p_relu_neck = PReLU()

    layer1 = batch_norm_1(p_relu(dense_input(input)))
    layer2 = batch_norm_2(p_relu2(dense_1(layer1)))
    layer3 = batch_norm_3(p_relu3(dense_2(layer2)))
    neck_out = p_relu_neck(neck(layer3))
    layer10 = batch_norm_10(p_relu4(dense_9(batch_norm_neck(neck_out))))
    layer11 = batch_norm_11(p_relu5(dense_10(layer10)))
    layer12 = batch_norm_12(p_relu6(dense_11(layer11)))

    decoded_descs = desc_decoder(layer12)

    autoencoder = Model(input, decoded_descs)
    print(autoencoder.summary())
    plot_model(autoencoder, to_file='model_graph.png')

    autoencoder.compile(optimizer=opt, loss='mean_squared_error')
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.7,
                                  patience=3,
                                  min_lr=0,
                                  verbose=1,
                                  epsilon=0.00001)
    earlystopping = EarlyStopping(monitor='val_loss',
                                  min_delta=0.00001,
                                  patience=15,
                                  verbose=1,
                                  mode='auto')
    # Save the model for best validation loss
    checkpointer = ModelCheckpoint(filepath='checkpoint.h5',
                                   monitor='val_loss',
                                   verbose=1,
                                   save_best_only=True)

    model_history_tmp = autoencoder.fit(
        x_train,
        x_train,
        validation_data=(x_test, x_test),
        epochs=10000,
        batch_size=32,
        callbacks=[checkpointer, earlystopping, reduce_lr],
        shuffle=False,
        verbose=0)

    plot_train_history(model_history_tmp, 'compressor_0_1', '')

    # load the best model base on validation results for this fold
    autoencoder = load_model('checkpoint.h5')

    latent_to_map = Model(input, neck_out)
    latent_to_map.save('smi2lat.h5')

    return latent_to_map, var_thresh, scaler
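
# A minimal usage sketch (hypothetical; `dataframe` is the descriptor table passed to
# get_mapper and `new_descriptors` is assumed to have the same columns): apply the fitted
# transformers in the same order as inside get_mapper before projecting into the 3-D latent space.
latent_to_map, var_thresh, scaler = get_mapper(dataframe)
latent_coords = latent_to_map.predict(scaler.transform(var_thresh.transform(new_descriptors)))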
cols.remove('wheezy-copper-turtle-magic')
oof = np.zeros(len(train))
preds = np.zeros(len(test))

# BUILD 512 SEPARATE MODELS
for i in range(512):
    # ONLY TRAIN WITH DATA WHERE WHEEZY EQUALS I
    train2 = train[train['wheezy-copper-turtle-magic'] == i]
    test2 = test[test['wheezy-copper-turtle-magic'] == i]
    idx1 = train2.index
    idx2 = test2.index
    train2.reset_index(drop=True, inplace=True)

    # FEATURE SELECTION (USE APPROX 40 OF 255 FEATURES)
    sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
    train3 = sel.transform(train2[cols])
    test3 = sel.transform(test2[cols])

    # STRATIFIED K-FOLD
    skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
    for train_index, test_index in skf.split(train3, train2['target']):

        # MODEL AND PREDICT WITH QDA
        clf = QuadraticDiscriminantAnalysis(reg_param=0.5)
        clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
        oof[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1]
        preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

    #if i%64==0: print(i)

# PRINT CV AUC
    X_train, X_test, y_train, y_test = train_test_split(x_data,
                                                        y_data,
                                                        test_size=0.4,
                                                        random_state=123)

    #select from model
    clf_feat = ExtraTreesClassifier()
    clf_feat.fit(X_train, y_train)
    model = SelectFromModel(clf_feat, prefit=True)
    #X_train = model.transform(X_train)
    #X_test = model.transform(X_test)
    #select with variance
    sel = VarianceThreshold(threshold=(0.6 * (1 - 0.6)))

    X_train = sel.fit_transform(X_train)
    X_test = sel.transform(X_test)

    #test = SelectKBest(score_func=chi2,k=10)
    #fit = test.fit(X_train,y_train)
    #X_train = fit.transform(X_train)
    #X_test = fit.transform(X_test)

    print('shape of train data:', X_train.shape)
    print('shape of test data:', X_test.shape)

    #check for class imbalance in the train & test sets
    print('positive class in train', np.sum(y_train) / y_train.shape[0])
    print('positive class in test', np.sum(y_test) / y_test.shape[0])

    #default classifer = logistic regression
    clf1 = []
Exemplo n.º 50
0
allData = hstack([data_prepared_numSparse, catArray])

y = sss[label].values
y = y.flatten()

# remove categorical variables with low variance
selector_variance = VarianceThreshold(threshold=.0025)
selector_variance.fit(allData)
c = selector_variance.get_support(indices=False)
d = selector_variance.get_support(indices=True)

featureItemize = featureNumToName.items()
featureItemize = [x for x, z in zip(featureItemize, c) if (z == 1)]
featureNumToName2 = dict([(i, x[1]) for i, x in enumerate(featureItemize)])

allDataVarThreshold = selector_variance.transform(allData)

# Perform l1 feature selection
clf_l = linear_model.LogisticRegression(C=.07,
                                        penalty='l1',
                                        tol=1e-6,
                                        max_iter=500)
std_scaler = StandardScaler()
allDataScaled = std_scaler.fit_transform(allDataVarThreshold.toarray())

clf_l.fit(allDataScaled, y)

selector_l1 = SelectFromModel(clf_l, prefit=True)
c = selector_l1.get_support(indices=False)
d = selector_l1.get_support(indices=True)
Exemplo n.º 51
0
        score.append(once)
    plt.plot(threshold, score)
    plt.show()

# In[]:
# Wrapper method:
from sklearn.feature_selection import RFE

RFC_ = RFC(n_estimators=10, random_state=0)

# Iterative approach: remove 50 features in each iteration
selector = RFE(RFC_, n_features_to_select=340, step=50).fit(X, y)
selector.support_.sum()  # boolean mask of whether each feature was finally selected; the sum is 340
selector.ranking_  # ranking of the features by their combined importance across the iterations

X_wrapper = selector.transform(X)
cross_val_score(RFC_, X_wrapper, y, cv=5).mean()

# In[]:
# Learning curve:
# ====== [TIME WARNING: 15 mins] ====== #
score = []
for i in range(1, 751, 50):
    X_wrapper = RFE(RFC_, n_features_to_select=i, step=50).fit_transform(X, y)
    once = cross_val_score(RFC_, X_wrapper, y, cv=5).mean()
    score.append(once)
plt.figure(figsize=[20, 5])
plt.plot(range(1, 751, 50), score)
plt.xticks(range(1, 751, 50))
plt.show()
#     test[col] = sc.transform(test[col].values)

### create feature matrix and target vector

X = train.drop(["id", "loss"], axis=1).as_matrix()
y = np.array(train["loss"].values)

### Feature reduction (optional)

sel = VarianceThreshold()

sel.fit(X, y)

print "Train before removing low-variance features", X.shape

X = sel.transform(X)

print "Train after removing low-variance features", X.shape

### define models and hyperparameters

lr = LinearRegression()
br = BayesianRidge()
net = ElasticNetCV(l1_ratio=[.1, .7, .95, .99, 1], normalize=False)
rf = RandomForestRegressor(n_estimators=75)

### build neural net model

early_stopping = EarlyStopping(monitor='val_loss', patience=2, mode="auto")

X_train, X_val, y_train, y_val = train_test_split(X,
Exemplo n.º 53
0
def main():
    args = getOptions()
    print args
    if args.model == 'gBoosting':
        fn = ("submission_%s_gBoosting_%s_%s_%s_%s_%s.csv" % (args.fts, args.loss, str(args.minsamplessplit), str(args.lrate).replace('.','dian'),str(args.nest),str(args.maxdepth)))
    elif args.model == 'randomForest':
        fn = ("submission_%s_randomForest_%s.csv" % (args.fts, args.nest))
    print fn
    print "train file read"
    train_x, train_y = readfile_noid(args.train,'train')
    train_x_new, id = extractID(train_x)
    train_x_clean, contentdict = cityclean(train_x_new)

    del id
    print "test file read"
    test_x, test_y = readfile_noid(args.test,'test')
    test_x_new, id = extractID(test_x)
    test_x_clean, contentdict = cityclean(test_x_new, contentdict)
    
    #remove feature with no distinction and less important
    print "remove feature with no distinction and less important"
    
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_clean)
    test_x_uniq = sel.transform(test_x_clean)
#     indices = [i for i in range(len(train_x[0]))]
#     frqIndex = trimfrq(train_x)
#     for i in frqIndex:
#         indices.remove(i)
#     train_x_uniq = indexTodata(train_x, indices)
#     test_x_uniq = indexTodata(test_x, indices)

    #normalization
    print "normalization"
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    #feature selection
    print "feature selection"
    if args.fts == 'cor':
        train_x_sel, test_x_sel = correlationSelect(train_x_nor, train_y, test_x_nor)
    elif args.fts == 'extraTrees':
        train_x_sel, test_x_sel = ExtraTreesSelect(train_x_nor, train_y, test_x_nor)
    elif args.fts == 'randomTree':
        train_x_sel, test_x_sel = randomTreesSelect(train_x_nor, train_y, test_x_nor)
    else:
        train_x_sel = copy.deepcopy(train_x_nor)
        test_x_sel = copy.deepcopy(test_x_nor)
    print len(train_x_nor[0])
    print len(train_x_sel[0])
    
    del train_x_nor, test_x_nor, train_x_uniq, test_x_uniq
    print "modelsing"
    if args.model == 'gBoosting':
        clf = GradientBoostingClassifier(loss=args.loss, 
                                         learning_rate=args.lrate,
                                         n_estimators=args.nest,
                                         max_depth=args.maxdepth,
                                         min_samples_split=args.minsamplessplit,
                                         verbose=1)
    elif args.model == 'randomForest':
        clf = RandomForestClassifier(n_estimators=args.nest, class_weight='auto')
    clf.fit(train_x_sel, train_y)
    train_pdt = clf.predict(train_x_sel)
    MCC, Acc_p , Acc_n, Acc_all = get_Accs(train_y, train_pdt) 
    print "MCC, Acc_p , Acc_n, Acc_all(train): "
    print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))
    test_pdt = clf.predict_proba(test_x_sel)
#     MCC, Acc_p , Acc_n, Acc_all = get_Accs(test_y, test_pdt) 
#     print "MCC, Acc_p , Acc_n, Acc_all(test): "
#     print "%s,%s,%s,%s" % (str(MCC), str(Acc_p) , str(Acc_n), str(Acc_all))   
    
    fout=open(fn,'w')
    fout.write("ID,target\n")
    for index, eachline in enumerate(test_pdt):
        fout.write("%s,%s\n" % (str(int(id[index])),str(test_pdt[index][1])))
    fout.close()
Exemplo n.º 54
0
def sample_data(X, Y, value=0):
    XX=[]
    for i in xrange(len(Y)):
        if Y[i]==value:
            XX.append(X[i])
    return XX

out=open(sys.argv[1],"r")
model=svm.OneClassSVM(kernel='rbf')
X, Y = read_fea(sys.argv[1])
sel = VarianceThreshold(threshold=0)
model.fit(sample_data(sel.fit_transform(X),Y, 1))
warning("useful features dim: "+str(len(sel.get_support(True))))
if hasattr(model,'score'):
    warning("accuracy on training set: "+str(model.score(sel.transform(X), Y)))
    if len(sys.argv)>2:
        X, Y = read_fea(sys.argv[2])
        warning("accuracy on cv set: "+str(model.score(sel.transform(X), Y)))

    if len(sys.argv)>3:
        X, Y = read_fea(sys.argv[3])
        warning("accuracy on dev set: "+str(model.score(sel.transform(X), Y)))

if len(sys.argv)>4:
    ref = model.decision_function(sel.transform(X))
    X, Y = read_fea(sys.argv[4], True)
    Z = model.decision_function(sel.transform(X)).tolist()
    Z = (Z-ref.mean())/ref.std()
    for i in xrange(len(Y)):
        print('S'+str(Y[i])+' '+str(Z[i]))
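
# `read_fea` and `warning` used above are not shown in this snippet. A minimal
# sketch of plausible implementations, assuming a plain-text feature file with
# the label in the first column and space-separated feature values after it
# (the real format expected by the original script is unknown):
import sys

def warning(msg):
    # keep diagnostics on stderr so stdout carries only the scores printed above
    sys.stderr.write(str(msg) + "\n")

def read_fea(path, keep_raw_labels=False):
    X, Y = [], []
    with open(path) as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue
            Y.append(parts[0] if keep_raw_labels else int(parts[0]))
            X.append([float(v) for v in parts[1:]])
    return X, Y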
# test_with_new_data.py
# This Python script first trains the SVM on the training data set,
# then tests it against the separate test data set provided
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import VarianceThresholdTest
import dataframe

train_x, train_y = dataframe.get_dataset_from_file('proper.train.data')
test_x, test_y = dataframe.get_dataset_from_file('corrected')

v_threshold = 0.15
debug = True

selector = VarianceThreshold(v_threshold)
# fit the selector on the training data, then apply the same mask to both splits
new_train_x = selector.fit_transform(train_x)

new_test_x = selector.transform(test_x)

if debug:
    print 'After fit'
    print 'Train contains %d features' % len(new_train_x[0])
    print 'Test contains %d features' % len(new_test_x[0])
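
# The snippet stops before the SVM step its header comment announces; the SVC,
# StandardScaler and accuracy_score imports above are otherwise unused. A
# minimal sketch of what that step might look like (parameter choices are
# assumptions, not taken from the original script):
scaler = StandardScaler()
scaled_train_x = scaler.fit_transform(new_train_x)
scaled_test_x = scaler.transform(new_test_x)

clf = SVC(kernel='rbf', C=1.0)
clf.fit(scaled_train_x, train_y)

predictions = clf.predict(scaled_test_x)
print 'Accuracy on the test set: %f' % accuracy_score(test_y, predictions)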
Exemplo n.º 56
0
 
sel.fit(X_train)  # fit finds the features with low variance
 
sum(sel.get_support()) # how many not quasi-constant?


# In[8]:


features_to_keep = X_train.columns[sel.get_support()]


# In[9]:


X_train = sel.transform(X_train)
X_test = sel.transform(X_test)
 
X_train.shape, X_test.shape


# In[10]:


# sklearn transformations lead to numpy arrays
# here I transform the arrays back to dataframes
# please be mindful of getting the columns assigned
# correctly
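
# A minimal sketch of that conversion (an addition, not from the original
# notebook), assuming pandas is imported as pd in an earlier cell; using the
# column names captured above in features_to_keep keeps the selected columns
# aligned with their original names:
X_train = pd.DataFrame(X_train, columns=features_to_keep)
X_test = pd.DataFrame(X_test, columns=features_to_keep)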


# In[11]:
import matplotlib.pyplot as plt

#Import Data 
df = pd.read_csv(r'C:\Utkarsh\GIT\Python\PredictSatisfiedCustomers\Data\train.csv')
df_test = pd.read_csv(r'C:\Utkarsh\GIT\Python\PredictSatisfiedCustomers\Data\test.csv')

y  = df['TARGET']
df = df.drop('TARGET',axis=1)
df = df.drop('ID',axis=1)
df_test = df_test.drop('ID',axis=1)

#Drop columns whose variance falls below the 0.9 threshold
sel2 = VarianceThreshold(threshold = .9)
np2 = sel2.fit_transform(df)
df = pd.DataFrame(np2)
np_test2 = sel2.transform(df_test)
df_prediction = pd.DataFrame(np_test2)
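
# Side note (not from the original script): fit_transform returns a bare numpy
# array, so the DataFrames above end up with default integer column names. The
# indices of the columns that survived the threshold can still be recovered
# from the fitted selector if the original names are needed later:
kept_idx = sel2.get_support(indices=True)
print("VarianceThreshold kept %d of %d columns" % (len(kept_idx), len(sel2.get_support())))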

#Hold out 10% of the data for validation to guard against overfitting
df_fit, df_eval, y_fit, y_eval= train_test_split( df, y, test_size=0.1, random_state=2 )

#First predictive model using XGBoost
xgboosting_model = xgb.XGBClassifier(missing=9999999999,max_depth = 5,n_estimators=100,
                                     learning_rate=0.1,nthread=4,subsample=.7)
xgboosting_model.fit(df_fit, y_fit)
predict_target = xgboosting_model.predict_proba(df_eval)[:,1]
validAUC = auc(y_eval, predict_target)
print("Accuracy with misssing value imputation"+validAUC)

#ROC curve and comparison with other models
names = ["etsc","abc","xgb","gbc"]
Exemplo n.º 58
0
# In[39]:


### if we sum over get_support, we get the number of features that are not constant


# In[178]:


sum(sel.get_support())


# In[179]:


x_train = sel.transform(x_train)
test = sel.transform(test)


# In[180]:


test.shape


# In[181]:


x_train.shape

Exemplo n.º 59
0
class Model(object):
    __metaclass__ = ABCMeta

    default_train_file = "train_subset.pickle"
    default_test_file = "test_subset.pickle"
    default_model_param_file = "model_param.pickle"

    def __init__(self, **kwargs):
        self.verbose = kwargs.get("verbose", 1)

        if self.verbose:
            print "Opening HTML zip file"
        self.__html_zip = zipfile.ZipFile(kwargs.get("html_cleaned_zip", config.html_cleaned_zip))

        self.__train_classes_filename = kwargs.get("train_file", self.default_train_file)
        self.__test_classes_filename = kwargs.get("test_file", self.default_test_file)

        self.__predict_classes_in_filename = kwargs.get("predict_in_file", None)
        self.__predict_classes_out_filename = kwargs.get("predict_out_file", None)

        self.__use_tfidf = kwargs.get("use_tfidx", False)
        self.__tfidf_transformer = None
        self.__use_variance_threshold = kwargs.get("use_variance_threshold", False)
        self.__variance_threshold = 0.8
        self.__variance_threshold_selector = None

        self.__model_param_filename = kwargs.get("model_param_file", self.default_model_param_file)

        self.__dtype = kwargs.get("dtype", np.float32)

        self.__file_names = []
        self.__content = []
        self.__contents = []
        self.__is_file_handle = True
        self.__class_vector = []

        if "vocabulary_file" in kwargs:
            self.__vocabulary = sorted(load_pickle(kwargs["vocabulary_file"]))
        else:
            self.__vocabulary = None
        self.__docmat = None

    def __load(self, filename, use_file_handles):
        self.__is_file_handle = use_file_handles

        if self.verbose:
            print "Reading data"

        with open(filename, "r") as pf:
            classes = pickle.load(pf)

        self.__file_names = classes.keys()
        self.__class_vector = np.empty(len(self.__file_names), dtype=self.__dtype)
        self.__content = []

        for i, f in enumerate(self.__file_names):
            self.__class_vector[i] = classes[f]

        def iterfn(f):
            if self.__is_file_handle:
                return self.__html_zip.open(f, "r")
            else:
                # return the raw text so CountVectorizer(input="content") receives
                # strings rather than the None that list.append() would return
                with self.__html_zip.open(f, "r") as zf:
                    return zf.read()

        self.__contents = imap(iterfn, self.__file_names)

    def load_training_data(self, use_file_handles=True):
        self.__load(self.__train_classes_filename, use_file_handles)

    def load_testing_data(self, use_file_handles=True):
        self.__load(self.__test_classes_filename, use_file_handles)

    def load_prediction_data(self, use_file_handles=True):
        self.__load(self.__predict_classes_in_filename, use_file_handles)

    def save_prediction_data(self):
        if self.verbose:
            print "Writing data"

        classes = {}
        for i, f in enumerate(self.__file_names):
            classes[f] = 1 if self.__class_vector[i] >= 0.5 else 0

        with open(self.__predict_classes_out_filename, "w") as pf:
            pickle.dump(classes, pf, pickle.HIGHEST_PROTOCOL)

    def make_word_vectors(self):
        if self.verbose:
            print "Computing word vectors"

        if self.__vocabulary is None:
            cv = CountVectorizer(
                stop_words=config.common_words,
                input=("file" if self.__is_file_handle else "content"),
                dtype=self.__dtype,
            )
            self.__docmat = cv.fit_transform(self.__contents)
            self.__vocabulary = cv.vocabulary_
        else:
            cv = CountVectorizer(
                stop_words=config.common_words,
                input=("file" if self.__is_file_handle else "content"),
                dtype=self.__dtype,
                vocabulary=self.__vocabulary,
            )
            self.__docmat = cv.transform(self.__contents)

        if self.__tfidf_transformer is None:
            if self.__use_tfidf:
                self.__tfidf_transformer = TfidfTransformer()
                self.__docmat = self.__tfidf_transformer.fit_transform(self.__docmat)
        else:
            self.__docmat = self.__tfidf_transformer.transform(self.__docmat)

        print "BEFORE", self.__docmat.shape
        if self.__variance_threshold_selector is None:
            if self.__use_variance_threshold:
                self.__variance_threshold_selector = VarianceThreshold(
                    self.__variance_threshold * (1.0 - self.__variance_threshold)
                )
                self.__docmat = self.__variance_threshold_selector.fit_transform(self.__docmat)
        else:
            self.__docmat = self.__variance_threshold_selector.transform(self.__docmat)
        print "AFTER ", self.__docmat.shape

    def get_document_matrix(self):
        return self.__docmat

    def get_document_class_vector(self):
        return self.__class_vector

    def set_document_class_vector(self, class_vector):
        self.__class_vector = class_vector

    @abstractmethod
    def getstate(self):
        return {"vocabulary": self.__vocabulary, "var_thresh_sel": self.__variance_threshold_selector}

    @abstractmethod
    def setstate(self, state):
        self.__vocabulary = state["vocabulary"]
        self.__variance_threshold_selector = state["var_thresh_sel"]

    def save(self):
        if self.verbose:
            print "Saving model"

        with open(self.__model_param_filename, "w") as pf:
            pickle.dump(self.getstate(), pf, pickle.HIGHEST_PROTOCOL)

    def load(self):
        if self.verbose:
            print "Loading model"

        with open(self.__model_param_filename, "r") as pf:
            state = pickle.load(pf)
            self.setstate(state)

    @abstractmethod
    def train(self):
        pass

    @abstractmethod
    def test(self):
        pass

    @abstractmethod
    def predict(self):
        pass
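
# A minimal sketch of how a concrete subclass might fill in the abstract
# methods above; the choice of LogisticRegression (and the simple fit/score
# calls) is an assumption for illustration, not part of the original code:
from sklearn.linear_model import LogisticRegression

class LogisticRegressionModel(Model):

    def __init__(self, **kwargs):
        super(LogisticRegressionModel, self).__init__(**kwargs)
        self.__clf = LogisticRegression()

    def getstate(self):
        state = super(LogisticRegressionModel, self).getstate()
        state["clf"] = self.__clf
        return state

    def setstate(self, state):
        super(LogisticRegressionModel, self).setstate(state)
        self.__clf = state["clf"]

    def train(self):
        # caller is expected to run load_training_data() first
        self.make_word_vectors()
        self.__clf.fit(self.get_document_matrix(), self.get_document_class_vector())

    def test(self):
        # caller is expected to run load_testing_data() first
        self.make_word_vectors()
        return self.__clf.score(self.get_document_matrix(), self.get_document_class_vector())

    def predict(self):
        # caller is expected to run load_prediction_data() first
        self.make_word_vectors()
        self.set_document_class_vector(self.__clf.predict(self.get_document_matrix()))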