Пример #1
0
def get_user_feature(feature_type,behavior,num_feature=800):
    """Keep the best-scoring feature columns and write train/test CSV files.

    The training features come from ``get_features(feature_type, behavior)``
    and the labels from the ``type`` column of ``data/train_Y_<behavior>.csv``
    (indexed by ``user_id``).  A ``SelectPercentile(f_classif)`` selector is
    fitted on the training data; the reduced training matrix is written to
    ``data/train_feature_<feature_type>_<behavior>.csv`` and the test matrix,
    reduced with the same fitted selector, to
    ``data/test_feature_<feature_type>_<behavior>.csv``.

    feature_type: 'cat_id', 'brand_id' and 'seller_id' keep 60, 15 and 20
        percent of the columns respectively; any other value uses percentile 0.
    behavior: integer interpolated into the file names with ``%d``.
    num_feature: not used by the current implementation (it belonged to an
        earlier SelectKBest variant).
    """
    percentile_by_type = {'cat_id': 60, 'brand_id': 15, 'seller_id': 20}

    train_frame = get_features(feature_type, behavior)
    train_index = train_frame.index
    # Reduce the dimensionality of X.
    labels = pd.read_csv('data/train_Y_%d.csv' % behavior, index_col='user_id')['type']
    print('start selectKbest...')
    select = SelectPercentile(f_classif,
                              percentile=percentile_by_type.get(feature_type, 0))
    select.fit(train_frame, labels)
    reduced_train = select.transform(train_frame)

    print('end select...')
    print('write %s features to train file' % feature_type)
    train_feature_file_name = 'data/train_feature_%s_%d.csv' % (feature_type, behavior)
    DataFrame(reduced_train, index=train_index).to_csv(train_feature_file_name)

    # Reduce the matching test set with the same selected columns.
    test_frame = get_features(feature_type, behavior, is_train=False)
    test_index = test_frame.index
    reduced_test = select.transform(test_frame)
    # Write it to file.
    print('write %s features to test file' % feature_type)
    test_feature_file_name = 'data/test_feature_%s_%d.csv' % (feature_type, behavior)
    DataFrame(reduced_test, index=test_index).to_csv(test_feature_file_name)
    print('end....')
def preprocess(words_file = "word_data.pkl", authors_file="email_authors.pkl"):
    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)



    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print "no. of Enrique training emails:", sum(labels_train)
    print "no. of Juan training emails:", len(labels_train)-sum(labels_train)
    
    return features_train_transformed, features_test_transformed, labels_train, labels_test
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """ 
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the feaures and labels are put into numpy arrays, which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)



    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)



    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result

    # selector = SelectPercentile(f_classif, percentile=10)

    ## <Temporary hack for Lesson 3>
    selector = SelectPercentile(f_classif, percentile=1)

    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print "no. of Chris training emails:", sum(labels_train)
    print "no. of Sara training emails:", len(labels_train)-sum(labels_train)
    
    return features_train_transformed, features_test_transformed, labels_train, labels_test
Пример #4
0
def predict(classifier_type="tree",selection="Univariate", f="1"):
    """Train a classifier on the training matrix and label the imputed test set.

    classifier_type: "SVM" (linear-kernel ``svm.SVC``) or "NB" (``GaussianNB``).
        The default "tree" has no implementation in this function and is
        rejected with ``ValueError`` (it previously failed later with a
        ``NameError`` because no classifier was ever created).
    selection: "Univariate" keeps the top ``p`` percent of features ranked by
        ``f_classif``; "kclusters" keeps the columns listed in a pickled
        index list.
    f: preset key "1", "2" or "3" choosing the percentile and the k-means
        pickle file.

    Returns the array produced by ``clf.predict`` on the reduced test matrix.

    Raises ValueError for an unsupported ``classifier_type``, ``selection``
    or ``f``; all three are checked before any data is loaded.
    """
    # f -> (pickled k-means feature index list, univariate percentile).
    # Raw strings keep the backslashes literal without relying on "\k" being
    # an unrecognised escape sequence.  The original code also recorded an
    # unused SVM C value per preset (0.001, 0.1 and 2).
    presets = {
        "1": (r"GS_pickles\kmeans_Genes_87_1x_v3.pkl", 1),
        "2": (r"GS_pickles\kmeans_Genes_433_50x_v2.pkl", 5),
        "3": (r"GS_pickles\kmeans_Genes_2163_20x_v1.pkl", 25),
    }
    if f not in presets:
        raise ValueError("f must be one of '1', '2' or '3', got %r" % (f,))
    if selection not in ("Univariate", "kclusters"):
        raise ValueError("selection must be 'Univariate' or 'kclusters', got %r"
                         % (selection,))
    if classifier_type not in ("SVM", "NB"):
        raise ValueError("classifier_type must be 'SVM' or 'NB', got %r"
                         % (classifier_type,))

    kc_fn, p = presets[f]
    kernel_type = "linear"

    # readData() yields three values; only the data matrix is used here.
    (data_matrix, _features, _samples) = readData()
    x = data_matrix.data
    y = data_matrix.target
    (m, n) = x.shape

    test = joblib.load(r"GS_pickles\imputed_test_data.pkl")
    test_x = np.array(test)
    (i, j) = test_x.shape
    print("Training matrix shape: %s,%s" % (m, n))
    print("Test matrix shape: %s,%s" % (i, j))

    if selection == "Univariate":
        selector = SelectPercentile(f_classif, percentile=p)
        selector.fit(x, y)
        # Trim both matrices to the top p percent of the features.
        trimmed_x = selector.transform(x)
        trimmed_test_x = selector.transform(test_x)
    else:  # "kclusters"
        kcluster_flist = joblib.load(kc_fn)
        trimmed_x = np.take(x, kcluster_flist, axis=1)
        trimmed_test_x = np.take(test_x, kcluster_flist, axis=1)

    if classifier_type == "SVM":
        # Linear SVM classifier
        clf = svm.SVC(kernel=kernel_type, degree=3, probability=True)
    else:  # "NB"
        # Gaussian Naive Bayes classifier
        clf = GaussianNB()
    clf.fit(trimmed_x, y)

    result = clf.predict(trimmed_test_x)
    return result
Пример #5
0
def eval(ds, testNum, p, splitProportion=0.2):
    """Compare a classifier trained on all features with one trained on a
    ``SelectPercentile`` subset, over ``testNum`` random splits of ``ds``.

    ds: dataset object providing ``splitWithProportion``; each split is
        converted to (X, Y) with ``labanUtil.fromDStoXY``.
    testNum: number of random splits to evaluate.
    p: percentile of features kept by ``SelectPercentile(chooser, ...)``.
    splitProportion: proportion passed to ``ds.splitWithProportion``.

    Each split iterates over paired entries of ``Y`` and ``Y_test``; entries
    whose training labels are all zero are skipped.

    Returns a 7-tuple:
        (mean F1 all features, mean F1 selected features,
         mean recall all, mean recall selected,
         mean precision all, mean precision selected,
         classifier name)
    When no classifier is trained at all (``testNum == 0`` or every label
    vector is all zeros) the means are NaN and the name is ``None``; the
    function used to fail with an unbound ``name`` in that case.
    """
    allFeaturesF1 = []
    allFeaturesRecall = []
    allFeaturesPrecision = []

    featureSelctedF1 = []
    featureSelctedRecall = []
    featureSelctedPrecision = []

    # Bound up front so the return statement is valid even if the loops below
    # never train a classifier.
    name = None

    for _ in range(testNum):
        tstdata, trndata = ds.splitWithProportion(splitProportion)
        X, Y = labanUtil.fromDStoXY(trndata)
        X_test, Y_test = labanUtil.fromDStoXY(tstdata)
        for y, y_test in zip(Y, Y_test):
            if all(v == 0 for v in y):
                continue

            # Other estimators tried here previously: LinearSVC,
            # RandomForestClassifier, DecisionTreeClassifier,
            # GradientBoostingClassifier, ExtraTreesClassifier; RFE was tried
            # as an alternative selector.
            clf = AdaBoostClassifier()
            selector = SelectPercentile(chooser, percentile=p)

            selector.fit(X, y)
            # Class name taken from the estimator's repr, e.g. "AdaBoostClassifier".
            name = str(clf).split()[0].split('(')[0]

            # Scores with the selected feature subset.
            clf.fit(selector.transform(X), y)
            pred = clf.predict(selector.transform(X_test))

            featureSelctedF1.append(metrics.f1_score(y_test, pred))
            featureSelctedRecall.append(metrics.recall_score(y_test, pred))
            featureSelctedPrecision.append(metrics.precision_score(y_test, pred))

            # Scores with every feature, refitting the same estimator.
            clf.fit(X, y)
            pred = clf.predict(X_test)

            allFeaturesF1.append(metrics.f1_score(y_test, pred))
            allFeaturesRecall.append(metrics.recall_score(y_test, pred))
            allFeaturesPrecision.append(metrics.precision_score(y_test, pred))

    return np.mean(allFeaturesF1), np.mean(featureSelctedF1), \
        np.mean(allFeaturesRecall), np.mean(featureSelctedRecall), \
        np.mean(allFeaturesPrecision), np.mean(featureSelctedPrecision), \
        name
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features (top 10% by f_classif)

        after this, the features are converted to dense arrays with toarray(),
        which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    # read the labels/authors from file; ``with`` closes the handle even if
    # unpickling raises
    with open(authors_file, "r") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    # read the documents (email texts) from file
    with open(words_file, "r") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
    # features_train and features_test hold the raw texts of each split

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    # no fitting here, so the idf learned from the training split is reused
    features_test_transformed  = vectorizer.transform(features_test)
    # both results are sparse matrices with one row per document and one
    # column per vocabulary term, holding tf-idf weights

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    # rank features with a univariate statistical test on the training split
    selector.fit(features_train_transformed, labels_train)
    # keep only the selected columns, for both splits
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data
    #print "no. of Chris training emails:", sum(labels_train)
    #print "no. of Sara training emails:", len(labels_train)-sum(labels_train)

    return features_train_transformed, features_test_transformed, labels_train, labels_test
Пример #7
0
    def reduce(self,percent_taken):
        """Keep the top ``percent_taken`` percent of features in both sets.

        A ``SelectPercentile`` selector is fitted on ``self.train_set`` /
        ``self.Y`` and stored in ``self.fitted_reductor``; ``self.train_set``
        and ``self.test_set`` are then replaced by their reduced versions and
        the number of kept features is printed.
        """
        # chi2 is only defined for non-negative X; when scikit-learn rejects
        # the input with ValueError, fall back to the ANOVA F-value.  Only
        # ValueError is caught so that unrelated failures (and
        # KeyboardInterrupt) are no longer swallowed.
        try:
            fited = SelectPercentile(chi2, percentile=percent_taken).fit(self.train_set, self.Y)
        except ValueError:
            fited = SelectPercentile(f_classif, percentile=percent_taken).fit(self.train_set, self.Y)

        self.fitted_reductor = fited

        self.train_set = fited.transform(self.train_set)
        self.test_set = fited.transform(self.test_set)
        # shape[1] works for dense and sparse outputs and for an empty test
        # set, unlike len(self.test_set[0]).
        print('number of featute(s) selected: {0}\n'.format(self.test_set.shape[1]))
Пример #8
0
def make_train_test(df_train, df_test):
    """Turn the 'Phrase' columns into reduced bag-of-words matrices.

    A ``CountVectorizer`` is fitted on ``df_train['Phrase']`` and applied to
    ``df_test['Phrase']``; the top 50% of columns ranked by ``f_classif``
    against ``df_train['Sentiment']`` are kept for both matrices.

    Returns (training features, training labels, test features).
    """
    bag_of_words = CountVectorizer()
    train_counts = bag_of_words.fit_transform(df_train['Phrase'].values)
    train_labels = df_train['Sentiment'].values
    test_counts = bag_of_words.transform(df_test['Phrase'].values)

    # The selector is fitted on the training data only and reused for the test set.
    selector = SelectPercentile(f_classif, percentile=50)
    selector.fit(train_counts, train_labels)

    reduced_train = selector.transform(train_counts)
    reduced_test = selector.transform(test_counts)
    return reduced_train, train_labels, reduced_test
def preprocess_4(article_file, lable_file):
    """Load pickled articles and labels and build train/test feature matrices.

    article_file: path of a pickle holding the article texts
        (e.g. "pkl/2013_article.pkl").
    lable_file: path of a pickle holding the labels
        (e.g. "pkl/2013_lable.pkl").

    Steps: encode the labels as integers with ``LabelEncoder``, split 90/10
    (random_state=42), vectorize with tf-idf, keep the top 30% of features
    ranked by ``f_classif``.

    Returns (training features, testing features, training labels,
    testing labels); the feature matrices are dense arrays from ``toarray()``.
    """
    # ``with`` closes the pickle files; they were previously left open.
    with open(article_file) as article_handle:
        features = np.array(pickle.load(article_handle))

    # transform non-numerical labels (as long as they are hashable and comparable) to numerical labels
    with open(lable_file) as lable_handle:
        lables = pickle.load(lable_handle)
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)

    ### test_size is the percentage of events assigned to the test set (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, lables, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=30)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    return features_train_transformed, features_test_transformed, labels_train, labels_test
Пример #10
0
def main():
    """Kernel-PCA + univariate selection + k-NN pipeline on the saved digit arrays.

    Loads ``trainX.npy``/``testX.npy``/``trainY.npy``/``testY.npy``, saves the
    sample at index 14 as an image before and after a kernel-PCA round trip,
    keeps the top 5% of the kernel-PCA components ranked by ``f_classif``,
    grid-searches ``n_neighbors`` in 1..15 for a distance-weighted k-NN
    classifier and prints accuracy, confusion matrix, counts and elapsed time.
    """
    # start the timer
    started_at = time.time()

    # load the data
    trainX = np.load('trainX.npy')
    testX = np.load('testX.npy')
    trainY = np.load('trainY.npy')
    testY = np.load('testY.npy')
    print('\n!!! Data Loading Completed !!!\n')

    def save_sample_image(flat_pixels, prefix):
        # Draw one flattened 28x28 sample and save it under "<prefix><label>.png".
        plt.imshow(flat_pixels.reshape(28, 28), cmap=cm.Greys_r)
        plt.savefig(prefix + str(trainY[14]) + ".png")

    # plot the sample at index 14 as loaded
    save_sample_image(trainX[14], "original")

    # apply kpca (fitted on the first 3000 training samples only)
    kpca = KernelPCA(kernel='rbf', gamma=1, fit_inverse_transform=True)
    kpca.fit(trainX[0:3000])
    trainX_kpca = kpca.transform(trainX)
    testX_kpca = kpca.transform(testX)

    # do inverse transform and plot the same sample again
    reconstructed = kpca.inverse_transform(trainX_kpca)
    save_sample_image(reconstructed[14], "reconstructed")

    # keep the best 5% of the kernel-PCA components
    selector = SelectPercentile(f_classif, percentile=5)
    selector.fit(trainX_kpca, trainY)
    train_selected = selector.transform(trainX_kpca)
    test_selected = selector.transform(testX_kpca)

    # fit a classifier
    neighbor_grid = {'n_neighbors' : list(np.arange(15)+1)}
    knn = KNeighborsClassifier(weights='distance', n_jobs=-1)
    clf = GridSearchCV(knn, neighbor_grid)
    clf.fit(train_selected, trainY)

    pred = clf.predict(test_selected)
    print(accuracy_score(testY, pred))
    print(confusion_matrix(testY, pred))
    n_correct = np.sum(pred == testY)
    n_incorrect = np.sum(pred != testY)
    print('total : %d, correct : %d, incorrect : %d\n' % (len(pred), n_correct, n_incorrect))

    print('Test Time : %f Minutes\n' % ((time.time()-started_at)/60))
def trainingPreprocess(words_file, authors_file):
    """
        this function takes a pickled list of email texts (words_file) and the
        pickled list of corresponding authors (authors_file) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features (top 10% by f_classif)

        after this, the features are converted to dense arrays with toarray(),
        which play nice with sklearn functions

        6 objects are returned:
            -- training/testing features
            -- training/testing labels
            -- a fitted vectorizer
            -- a fitted selector

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    # ``with`` closes the handles even when unpickling raises.
    with open(authors_file, "r") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    with open(words_file, "r") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)

    ### feature selection
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    return features_train_transformed, features_test_transformed, labels_train, labels_test, vectorizer, selector
Пример #12
0
def selectFeatures(features, labels, features_list):
    '''
    Select the top 20 percent of features ranked by their f_classif scores.
    Return a list with the names of the selected features and a dataframe
    ranking every feature by its normalised -log10(p-value).
    features: numpy array with the features to be used to test sklearn models
    labels: numpy array with the real output
    features_list: a list of names of each feature, in column order
    '''
    #feature selection
    selector = SelectPercentile(f_classif, percentile=20)
    selector.fit(features, labels)
    #filter names to be returned
    l_rtn = [x for x, t in zip(features_list, selector.get_support()) if t]
    #calculate scores: -log10(p), scaled so the best feature scores 1
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    df_rtn = pd.DataFrame(pd.Series(dict(zip(features_list, scores))))
    df_rtn.columns = ["pValue_Max"]
    # DataFrame.sort() was removed from pandas; sort_values() is its replacement.
    df_rtn = df_rtn.sort_values("pValue_Max", ascending=False)

    return l_rtn, df_rtn
def preprocess(article_file, lable_file, k):
    """Vectorize all pickled articles and keep the top ``k`` percent of features.

    article_file: path of a pickle holding the article texts.
    lable_file: path of a pickle holding the labels.
    k: percentile passed to ``SelectPercentile(f_classif, ...)``.

    No train/test split is performed here; the vectorizer and selector are
    fitted on every article.

    Returns (feature matrix as a dense array, encoded labels, fitted
    vectorizer, fitted selector, fitted label encoder, raw article array).
    """
    # ``with`` closes the pickle files; they were previously left open.
    with open(article_file) as article_handle:
        features = np.array(pickle.load(article_handle))

    # transform non-numerical labels (as long as they are hashable and comparable) to numerical labels
    with open(lable_file) as lable_handle:
        lables = pickle.load(lable_handle)
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # selector : SelectPercentile
    selector = SelectPercentile(f_classif, percentile=k)
    selector.fit(features_train_transformed, lables)

    # selector : chi2
    # selector = SelectPercentile(score_func=chi2)
    # selector.fit(features_train_transformed, lables)

    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, lables, vectorizer, selector, le, features
Пример #14
0
def main():
    """Write correlation statistics and a reduced feature set to CSV files.

    Reads ``../data/train.csv`` (indexed by ``ID``), then:
        -- writes each variable's variance and correlation with ``TARGET`` to
           ``../presentationDocs/corrs.csv``, sorted by absolute correlation
        -- keeps the top 25% of features ranked by ``f_classif`` and writes
           them to ``../data/main_data.csv`` and the target column to
           ``../data/target.csv``
        -- applies the same fitted selector to ``../data/test.csv`` and writes
           the result to ``../data/test_transform.csv``
    """
    main_data = pd.read_csv('../data/train.csv', index_col='ID')
    target = main_data['TARGET']

    output = []
    for x in main_data.columns:
        # .loc replaces the removed .ix indexer; the correlation is computed
        # once and reused for both columns.
        column = main_data.loc[:, x]
        corr_w_target = round(column.corr(target), 4)
        output.append({
            'variable': x,
            'variance': column.var(),
            'corr_w_target': corr_w_target,
            'abs_corr': abs(corr_w_target)}
        )

    # print csv for later in the presentation docs
    variable_selector = pd.DataFrame(output)
    variable_selector = variable_selector.set_index('variable')
    variable_selector = variable_selector.drop('TARGET')
    variable_selector.sort_values('abs_corr', ascending=False).to_csv('../presentationDocs/corrs.csv')

    selector = SelectPercentile(f_classif, percentile=25)
    subset = pd.DataFrame(selector.fit_transform(main_data.drop('TARGET', axis=1), target))

    subset.to_csv('../data/main_data.csv', index=False)
    # ``columns`` is the current name of the keyword formerly spelled ``cols``.
    main_data[['TARGET']].to_csv('../data/target.csv', columns=['TARGET'], index=False)

    # print transformed test data to csv
    test_data = pd.read_csv('../data/test.csv', index_col='ID')
    test_data = pd.DataFrame(selector.transform(test_data), index=test_data.index)
    test_data.to_csv('../data/test_transform.csv', index=True, index_label='ID')
Пример #15
0
def select_features(X,y):
    """Keep the top 10% of the columns of ``X`` ranked by ``f_classif``.

    Returns (reduced X, fitted selector) so the same selection can be applied
    to other data later.
    """
    top_decile = SelectPercentile(f_classif, percentile=10)
    print("fit selector")
    top_decile.fit(X, y)
    print("transform features")
    return top_decile.transform(X), top_decile
Пример #16
0
def train_type_model():
    """Train and pickle a classifier over (question n-grams, notable type) pairs.

    For every evaluated query of the "webquestions_split_train" dataset, the
    notable types of the target answers are labelled 1 and the notable types
    seen only among candidate predictions are labelled 0.  The raw
    (features, labels) pair is dumped to ``type_model_data.pickle``; the
    features are then vectorized, reduced to the top 5% by chi2, and an
    ``SGDClassifier`` is fitted.  The restricted vectorizer and the classifier
    are dumped to ``type-model.pickle``.
    """
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)
    features = []
    labels = []
    for dataset in datasets:
        for query in get_evaluated_queries(dataset, True, parameters):
            tokens = [token.lemma for token in parser.parse(query.utterance).tokens]
            n_grams = get_grams_feats(tokens)

            # Notable types of the gold answers (falsy types are dropped).
            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            answer_types = [KBEntity.get_notable_type(entity_mid)
                            for entity_mid in answer_entities]
            correct_notable_types = set(found for found in answer_types if found)

            # Notable types of everything the candidates predicted.
            other_notable_types = set()
            for candidate in query.eval_candidates:
                entities = [mid for entity_name in candidate.prediction
                            for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True)]
                other_notable_types.update(KBEntity.get_notable_type(entity_mid)
                                           for entity_mid in entities)
            incorrect_notable_types = other_notable_types.difference(correct_notable_types)

            for notable_type in correct_notable_types.union(incorrect_notable_types):
                labels.append(1 if notable_type in correct_notable_types else 0)
                features.append(feature_extractor.extract_ngram_features(n_grams, [notable_type, ], "type"))

    with open("type_model_data.pickle", 'wb') as out:
        pickle.dump((features, labels), out)

    labels = LabelEncoder().fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    feature_selector = SelectPercentile(chi2, percentile=5)
    feature_selector.fit(X, labels)
    # Shrink the vectorizer's vocabulary to the columns the selector kept.
    vec.restrict(feature_selector.get_support())
    X = feature_selector.transform(X)
    type_scorer = SGDClassifier(loss='log', class_weight='auto',
                                n_iter=1000,
                                alpha=1.0,
                                random_state=999,
                                verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)
Пример #17
0
def preprocess(words_file="../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """
    Take a pre-made list of email texts (by default word_data.pkl) and the corresponding authors (by default email_authors.pkl) and preprocesses them.

    Preprocessor steps:
        - split into training/testing sets (10% testing)
        - vectorize into tfidf matrix
        - select/keep most helpful features (top 10% by f_classif)

    After this, the features and labels are put into numpy arrays, which play nice with sklearn functions

    A tfidf matrix is defined as TF(t)*IDF(t) where
    TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).
    IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

    4 objects are returned:
        -- training/testing features
        -- training/testing labels
    """
    # the words (features) and authors (labels), already largely preprocessed this preprocessing will be repeated in the text learning mini-project
    print('words_file = {}'.format(words_file))
    # ``with`` closes the pickle files; they were previously left open.
    with open(words_file, "rb") as words_handle:
        word_data = pickle.load(words_handle)
    with open(authors_file, "rb") as authors_handle:
        authors = pickle.load(authors_handle)

    # test_size is the percentage of events assigned to the test set (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

    # text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    # feature selection, because text is super high dimensional and can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    # info on the data
    # NOTE(review): the counts below presumably assume 0/1 labels -- confirm.
    print("no. of Chris training emails:", sum(labels_train))
    print("no. of Sara training emails:", len(labels_train) - sum(labels_train))

    return numpy.array(features_train_transformed), numpy.array(features_test_transformed), numpy.array(labels_train), numpy.array(labels_test)
Пример #18
0
def feature_transform(features_train, features_test, top_percent=1, labels=None):
    """ Function to apply Bag of Words feature creator with TfIdf statistic
        normalisation. The input is train and test text, and optional parameter
        'top_percent' which shows how many percent of super high dimensional
        text feature space is to return (default is 1%).
        'labels' are the training labels used to score the features; when it
        is omitted the module-level name 'labels_train' is used, which is what
        this function always relied on before the parameter existed.
        The output is the transformed train and test feature vectors suitable
        to use with sklearn classifiers.
    """
    if labels is None:
        # Backward compatibility: earlier callers depended on a global
        # ``labels_train`` being defined before calling this function.
        labels = labels_train

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)

    ### Feature selection, because text is super high dimensional
    selector = SelectPercentile(f_classif, percentile=top_percent)
    selector.fit(features_train_transformed, labels)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()
    return features_train_transformed, features_test_transformed
def univariate_feature_selection(dataset, features):
    """Plot univariate F-test scores next to linear-SVM weights per feature.

    dataset: feature matrix used as X.
    features: feature names used as x-axis tick labels.

    The targets are read from the project spreadsheet.  Three bar series are
    drawn: the normalised -log10(p-value) of each feature, the normalised
    squared weights of a linear SVM fitted on all features, and the same
    weights for an SVM fitted on the top 10% of features only.
    """
    # load the targets
    data = Data(Spreadsheet('../../Downloads/ip/project data.xlsx'))

    X = dataset
    y = data.targets

    def relative_svm_weights(design):
        # Squared coefficients of a linear SVM, scaled so the largest is 1.
        model = svm.SVC(kernel='linear')
        model.fit(design, y)
        weights = (model.coef_ ** 2).sum(axis=0)
        weights /= weights.max()
        return weights

    plt.figure(1)
    plt.clf()

    X_indices = np.arange(X.shape[-1])

    # Univariate feature selection with F-test for feature scoring,
    # keeping the 10% most significant features.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(X, y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    plt.bar(X_indices - .45, scores, width=.2,
            label=r'Univariate score ($-Log(p_{value})$)', color='g')

    # Compare to the weights of an SVM fitted on every feature.
    plt.bar(X_indices - .25, relative_svm_weights(X), width=.2,
            label='SVM weight', color='r')

    # ... and to an SVM fitted on the selected features only.
    svm_weights_selected = relative_svm_weights(selector.transform(X))
    plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
            width=.2, label='SVM weights after selection', color='b')

    tick_positions = np.arange(0, len(features))
    plt.title("Comparing feature selection")
    plt.xlabel('Feature number')
    plt.xticks(tick_positions, features, rotation=45)
    plt.yticks(())
    plt.legend(loc='upper right')
    plt.show()
Пример #20
0
def preprocess(X,y):
    """Split, vectorize and reduce text data, saving the fitted transformers.

    Splits ``X``/``y`` 80/20 (random_state=42), fits a tf-idf vectorizer on
    the training texts and dumps it to ``vectorizer_intent.pkl``, then fits a
    ``SelectPercentile(f_classif, percentile=10)`` selector and dumps it to
    ``selector_intent.pkl``.

    Returns (training features, testing features, training labels,
    testing labels); the feature matrices are dense arrays from ``toarray()``.
    """
    # test_size is the share of samples assigned to the test set
    # (the remainder goes into training)
    docs_train, docs_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=42)

    # text vectorization -- go from strings to lists of numbers
    tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    train_tfidf = tfidf.fit_transform(docs_train)
    test_tfidf = tfidf.transform(docs_test)
    joblib.dump(tfidf, 'vectorizer_intent.pkl')

    # feature selection, because text is very high dimensional and
    # expensive to work with as a result
    top_decile = SelectPercentile(f_classif, percentile=10)
    top_decile.fit(train_tfidf, y_train)
    joblib.dump(top_decile, 'selector_intent.pkl')

    train_dense = top_decile.transform(train_tfidf).toarray()
    test_dense = top_decile.transform(test_tfidf).toarray()
    return train_dense, test_dense, y_train, y_test
Пример #21
0
def Preprocess(words_file="/home/mohamed/python/sherlok-tools/helpers/word_data.pkl", labels_file="/home/mohamed/python/sherlok-tools/helpers/label_data.pkl"):
    """ 
        this function takes a pre-made list of data texts (by default word_data.pkl)
        and the corresponding labels (by default label_data.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the feaures and labels are put into numpy arrays, which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    ### the words (features) and labels (positive or negative)
    word_data = pickle.load( open(words_file, "r"))
    labels = pickle.load( open(labels_file, "r") )

    ### test_size is the percentage of events assigned to the test set (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, labels, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, encoding='windows-1256')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print "no. of positive training files:", sum(labels_train)
    print "no. of negative training files:", len(labels_train)-sum(labels_train)

    return features_train_transformed, features_test_transformed, labels_train, labels_test        	
def preprocess(words_file = "../data/data.pkl", authors_file="../data/datalabels.pkl"):
    """Load pickled documents and labels and build selected tf-idf features.

    The labels are read from ``authors_file`` and the documents from
    ``words_file``. The data is split 90/10 (``random_state=42``), a tf-idf
    vectorizer is fitted on the training documents, and the top 1% of
    features ranked by ``f_classif`` are kept.

    Returns a 5-tuple: the raw (untransformed) training documents, the dense
    training features, the dense test features, the training labels and the
    test labels.

    NOTE: pickle executes arbitrary code while loading -- only use trusted files.
    """
    # Context managers close the handles even when unpickling raises.
    with open(authors_file, "r") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    with open(words_file, "r") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
    # keep a reference to the raw training documents for the caller
    features_train_vect = features_train
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)
    selector = SelectPercentile(f_classif, percentile=1)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()
    return features_train_vect , features_train_transformed, features_test_transformed, labels_train, labels_test
Пример #23
0
def convertText(trainData, trainLabel, testData, testLabel, reduceDimensionality=0):
    '''
    Convert text documents into dense numerical tf-idf arrays.

    trainData: training data (iterable of text documents)
    trainLabel: training labels
    testData: test data (iterable of text documents)
    testLabel: test labels
    reduceDimensionality: when truthy, keep only the top 10% of the features
        ranked by f_classif on the training data

    return (trainDataTransformed, trainLabel, testDataTransformed, testLabel);
    the labels are passed through unchanged.
    '''

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    # Stay sparse until the very end: the matrices are densified exactly once,
    # after the optional feature selection. (Previously the data was densified
    # first and `.toarray()` was then called again on plain ndarrays, which
    # raised AttributeError whenever reduceDimensionality was set.)
    trainDataTransformed = vectorizer.fit_transform(trainData)
    testDataTransformed = vectorizer.transform(testData)

    if reduceDimensionality:
        # `percentile` is expressed in percent (0-100); the former value 0.10
        # meant 0.1% of the features rather than 10%.
        selector = SelectPercentile(f_classif, percentile=10)
        selector.fit(trainDataTransformed, trainLabel)

        trainDataTransformed = selector.transform(trainDataTransformed)
        testDataTransformed = selector.transform(testDataTransformed)

    return trainDataTransformed.toarray(), trainLabel, testDataTransformed.toarray(), testLabel
Пример #24
0
def eval(ds, clf, splitProportion=0.2, p=4):
    """Score `clf` separately on every label sequence of a split dataset.

    `ds.splitWithProportion(splitProportion)` is called once; its first
    result is used as test data and its second as training data. Both parts
    are converted with `labanUtil.fromDStoXY`. The label sequences of the
    training and test parts are then walked pairwise: for each pair a
    `SelectPercentile(chooser, percentile=p)` is fitted on the training
    features, `clf` is refitted on the selected features and scored on the
    test part.

    Label sequences whose training values are all zero are skipped.

    Returns three parallel lists: F1 scores, precisions and recalls.
    """
    tstdata, trndata = ds.splitWithProportion( splitProportion )
    X, Y = labanUtil.fromDStoXY(trndata)
    X_test, Y_test = labanUtil.fromDStoXY(tstdata)
    f1s = []
    ps = []
    rs = []
    for y, y_test in zip(Y, Y_test):
        # nothing to learn when the training label never fires
        if all(v == 0 for v in y):
            continue
        selector = SelectPercentile(chooser, percentile=p)
        selector.fit(X, y)
        clf.fit(selector.transform(X), y)
        pred = clf.predict(selector.transform(X_test))
        f1s.append(metrics.f1_score(y_test, pred))
        ps.append(metrics.precision_score(y_test, pred))
        rs.append(metrics.recall_score(y_test, pred))
    return f1s, ps, rs
Пример #25
0
class AnovaPercentileStep(SklearnStep):
    """Step that keeps the top `percentile` percent of features by `f_classif` score."""

    def __init__(self, percentile):
        super(AnovaPercentileStep, self).__init__()
        self._percentile = percentile

    def fit_transform(self):
        """Fit the selector on the step's input file and write the reduced data.

        Loads (x, y) with `load_svmlight`, fits a `SelectPercentile` on it and
        saves the selected features together with y to `self._output_path`.
        """
        # `percentile` is passed by keyword: recent scikit-learn releases only
        # accept it as a keyword argument.
        self._model = SelectPercentile(f_classif, percentile=self._percentile)
        # NOTE(review): `input_path` has no leading underscore unlike the other
        # path attributes -- presumably provided by SklearnStep; confirm.
        x, y = load_svmlight(self.input_path)
        x = self._model.fit_transform(x, y)
        save_svmlight(x, y, self._output_path)

    def transform(self, x=None):
        """Apply the fitted selector.

        With no argument, the test input file is loaded, reduced and saved to
        `self._test_output_path` (nothing is returned). With `x` given, the
        reduced `x` is returned and nothing is written.
        """
        if x is None:
            x, y = load_svmlight(self._test_input_path)
            x = self._model.transform(x)
            save_svmlight(x, y, self._test_output_path)
        else:
            transformed_x = self._model.transform(x)
            return transformed_x

    def get_param(self):
        """Return the step's configuration as a dict."""
        return {'percentile': self._percentile}
Пример #26
0
def train(config, model_data, data, record):
    """Cross-validate a model behind feature selection + scaling, then predict.

    Parameters:
        config: mapping; `config['n_jobs']` is assigned to the model.
        model_data: pair `(model_class_name, percentile)`.
        data: mapping with keys 'train_x', 'train_y', 'test_x', 'test_ids'.
        record: mapping that is filled in place with the keys 'model',
            'parameters', 'feats_percentile', 'mean_acc' and 'test_preds'.

    The estimator is wrapped in a pipeline
    `SelectPercentile(f_classif) -> StandardScaler -> model`. The pipeline is
    scored with 5-fold cross-validation (accuracy) and then fitted on the full
    training set to predict the test set. Returns None.
    """
    model_class_name, percentile = model_data
    model = instantiate_from_class_string(model_class_name)

    # best effort: failing to set n_jobs must not abort the run
    try:
        model.n_jobs = config['n_jobs']
    except Exception:
        log.info('Cannot set n_jobs for this model...')

    record['model'] = model_name(model)
    record['parameters'] = model.get_params()
    record['feats_percentile'] = percentile

    train_x = data['train_x']
    train_y = data['train_y']
    test_x = data['test_x']

    # Selection and scaling live inside the pipeline so that they are refitted
    # on each training fold during cross-validation.
    pipeline = make_pipeline(SelectPercentile(f_classif, percentile=percentile),
                             StandardScaler(), model)

    # estimate accuracy using cross-validation
    scores = cross_validation.cross_val_score(pipeline, train_x,
                                              train_y, cv=5,
                                              scoring='accuracy')
    record['mean_acc'] = scores.mean()

    # Predict on the test set. The pipeline is fed the RAW features: it already
    # performs selection and scaling itself. (Previously the data was selected
    # and scaled by hand and then passed to the pipeline, so both steps were
    # applied twice.)
    pipeline.fit(train_x, train_y)
    ids = data['test_ids']
    preds = pipeline.predict(test_x)
    record['test_preds'] = list(zip(ids, preds))
Пример #27
0
class FeatureSelection:
    """
    Feature selection.

    percentile: percentage of the features to keep

    """

    def __init__(self, percentile=70):
        self.percentile = percentile

    def fit(self, x, y):
        """Fit a chi-squared percentile selector on (x, y); stored as `sepChi`."""
        self.sepChi = SelectPercentile(
            score_func=chi2,
            percentile=self.percentile,
        )
        self.sepChi.fit(x, y)

    def transform(self, x, y):
        """Return (x reduced by the fitted selector, y unchanged)."""
        reduced = self.sepChi.transform(x)
        return (reduced, y)
Пример #28
0
def preprocess_input(feature_test,words_file="/home/mohamed/python/sherlok-tools/helpers/word_data.pkl", labels_file="/home/mohamed/python/sherlok-tools/helpers/label_data.pkl"):
    """Vectorize `feature_test` with a model fitted on the pickled corpus.

    The whole corpus from `words_file` / `labels_file` is used for training
    (the split is requested with test_size=0.0). A tf-idf vectorizer and a
    10% `f_classif` percentile selector are fitted on it, and both are then
    applied to `feature_test`.

    Returns (dense training features, dense features of `feature_test`,
    training labels).

    NOTE: pickle executes arbitrary code while loading -- only use trusted files.
    """
    # Context managers make sure the file handles are closed.
    with open(words_file, "r") as words_handle:
        word_data = pickle.load(words_handle)
    with open(labels_file, "r") as labels_handle:
        labels = pickle.load(labels_handle)

    ### test_size is the percentage of events assigned to the test set (remainder go into training)
    ### split training & testing -- the held-out parts are not used here
    features_train, _, labels_train, _ = cross_validation.train_test_split(word_data, labels, test_size=0.0, random_state=42)
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, encoding='windows-1256')
    features_train_transformed = vectorizer.fit_transform(features_train)
    # the caller-supplied documents take the place of the test split
    features_test_transformed = vectorizer.transform(feature_test)

    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data (the counts presumably assume 0/1 labels -- TODO confirm)
    print ("no. of positive training files:", sum(labels_train))
    print ("no. of negative training files:", len(labels_train)-sum(labels_train))


    return features_train_transformed, features_test_transformed, labels_train
Пример #29
0
def cross_val_score(clf, data, target, k):
    """Evaluate `clf` on `k` random train/test splits of a text dataset.

    For each of the `k` rounds the sample indices are reshuffled, the last
    `len(data) // k` indices form the test set and the rest the training
    set. Documents go through `data_process`, are turned into term
    frequencies (`CountVectorizer` + `TfidfTransformer(use_idf=False)`),
    reduced to the top 10% of features by chi-squared score, and `clf` is
    fitted and evaluated with `GetPrecisionRecallF1`.

    The splits are independent random draws, so test sets of different
    rounds may overlap.

    Parameters:
        clf: estimator exposing `fit` and `predict`.
        data: indexable collection of raw documents.
        target: labels, indexable in parallel with `data`.
        k: number of rounds; also fixes the test share (1/k).

    Returns a list with one `GetPrecisionRecallF1` result per round.
    """
    size = len(data)
    shuffle_arr = list(range(size))
    # floor division keeps the slice bounds integral
    test_size = size // k
    scores = []
    for _ in range(k):
        # separate shuffled train and test dataset
        random.shuffle(shuffle_arr)
        shuffle_train = shuffle_arr[:size - test_size]
        shuffle_test = shuffle_arr[size - test_size:]
        # Index into the `data` argument (the previous code read a module-level
        # `data_total` here and ignored the parameter's contents).
        data_train_raw = [data[j] for j in shuffle_train]
        target_train = [target[j] for j in shuffle_train]
        data_test_raw = [data[r] for r in shuffle_test]
        target_test = [target[r] for r in shuffle_test]

        data_train = data_process(data_train_raw)
        data_test = data_process(data_test_raw)

        # transform array of string to counts
        count_vect = CountVectorizer()
        X_train_counts = count_vect.fit_transform(data_train)
        # transform counts to frequencies
        tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
        X_train_tf = tf_transformer.transform(X_train_counts)

        # feature selection
        select = SelectPercentile(chi2, percentile=10)
        X_train_fs = select.fit_transform(X_train_tf, target_train)

        # train the model
        clf_train = clf.fit(X_train_fs, target_train)

        # test the model with the transformers fitted on the training part
        X_new_counts = count_vect.transform(data_test)
        X_new_tfidf = tf_transformer.transform(X_new_counts)
        X_new_fs = select.transform(X_new_tfidf)
        test_result = clf_train.predict(X_new_fs)
        scores.append(GetPrecisionRecallF1(test_result, target_test))
    return scores
def get_combined_separate_fsets(feature_sets, fs_fn='pct', ptile=10, nFeatures=5, score_fn=f_classif):
    """Run feature selection on each feature set separately and join the results.

    `feature_sets` maps a feature-set name to a DataFrame that carries the
    columns 'partition' and 'fatality_ind' next to its feature columns. For
    every set a selector is fitted on the rows whose partition is 'train'
    and then applied to all rows.

    fs_fn == 'pct' selects with SelectPercentile(percentile=ptile); any other
    value selects with SelectKBest(k=nFeatures). Kept columns are renamed to
    '<set name> <column>'.

    Returns the 'partition' / 'fatality_ind' columns of the last feature set
    iterated, joined with the selected columns of all sets.
    """
    meta_cols = ['partition', 'fatality_ind']
    selected_frames = []
    for fset_name, df in feature_sets.items():
        train_rows = df[df.partition == 'train']
        X_train = train_rows.drop(meta_cols, axis=1)
        y_train = train_rows.fatality_ind
        df_X = df.drop(meta_cols, axis=1)

        featureSelector = (
            SelectPercentile(score_func=score_fn, percentile=ptile)
            if fs_fn == 'pct'
            else SelectKBest(score_func=score_fn, k=nFeatures)
        )
        featureSelector.fit(X_train, y_train)

        kept_positions = list(featureSelector.get_support(indices=True))
        kept_names = [fset_name + ' ' + c for c in df_X.columns[kept_positions]]
        selected_frames.append(
            pd.DataFrame(featureSelector.transform(df_X),
                         index=df_X.index,
                         columns=kept_names))
    # `df` is whatever frame the loop saw last
    return df[meta_cols].join(pd.concat(selected_frames, axis=1))
Пример #31
0

# Script fragment: clean the raw documents, vectorize them with tf-idf, keep
# the top 10% of features and time the training of a Gaussian naive Bayes
# classifier. `features_train`, `features_test` and `labels_train` are
# expected to exist already (they are defined outside this excerpt).
features_train = pre_process(features_train)
features_test = pre_process(features_test)


# The vectorizer is fitted on the training documents only and reused for the test set.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train_transformed = vectorizer.fit_transform(features_train)
features_test_transformed = vectorizer.transform(features_test)

#print features_train_transformed
#print features_train_transformed.shape
# Keep the 10% best features by f_classif score; the selector is fitted on the
# training data and the results are converted to dense arrays.
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(features_train_transformed, labels_train)
features_train_transformed = selector.transform(features_train_transformed).toarray()
'''print '--------------------------------------------------------------------'
print features_train_transformed
print features_train_transformed.shape
print labels_train.shape'''
features_test_transformed = selector.transform(features_test_transformed).toarray()
'''print features_test_transformed.shape
print labels_test.shape
print type(features_train_transformed)
print type(labels_train)'''


# Fit the classifier and report the time spent in fit(), rounded to 3 decimals.
clf = GaussianNB()
t0 = time()
clf.fit(features_train_transformed, labels_train)
print "training time:", round(time()-t0, 3), "s"
Пример #32
0
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Script fragment: compare logistic regression on all features against the
# same model on the half of the features kept by SelectPercentile.
cancer = load_breast_cancer()

# Append 50 columns of seeded Gaussian noise to the real features.
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
X_w_noise = np.hstack([cancer.data, noise])

# 50/50 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise, cancer.target, random_state=0, test_size=.5)

# Keep 50% of the features, using the selector's default scoring function;
# the selector is fitted on the training data only.
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
X_train_selected = select.transform(X_train)

# Show which entries of the support mask are selected.
# NOTE(review): get_support() presumably has one entry per feature (column),
# so the 'Sample index' axis label looks like it should read
# 'Feature index' -- confirm before changing.
mask = select.get_support()
print(mask)
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel('Sample index')
plt.show()


X_test_selected = select.transform(X_test)

# The same estimator object is fitted twice: first on all features, then on
# the selected ones; each fit is scored on the matching test matrix.
lr = LogisticRegression()
lr.fit(X_train, y_train)
print('Score with all features: {:.3f}'.format(lr.score(X_test, y_test)))
lr.fit(X_train_selected, y_train)
print('Score with selected features: {:.3f}'.format(lr.score(X_test_selected, y_test)))
Пример #33
0
# Script fragment: `selector`, `vectorizer` and `features_cv_transformed` are
# defined outside this excerpt.
# Reduce the cross-validation matrix with the already fitted selector.
features_cv_transformed=selector.transform(features_cv_transformed)
# Map the indices of the kept features back to their vocabulary terms.
selected_word_indices = selector.get_support(indices=True)
vocab = vectorizer.get_feature_names()
trimmed_vocab = [vocab[i] for i in selected_word_indices]
# Python 2 print statement: prints a one-element list wrapping trimmed_vocab.
print[trimmed_vocab]
'''
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             stop_words="english")
features_transformed = vectorizer.fit_transform(word_data)
features_transformed = features_transformed.toarray()
selector = SelectPercentile(f_classif, percentile=20)
selector.fit(features_transformed, sentiid)
features_transformed = selector.transform(features_transformed)
##################################################################################################################################################
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5)):
                       tokenizer=stem_comment,
                       max_features=5000)

# Script fragment: `vect`, `Xtrain`, `Xtest`, `ytrain` and `ytest` are defined
# outside this excerpt.
# Build the document-term matrix from the training texts.
dtm = vect.fit_transform(Xtrain)
words = vect.get_feature_names()
print('dtm matrix')

from sklearn.feature_selection import SelectPercentile, mutual_info_classif

# Keep the top 20% of terms ranked by mutual information with the labels.
selector = SelectPercentile(mutual_info_classif, percentile=20)
dtm_reduced = selector.fit_transform(dtm, ytrain)
selector_scores = selector.scores_
print('selected')

# Apply the fitted vectorizer and selector to the test texts.
dtm_test = vect.transform(Xtest)
dtm_selected = selector.transform(dtm_test)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.preprocessing import label_binarize

# Train a random forest with default settings on the reduced training matrix
# and predict class probabilities and labels for the reduced test matrix.
model_random_forest = RandomForestClassifier()
model_random_forest.fit(dtm_reduced, ytrain)
prob_pred = model_random_forest.predict_proba(dtm_selected)
pred = model_random_forest.predict(dtm_selected)

# Confusion matrix and per-class report of the test predictions.
cm_random_forest = confusion_matrix(ytest, pred)
cr_random_forest = classification_report(ytest, pred)

#%%
Пример #35
0
# Script fragment: `data`, `train`, `train_new` and `test_new` are defined
# outside this excerpt.
# Split `data` back into its leading train.shape[0] rows and the remainder.
train = data[:train.shape[0]]
test = data[train.shape[0]:]
train_y = train['click']

# Bag-of-words counts of the 'user_tags' column; the vocabulary is fitted on
# the training rows only.
cntv = CountVectorizer()
cntv.fit(train['user_tags'])
train_a = cntv.transform(train['user_tags'])
test_a = cntv.transform(test['user_tags'])
train_new = sparse.hstack(
    (train_new, train_a), 'csr'
)  # hstack stacks the blocks horizontally (column-wise); the blocks must have the same number of rows. Signature: hstack(blocks, format=None, dtype=None)
test_new = sparse.hstack((test_new, test_a), 'csr')
# NOTE: despite its name, SKB holds a SelectPercentile, not a SelectKBest.
SKB = SelectPercentile(chi2, percentile=95).fit(
    train_new,
    train_y)  # difference: SelectKBest keeps the top-n ranked features, SelectPercentile keeps the top n% ranked features
train_new = SKB.transform(train_new)
test_new = SKB.transform(test_new)
'''
在稀疏矩阵存储格式中:
# - COO 格式在构建矩阵时比较高效
# - CSC 和 CSR 格式在乘法计算时比较高效
A.todense()
# 可以转化为普通矩阵:
'''
#%%
#统计特征构造
#adid统计特征,不同种类数量(已经创建了记录数的统计,现在是一个特征对应另外一个特征的种类)
## 由于adid是次样本层级的粒度,是聚集到点击率的层面所以是重要的特征,adid基本与广告信息表一一对应,我们象征性的选择广告id与其他挑选出来的id进行特征nunique统计
adid_nuq = [
    'model', 'make', 'os', 'city', 'province', 'user_tags', 'app_id',
    'carrier', 'nnt', 'devtype', 'app_cate_id', 'inner_slot_id'
Пример #36
0
    def train_classifier_use_feature_selection(self):
        """Train a multinomial naive Bayes text classifier behind feature selection.

        Pipeline: CountVectorizer -> TfidfTransformer -> SelectPercentile ->
        MultinomialNB. The training documents, labels and all hyper-parameters
        (`docs_train`, `y_train`, `stopwords`, `_ngram_range`, `_use_idf`,
        `_score_func`, `_percentile`, `_alpha`, ...) are read from names
        defined outside this method.

        Side effects: prints progress and evaluation results, and writes the
        boolean feature-support mask, one value per line, to
        `path_to_store_feature_selection_boolean_file`.

        Returns:
            (clf, count_vect): the fitted classifier and the fitted vectorizer.
        """

        # Get list of features
        count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=_ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print ("Shape of train data is "+str(X_CV.shape))

        # tfidf transformation###

        tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        #################
        # feature selection
        #################

        selector = SelectPercentile(score_func=_score_func, percentile=_percentile)

        print ("Fitting data with feature selection ...")
        selector.fit(X_tfidf, y_train)

        # get how many features are left after feature selection
        X_features = selector.transform(X_tfidf)

        print ("Shape of array after feature selection is "+str(X_features.shape))

        clf = MultinomialNB(alpha=_alpha).fit(X_features, y_train)

        # get the features which are selected and write to file

        feature_boolean = selector.get_support(indices=False)

        # the context manager closes the file even if a write fails
        with open(path_to_store_feature_selection_boolean_file, 'w') as f:
            f.writelines(str(fb) + '\n' for fb in feature_boolean)


        ##################
        # get cross validation score
        ##################

        # NOTE(review): the folds reuse features that were selected on the
        # whole training set, so this estimate is presumably optimistic.
        scores = cross_val_score(clf, X_features, y_train, cv=10, scoring='f1_weighted')
        print ("Cross validation score: "+str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation
        # NOTE(review): the message says "accuracy" but the scores above are
        # weighted F1.

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


        ####################
        #test clf on test data
        ####################

        X_test_CV = count_vect.transform(docs_test)

        print ("Shape of test data is "+str(X_test_CV.shape))

        X_test_tfidf = tfidf_transformer.transform(X_test_CV)

        # apply feature selection on test data too
        X_test_selector = selector.transform(X_test_tfidf)
        print ("Shape of array for test data after feature selection is "+str(X_test_selector.shape))

        y_predicted = clf.predict(X_test_selector)

        # print the mean accuracy on the given test data and labels

        print ("Classifier score on test data is: %0.2f " % clf.score(X_test_selector,y_test))


        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        return clf, count_vect
Пример #37
0
def benchmark(X_train, X_test, y5_train, y5_test, y3_train, y3_test, y2_train,
              y2_test, exp_folder, ds_folder, perc_f):
    """Run the regression and classification benchmarks for one percentile.

    The features are standardized (scaler fitted on the training set), then
    reduced three times with `SelectPercentile(percentile=perc_f)`: with
    `f_regression` against the 5-class target and with `f_classif` against
    the 3-class and 2-class targets.

    Every configuration in `CONFIGS_REGRESSION` is run on the 5-class data and
    every configuration in `CONFIGS_CLASSIFICATION` on the 2-class and 3-class
    data through `train_test_export_save_per_exp_type`. Results are logged to
    `<path>/<experiment label>/log/results.txt` and one bar chart per
    classification experiment is exported with `export_chart_bar`.

    Parameters:
        X_train, X_test: feature matrices.
        y5_train/y5_test, y3_train/y3_test, y2_train/y2_test: targets of the
            5-, 3- and 2-class experiments.
        exp_folder, ds_folder: path components for the output location.
        perc_f: percentage of features to keep.

    Any exception is logged through `config.logger` and re-raised.
    Returns None.
    """

    config.logger.info('benchmark_text: ' + str(perc_f))

    try:

        subfolder = 'all/best_k/' + str(perc_f) + '/'
        path = OUTPUT_FOLDER + exp_folder + ds_folder + 'benchmark/' + subfolder

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        config.logger.debug('OK. feature selection')
        # feature selection -- `percentile` is passed by keyword because
        # recent scikit-learn releases only accept it as a keyword argument
        best5 = SelectPercentile(f_regression, percentile=perc_f)
        best3 = SelectPercentile(f_classif, percentile=perc_f)
        best2 = SelectPercentile(f_classif, percentile=perc_f)

        X_train_best5 = best5.fit_transform(X_train, y5_train)
        X_test_best5 = best5.transform(X_test)

        X_train_best3 = best3.fit_transform(X_train, y3_train)
        X_test_best3 = best3.transform(X_test)

        X_train_best2 = best2.fit_transform(X_train, y2_train)
        X_test_best2 = best2.transform(X_test)

        best_estimators = []

        # chart data, collected separately for the 2-class and 3-class runs
        x_axis_2 = []
        x_axis_3 = []
        y_axis_2 = []
        y_axis_3 = []

        title = 'Webpage Text Features'
        x_axis_label = 'Classifiers'
        y_axis_label = 'F1-measure'

        # --------------------------------------------------------------------------------------------------------------
        # regression experiment
        # --------------------------------------------------------------------------------------------------------------
        config.logger.info('starting experiments regression (5-classes)')

        with open(path + EXP_5_CLASSES_LABEL + '/log/results.txt',
                  "w") as file_log_regression:
            file_log_regression.write(HEADER_CLASSIFICATION)
            for estimator, hyperparam, grid_method in CONFIGS_REGRESSION:
                out = []
                label = estimator.__class__.__name__ + '.' + str(
                    perc_f) + '.' + EXP_5_CLASSES_LABEL
                out, best_estimator = train_test_export_save_per_exp_type(
                    estimator, label, hyperparam, grid_method, X_train_best5,
                    X_test_best5, y5_train, y5_test, EXP_5_CLASSES_LABEL, 0,
                    out, file_log_regression, subfolder, exp_folder, ds_folder)
            file_log_regression.flush()

        # --------------------------------------------------------------------------------------------------------------
        # classification experiment
        # --------------------------------------------------------------------------------------------------------------
        config.logger.info(
            'starting experiments classification (2-classes and 3-classes)')
        for exp_type in (EXP_2_CLASSES_LABEL, EXP_3_CLASSES_LABEL):
            with open(path + exp_type + '/log/results.txt',
                      "w") as file_log_classification:
                file_log_classification.write(HEADER_CLASSIFICATION)
                # pick the data, chart buffers and threshold of this experiment
                if exp_type == EXP_2_CLASSES_LABEL:
                    _X_train = X_train_best2
                    _X_test = X_test_best2
                    _y_train = y2_train
                    _y_test = y2_test
                    y_axis = y_axis_2
                    x_axis = x_axis_2
                    graph_file = 'graph.' + str(perc_f) + '.2-class.png'
                    threshold = THRESHOLD_LABEL_2class
                elif exp_type == EXP_3_CLASSES_LABEL:
                    _X_train = X_train_best3
                    _X_test = X_test_best3
                    _y_train = y3_train
                    _y_test = y3_test
                    y_axis = y_axis_3
                    x_axis = x_axis_3
                    graph_file = 'graph.' + str(perc_f) + '.3-class.png'
                    threshold = THRESHOLD_LABEL_3class
                else:
                    raise Exception('blah! error')

                for estimator, hyperparam, grid_method in CONFIGS_CLASSIFICATION:
                    out = []
                    label = estimator.__class__.__name__ + '.' + str(
                        perc_f) + '.' + exp_type
                    out, best_estimator = train_test_export_save_per_exp_type(
                        estimator, label, hyperparam, grid_method, _X_train,
                        _X_test, _y_train, _y_test, exp_type, 0, out,
                        file_log_classification, subfolder, exp_folder,
                        ds_folder)
                    best_estimators.append(
                        (estimator.__class__.__name__, best_estimator))
                    # the third column of every result row feeds the chart's y axis
                    y_axis.extend(np.array(out)[:, 2])
                    x_axis.append(
                        best_estimator.__class__.__name__.replace(
                            'Classifier', ''))

                config.logger.info(
                    'experiments classification done! exporting charts...')
                export_chart_bar(x_axis, y_axis, graph_file, ds_folder,
                                 exp_folder, perc_f, exp_type, title,
                                 x_axis_label, y_axis_label, threshold)
                config.logger.info('charts exported!')
                file_log_classification.flush()

    except Exception as e:
        config.logger.error(repr(e))
        raise
# Load the pickled targets and device ids. Context managers close the file
# handles (the previous one-liners left them open).
# NOTE: pickle executes arbitrary code while loading -- only use trusted files.
with open("../generated/group.p", "r") as _pickle_file:
    target = pickle.load(_pickle_file)
with open("../generated/device_id.p", "r") as _pickle_file:
    device_id = pickle.load(_pickle_file)

# Script fragment: `train`, `test` and `target` are defined outside this excerpt.
# Read only the device_id column of the training file ...
trainDevices = pd.read_csv("../../../data/gender_age_train.csv",
                           usecols=["device_id"])
# ... and left-join the precomputed indices onto it by device_id; the index
# produced by the merge is discarded afterwards.
# NOTE(review): `on=` is combined with `left_index=True` here -- confirm that
# the installed pandas version accepts this combination.
indexes = pd.read_csv("../generated/raddarIndices.csv")
indexes = pd.merge(trainDevices,
                   indexes,
                   how="left",
                   on="device_id",
                   left_index=True).reset_index().drop(["index"], axis=1)
######################
#   Feature Selection
######################
# Keep the top 23% of features by chi-squared score; the selector is fitted on
# the training data and applied to both matrices.
fs = SelectPercentile(chi2, percentile=23).fit(train, target)
train = fs.transform(train)
test = fs.transform(test)

##################
#   Pre Processing
##################
# Encode the labels as integers, then expand them with to_categorical.
targetEncoder = LabelEncoder()
target = targetEncoder.fit_transform(target)
target = np_utils.to_categorical(target)


##################
#  Build Model
##################
def modelBuilder():
    """Instantiate an empty ``Sequential`` model.

    NOTE(review): the model is neither configured nor returned here -- the
    function body appears to be truncated in this excerpt; confirm against
    the full source.
    """
    model = Sequential()
def _fit_and_record(res, name, clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and append its test predictions to ``res``."""
    clf.fit(X_train, y_train)
    res.append({
        'name': name,
        'clf': clf,
        'pred': clf.predict(X_test),
        'target': y_test
    })


def main():
    """Compare Multinomial Naive Bayes text classifiers and print a score table.

    The data directory is taken from ``sys.argv[1]`` and must contain the
    sub-directories ``Training`` and ``Test``. Both corpora are loaded with
    every combination of the ``stemmer`` / ``stop`` options, vectorised as
    unigram counts and unigram TF-IDF, and used in twelve experiments:
    eight without feature selection, two with
    ``SelectPercentile(chi2, percentile=80)`` and two with
    ``SelectFromModel(threshold=30)``. Macro-averaged precision, recall and
    F1 of every experiment are printed with ``tabulate``.

    The function only uses ``print(...)`` with a single argument so that it
    produces the same output on Python 2 and Python 3.
    """
    data_loc = sys.argv[1]
    train_dir = os.path.join(data_loc, 'Training')
    test_dir = os.path.join(data_loc, 'Test')

    stemmer = PorterStemmer()

    # Corpus variants: (key, stemmer argument, stop argument).
    variants = [
        ('stem_stop', stemmer, True),
        ('no_stem_stop', None, True),
        ('stem_no_stop', stemmer, False),
        ('no_stem_no_stop', None, False),
    ]

    # Get data for all combos
    train = {}
    test = {}
    for key, variant_stemmer, stop in variants:
        train[key] = load_files_correctly(train_dir,
                                          stemmer=variant_stemmer,
                                          stop=stop)
        test[key] = load_files_correctly(test_dir,
                                         stemmer=variant_stemmer,
                                         stop=stop)

    # Vectorize data: every variant gets its own vectoriser, fitted on the
    # training corpus only. Each dict maps key -> (train matrix, test matrix).
    tfidf = {}
    count = {}
    for features, vectorizer_cls in ((tfidf, TfidfVectorizer),
                                     (count, CountVectorizer)):
        for key, _, _ in variants:
            vectorizer = vectorizer_cls(ngram_range=(1, 1),
                                        decode_error='ignore')
            X_train = vectorizer.fit_transform(train[key].data)
            X_test = vectorizer.transform(test[key].data)
            features[key] = (X_train, X_test)

    def new_classifier(fit_prior):
        # fit_prior=True is the MultinomialNB default.
        return MultinomialNB(alpha=0.01, fit_prior=fit_prior)

    res = []

    # Experiments without feature selection:
    # (name, fit_prior, feature matrices, corpus variant).
    plain_runs = [
        ('Naive Bayes alpha=0.01; unigram; countvectorizer; no stemmer; no stopper; no feature selection',
         True, count, 'no_stem_no_stop'),
        ('Naive Bayes alpha=0.01; unigram; tfidfvectorizer; no stemmer; no stopper; no feature selection',
         True, tfidf, 'no_stem_no_stop'),
        ('Naive Bayes alpha=0.01 fit_prior=False; unigram; tfidfvectorizer; no stemmer; no stopper; no feature selection',
         False, tfidf, 'no_stem_no_stop'),
        ('Naive Bayes alpha=0.01; unigram; countvectorizer; stemmer; no stopper; no feature selection',
         True, count, 'stem_no_stop'),
        ('Naive Bayes alpha=0.01; unigram; countvectorizer; no stemmer; stopper; no feature selection',
         True, count, 'no_stem_stop'),
        ('Naive Bayes alpha=0.01; unigram; countvectorizer; stemmer; stopper; no feature selection',
         True, count, 'stem_stop'),
        ('Naive Bayes alpha=0.01 fit_prior=False; unigram; tfidfvectorizer; no stemmer; stopper; no feature selection',
         False, tfidf, 'no_stem_stop'),
        ('Naive Bayes alpha=0.01 fit_prior=False; unigram; tfidfvectorizer; stemmer; stopper; no feature selection',
         False, tfidf, 'stem_stop'),
    ]
    for name, fit_prior, features, key in plain_runs:
        X_train, X_test = features[key]
        _fit_and_record(res, name, new_classifier(fit_prior), X_train,
                        train[key].target, X_test, test[key].target)

    # Experiments keeping the top 80 percent of features by chi2 score.
    # (The second label used to read "tfifdfvectorizer".)
    percentile_runs = [
        ('Naive Bayes alpha=0.01; unigram; countvectorizer; no stemmer; stopper; feature selection - SelectPercentile=80',
         True, count, 'no_stem_stop'),
        ('Naive Bayes alpha=0.01 fit_prior=False; unigram; tfidfvectorizer; stemmer; stopper; feature selection - SelectPercentile=80',
         False, tfidf, 'stem_stop'),
    ]
    for name, fit_prior, features, key in percentile_runs:
        X_train, X_test = features[key]
        ch2 = SelectPercentile(chi2, percentile=80)
        X_train_selected = ch2.fit_transform(X_train, train[key].target)
        X_test_selected = ch2.transform(X_test)
        _fit_and_record(res, name, new_classifier(fit_prior),
                        X_train_selected, train[key].target, X_test_selected,
                        test[key].target)

    # Experiments selecting features with a pre-fitted Naive Bayes model.
    # The first label used to claim threshold=39 although 30 is what runs.
    from_model_runs = [
        ('Naive Bayes alpha=0.01; unigram; countvectorizer; no stemmer; stopper; feature selection - SelectFromModel threshold=30',
         True, count, 'no_stem_stop'),
        ('Naive Bayes alpha=0.01 fit_prior=False; unigram; tfidfvectorizer; stemmer; stopper; feature selection - SelectFromModel threshold=30',
         False, tfidf, 'stem_stop'),
    ]
    for name, fit_prior, features, key in from_model_runs:
        X_train, X_test = features[key]
        base_clf = new_classifier(fit_prior)
        base_clf.fit(X_train, train[key].target)
        model = SelectFromModel(base_clf, threshold=30, prefit=True)
        _fit_and_record(res, name, new_classifier(fit_prior),
                        model.transform(X_train), train[key].target,
                        model.transform(X_test), test[key].target)

    # This is the best model (the last experiment that was run).
    best_model = res[-1]['name']

    rows = []
    headers = [
        'No.', 'Name', 'Precision', 'Recall', 'Precision/Recall', 'F1 Score'
    ]
    for number, val in enumerate(res, 1):
        precision = metrics.precision_score(val['target'],
                                            val['pred'],
                                            average='macro')
        recall = metrics.recall_score(val['target'],
                                      val['pred'],
                                      average='macro')
        f1_score = metrics.f1_score(val['target'],
                                    val['pred'],
                                    average='macro')
        rows.append([
            number, val['name'], precision, recall, precision / recall,
            f1_score
        ])
    print('\nResults - ')
    print(tabulate(rows, headers, tablefmt='orgtbl'))
    print('\n')

    print('Best model is -> %s' % best_model)
Пример #40
0
 def feature_select(self):
     """Keep the top 95 percent of features ranked by chi2 score.

     The selector is fitted on ``self.train_x`` / ``self.train_y`` only and
     then applied to both ``self.train_x`` and ``self.test_x``.

     Returns:
         Tuple ``(selected train matrix, selected test matrix)``.
     """
     selector = SelectPercentile(chi2, percentile=95)
     selector.fit(self.train_x, self.train_y)
     reduced_train = selector.transform(self.train_x)
     reduced_test = selector.transform(self.test_x)
     return reduced_train, reduced_test
    def process(self, df):
        """Build chi2-selected character TF-IDF features and persist them.

        ``df`` must provide the columns ``type`` ('train' / 'test'), ``text``
        and ``label``. A character 1-2-gram TF-IDF vectoriser is fitted on
        all rows, the top 5 percent of features by chi2 score (computed on
        the training rows) are kept, the selected features and their scores
        are written to ``char_tfidf_feature.txt`` and the train / test
        matrices are pickled below ``config.output_dir``.

        NOTE(review): the rows are split positionally (the first ``n_train``
        rows are treated as the training part), so ``df`` is assumed to list
        all 'train' rows before the 'test' rows -- confirm with the caller.

        Returns:
            Tuple ``(train_features, test_features, train_labels)``. The
            feature matrices are dense arrays; ``test_features`` is ``None``
            when ``df`` contains no 'test' rows (this used to raise
            ``AttributeError``).
        """
        print('process:', self.name)
        # 1). create strings based on text
        train = df[df['type'] == 'train']
        print(train.head())

        test = df[df['type'] == 'test']
        print(test.head())
        print('train.shape:', train.shape)

        n_train = train.shape[0]
        print('n_train:', n_train)
        n_test = test.shape[0]
        print('n_test:', n_test)

        # 2). fit a TfidfVectorizer on text
        vec_text = TfidfVectorizer(analyzer='char', ngram_range=(1, 2),
                                   max_df=0.8, min_df=2, sublinear_tf=True)
        text_tfidf = vec_text.fit_transform(df['text'].tolist())
        print('text Tfidf.shape:', text_tfidf.shape)
        vocabulary = vec_text.vocabulary_
        print('vocabulary size:%d' % len(vocabulary))
        print('vocabulary list:')
        # Preview the first ten entries only; stop instead of walking the
        # whole vocabulary.
        for shown, (k, v) in enumerate(vocabulary.items()):
            if shown >= 10:
                break
            print("%s\t%s" % (k, v))

        print("feature set nums: ", len(vocabulary))
        # get_feature_names() was removed from recent scikit-learn releases;
        # prefer its replacement when it exists.
        if hasattr(vec_text, 'get_feature_names_out'):
            feature_names = vec_text.get_feature_names_out()
        else:
            feature_names = vec_text.get_feature_names()

        # 3). keep the 5 percent of features with the best chi2 score
        selector = SelectPercentile(chi2, percentile=5)
        selector.fit(text_tfidf[:n_train], df.iloc[:n_train]['label'])
        text_tfidf = selector.transform(text_tfidf)

        selected = selector.get_support(indices=True)
        features = [feature_names[i] for i in selected]
        feature_scores = [selector.scores_[i] for i in selected]
        sorted_feature = sorted(zip(features, feature_scores), key=lambda x: x[1], reverse=True)
        feature_output_file = config.output_dir + 'char_tfidf_feature.txt'
        with open(feature_output_file, "w", encoding="utf-8") as f:
            # One line per feature: rank, feature text, chi2 score.
            for rank, item in enumerate(sorted_feature, 1):
                f.write("\t".join([str(rank), item[0], str(item[1])]) + "\n")
        print("feature select done, new feature set num: ", len(feature_scores))

        # save train and test into separate files
        tfidf_train = text_tfidf[:n_train, :]
        tfidf_train_feature_path = config.output_dir + "train.text.char.tfidf.pkl"
        with open(tfidf_train_feature_path, "wb") as f:
            pickle.dump(tfidf_train, f)
        print('text tfidf features of training set saved in %s' % tfidf_train_feature_path)

        test_features = None
        if n_test > 0:
            # test set is available
            tfidf_test = text_tfidf[n_train:, :]
            tfidf_test_feature_path = config.output_dir + "test.text.char.tfidf.pkl"
            with open(tfidf_test_feature_path, "wb") as f:
                pickle.dump(tfidf_test, f)
            print('text tfidf features of test set saved in %s' % tfidf_test_feature_path)
            test_features = tfidf_test.toarray()
        return tfidf_train.toarray(), test_features, train['label'].values
                 ]
# Replace missing values with 0 before exporting the data.
data_frame = data_frame.fillna(0)

# Store to my_dataset for easy export below (one dict entry per index value).
my_dataset = data_frame.to_dict('index')
print("")
print("New data_dict:\n", my_dataset)

# Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

# Univariate feature selection: keep the 50 percent of features with the
# highest f_classif score.
from sklearn.feature_selection import SelectPercentile, f_classif
selector = SelectPercentile(f_classif, percentile=50)
selector.fit(features, labels)
# NOTE(review): both names below hold the same transformed `features`; no
# train/test split happens in this excerpt.
features_train = selector.transform(features)
features_test = selector.transform(features)

# Pair every feature name with its selection flag and score, best score first.
# features_list[1:] skips the first entry -- presumably the label name; verify.
SelectPercentile_features = zip(selector.get_support(), features_list[1:], selector.scores_)
SelectPercentile_features = sorted(SelectPercentile_features, key=lambda x: x[2], reverse=True)
print ("(Features marked with 'True' are used in the final algorithm.):")
for feature in SelectPercentile_features:
    print(feature)

# Task 4: Try a variety of classifiers
# Provided to give you a starting point. Try a variety of classifiers.
# from sklearn.naive_bayes import GaussianNB
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Task 5: Tune your classifier to achieve better than .3 precision and recall using our testing script.
clf = GradientBoostingClassifier(init=None,
Пример #43
0
def lightgbm_make_submission():
    """Train a LightGBM regressor and write the test predictions to CSV.

    Steps:
      1. Read the train / test frames with ``read_csv()`` and turn each into
         features and a target frame with ``make_train_set``.
      2. Keep the 50 percent of features with the best ``f_regression``
         score (selector fitted on the training data only).
      3. Hold out 20 percent of the training rows for validation and train
         for 300 boosting rounds.
      4. Predict the test set and write the ``Pred`` column, indexed by
         ``TERMINALNO`` (index label ``Id``), to ``path_test_out``.
    """
    train, test = read_csv()
    x_train, y_train = make_train_set(train)
    x_test, y_test = make_train_set(test)
    y_train = y_train['Y']
    # feature selection
    # `percentile` is passed by keyword: recent scikit-learn releases reject
    # it as a positional argument.
    sel = SelectPercentile(f_regression, percentile=50)
    x_train = sel.fit_transform(x_train, y_train)
    x_test = sel.transform(x_test)
    train_x, valid_x, train_y, valid_y = train_test_split(x_train,
                                                          y_train,
                                                          test_size=0.2,
                                                          random_state=0)
    params = {
        # 'boosting': 'dart',
        'learning_rate': 0.01,
        'application': 'regression',
        'max_depth': -1,
        'num_leaves': 5,
        'verbosity': -1,
        'feature_fraction': 0.8,
        'feature_fraction_seed': 9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'bagging_seed': 9,
        'min_data_in_leaf': 6,
        'min_sum_hessian_in_leaf': 11,
        'metric': 'mae',
    }
    d_train = lgb.Dataset(train_x, label=train_y)
    d_valid = lgb.Dataset(valid_x, label=valid_y)
    # Both sets are evaluated during training; progress is logged every 20
    # rounds.
    watchlist = [d_train, d_valid]
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=300,
                      valid_sets=watchlist,
                      verbose_eval=20)
    print(
        "*******************************start predict***************************"
    )
    preds = model.predict(x_test)
    # Store the predictions in the frame returned for the test set, then
    # rename its columns for the submission file.
    # NOTE(review): the rename assumes y_test has exactly two columns at this
    # point (the id column plus 'Y') -- confirm against make_train_set.
    y_test['Y'] = preds
    print(y_test['Y'].var())
    y_test.columns = ['TERMINALNO', 'Pred']
    y_test.set_index('TERMINALNO', inplace=True)
    y_test.to_csv(path_test_out,
                  columns=['Pred'],
                  index=True,
                  index_label=['Id'])
Пример #44
0
print(df.dtypes)  # data type of every column

# Split the frame into predictor variables and the target variable.
dataset = df.values  # the frame's values as an array
# Columns 0-6 (the first seven) hold the predictor variables (x).
x = dataset[:,
            0:7]
# Column 7 (the eighth) holds the target: the "class" of the localisation
# site (y).
y = dataset[:,
            7]

# (Optional) feature engineering: automatic feature selection to reduce
# dimensionality.
# (I) Univariate statistics with SelectPercentile: keep half of the features.
# The original author reports that lip, alm1 and alm2 were selected.
select = SelectPercentile(
    percentile=50
)
select.fit(x, y)
x_selected_bypercent = select.transform(x)

# Boolean mask marking which of the seven features were kept.
x_chosen1 = select.get_support(
)
print(x_chosen1)
# The values of the selected feature columns.
print(x_selected_bypercent
      )

# (II) Recursive Feature Elimination - keeps the most important features,
# which matters for some machine learning algorithms.
# Pearson correlation matrix; other options: 'kendall', 'spearman'.
# The original author lists the correlations in descending order as:
# mcg, gvh, alm1, lip, aac, chg, alm2.
print(
    df.corr(method='pearson')
)
# Estimator for RFE; kernel options: 'linear', 'poly', 'rbf', 'sigmoid',
# 'precomputed'.
estimator = SVR(
    kernel="linear"
)
select = RFE(estimator, n_features_to_select=3,
Пример #45
0
# Fit the vectorizer on the training texts only, then apply it to the test
# texts.
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test)

feature_names = vectorizer.get_feature_names()

print(features_train.shape)

print(features_test.shape)

# Feature selection: keep the 10 percent of features with the highest
# f_classif score, fitted on the training data only.

selector = SelectPercentile(f_classif, percentile=10)
selector.fit(features_train, labels_train)

features_train_transformed = selector.transform(features_train)
features_test_transformed = selector.transform(features_test)

# Bare expression: has no effect when run as a script (notebook leftover).
features_train_transformed.shape

print("No of features after selection :", features_train_transformed.shape[1])

# Using MultinomialNB

clf = MultinomialNB()

# Candidate smoothing values for the grid search.
grid_param = {'alpha': [0.001, 0.01, 0.1, 0.5, 1, 10]}

grid_search = GridSearchCV(estimator=clf,
                           param_grid=grid_param,
                           cv=5,
# Pick the complexity (SVM C value) with the highest score in uar_scores.
optimum_complexity = complexities[np.argmax(uar_scores)]
print('\nOptimum complexity: {0:.6f}, maximum UAR on Devel {1:.1f}\n'.format(
    optimum_complexity,
    np.max(uar_scores) * 100))

if not feat_selection:
    # No feature selection: fit on the training set, predict the test set.
    clf = svm.LinearSVC(C=optimum_complexity, random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
else:
    # Sweep the share of features kept by f_classif and record the
    # macro-averaged recall on the development set for each setting.
    uar = []
    percentile = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    for p in percentile:
        # The selector is fitted on the training set only.
        selection = SelectPercentile(f_classif, percentile=p)
        feat_selected = selection.fit_transform(X_train, y_train)
        feat_devel = selection.transform(X_devel)
        print('\nComplexity {0:.6f}'.format(optimum_complexity))
        # A linear-kernel SVC is used here; the LinearSVC alternative is
        # kept below for reference.
        #clf = svm.LinearSVC(C=optimum_complexity, random_state=0)
        clf = svm.SVC(C=optimum_complexity, kernel='linear', random_state=0)
        clf.fit(feat_selected, y_train)
        y_pred = clf.predict(feat_devel)
        uar.append(
            recall_score(y_devel, y_pred, labels=classes, average='macro'))
        print('UAR on Devel {0:.1f}'.format(uar[-1] * 100))
        if show_confusion:
            print('Confusion matrix (Devel):')
            print(classes)
            print(confusion_matrix(y_devel, y_pred, labels=classes))
    # Percentile with the best development-set recall (first one on ties).
    optimum_percentile = percentile[np.argmax(uar)]
    print(
        '\nOptimum percentile: {0:.6f}, maximum UAR on Devel {1:.1f}\n'.format(
Пример #47
0
def col_filter(mtx_train, y_train, mtx_test, func = chi2, percentile = 90):
    """Reduce both matrices to the best-scoring ``percentile`` of columns.

    The selector scores the columns with ``func`` on the training data only
    and the same column subset is then applied to the test matrix.

    Returns:
        Tuple ``(filtered mtx_train, filtered mtx_test)``.
    """
    selector = SelectPercentile(func, percentile=percentile)
    selector.fit(mtx_train, y_train)
    filtered_train = selector.transform(mtx_train)
    filtered_test = selector.transform(mtx_test)
    return filtered_train, filtered_test
Пример #48
0
                                        "_train.npy")
            X_test = load_numpy_matrix(feature_set_path +
                                       "Google_TfidfFeatures" + tag +
                                       "_test.npy")

        print "\nFeatures", FEATURES[featureV]
        print '\nTotal:', X_train.shape[0] + X_test.shape[0]
        print 'Features:', X_train.shape[1]
        print "\nClass distribution", Counter(y_train)
        print "\nClass distribution", Counter(y_test)

        # FEATURE SELECT
        if featureV == 0:
            selector = SelectPercentile(score_func=f_classif,
                                        percentile=perc).fit(X_train, y_train)
            X_train = selector.transform(X_train)
            X_test = selector.transform(X_test)
        elif featureV < 2:
            selector = SelectKBest(
                score_func=chi2,
                k=min(200000, int(X_train.shape[1] * (perc / 100.0)))).fit(
                    X_train, y_train)
            X_train = selector.transform(X_train)
            X_test = selector.transform(X_test)

        print X_train.shape
        print X_test.shape

        # FEATURE SCALING
        if featureV == 0:
            scaler = preprocessing.StandardScaler().fit(X_train)
Пример #49
0
            'csr', 'bool')
    print('cv prepared !')

    sparse.save_npz(path + '/feature/base_train_csr.npz', base_train_csr)
    sparse.save_npz(path + '/feature/base_predict_csr.npz', base_predict_csr)

# Stack the numeric feature columns with the base sparse features into a
# single float32 CSR matrix, for the training and the prediction set alike.
train_csr = sparse.hstack(
    (sparse.csr_matrix(train_x[num_feature]), base_train_csr),
    'csr').astype('float32')
predict_csr = sparse.hstack(
    (sparse.csr_matrix(predict_x[num_feature]), base_predict_csr),
    'csr').astype('float32')
print(train_csr.shape)
# Keep the 50 percent of columns with the highest chi2 score; the selector
# is fitted on the training matrix only and applied to both matrices.
feature_select = SelectPercentile(chi2, percentile=50)
feature_select.fit(train_csr, train_y)
train_csr = feature_select.transform(train_csr)
predict_csr = feature_select.transform(predict_csr)
print('feature select')
print(train_csr.shape)

# Number of boosting rounds for the model below (passed as n_estimators).
n = 1500
# First column of a header-less CSV, copied into an array.
# NOTE(review): presumably a ranked list of column names; confirm usage.
data_col = pd.read_csv('col_sort_one11.csv', header=None)
col = data_col[0].values.copy()

lgb_model = lgb.LGBMClassifier(boosting_type='gbdt',
                               num_leaves=60,
                               max_depth=-1,
                               learning_rate=0.1,
                               n_estimators=n,
                               max_bin=425,
                               subsample_for_bin=50000,
Пример #50
0
def cross(train_path, test_path, select_feat):
    """Train or cross-validate an XGBoost model on pickled feature matrices.

    Args:
        train_path: path of the pickle holding the training feature matrix
            (it must provide ``toarray()``).
        test_path: path of the pickle holding the test feature matrix.
        select_feat: ``"1"`` trains on a 90/10 hold-out split with early
            stopping; any other value runs 5-fold ``xgb.cv`` on all
            training rows.

    The labels are the encoded ``group`` column of
    ``./data/train_no_events.csv``. Nothing is returned; shapes, training
    progress and the CV result are printed. All ``print`` calls take a
    single argument so the output is the same on Python 2 and Python 3.

    NOTE(review): unpickling can execute arbitrary code -- only pass paths
    of files generated by this project.
    """
    print('Read train and test')
    # ``str`` replaces ``np.str``, an alias of the builtin that has been
    # removed from NumPy.
    train = pd.read_csv("./data/train_no_events.csv",
                        dtype={'device_id': str})
    train.drop(['age', 'gender'], axis=1, inplace=True)
    train_label = train["group"]
    lable_group = LabelEncoder()
    train_label = lable_group.fit_transform(train_label)

    test = pd.read_csv("./data/test_no_events.csv",
                       dtype={'device_id': str})
    test["group"] = np.nan

    with open(train_path, 'rb') as trf:
        train_sp = pickle.load(trf)

    with open(test_path, 'rb') as ttf:
        test_sp = pickle.load(ttf)

    train_sp = train_sp.toarray()

    if select_feat == "1":
        X_train, X_val, y_train, y_val = train_test_split(train_sp,
                                                          train_label,
                                                          train_size=.90,
                                                          random_state=10)
        print(X_train.shape)
        print(train.shape)
        print(X_val.shape)
        print("# Feature Selection")
        # percentile=100 keeps every feature; lower it to actually filter.
        selector = SelectPercentile(f_classif, percentile=100)

        selector.fit(X_train, y_train)

        X_train = selector.transform(X_train)
        X_val = selector.transform(X_val)
        print(X_train.shape)
        print(X_val.shape)
        train_sp = selector.transform(train_sp)
        test_sp = selector.transform(test_sp)

        dtrain = xgb.DMatrix(X_train, y_train)
        dvalid = xgb.DMatrix(X_val, y_val)

        # Linear-booster alternative; currently unused (params2 is trained).
        params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "booster": "gblinear",
            "eval_metric": "mlogloss",
            "eta": 0.05,
            "silent": 1,
            "lambda": 3,
            "alpha": 2,
        }

        params2 = {
            "objective": "multi:softprob",
            "num_class": 12,
            "booster": "gbtree",
            "eval_metric": "mlogloss",
            "eta": 0.05,
            "max_depth": 6,
            "subsample": 0.7,
            "colsample_bytree": 0.7,
            "num_parallel_tree": 1,
            "seed": 114,
            "silent": 1,
        }

        # Stop when the validation log-loss has not improved for 10 rounds.
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        gbm = xgb.train(params2,
                        dtrain,
                        1000,
                        evals=watchlist,
                        early_stopping_rounds=10,
                        verbose_eval=True)

    else:
        # percentile=100 keeps every feature; lower it to actually filter.
        selector = SelectPercentile(f_classif, percentile=100)

        selector.fit(train_sp, train_label)

        train_sp = selector.transform(train_sp)
        test_sp = selector.transform(test_sp)

        dtrain = xgb.DMatrix(train_sp, train_label)
        dtest = xgb.DMatrix(test_sp)

        # Tree-booster alternative; currently unused (params2 is evaluated).
        params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "booster": "gbtree",
            "eval_metric": "mlogloss",
            "eta": 0.05,
            "max_depth": 8,
            "subsample": 0.7,
            "colsample_bytree": 0.7,
            "num_parallel_tree": 1,
            "seed": 114,
            "silent": 1,
        }

        params2 = {
            "objective": "multi:softprob",
            "num_class": 12,
            "booster": "gblinear",
            "max_depth": 6,
            "eval_metric": "mlogloss",
            "eta": 0.05,
            "silent": 1,
            "lambda": 3,
            "alpha": 2,
        }

        res = xgb.cv(params2,
                     dtrain,
                     num_boost_round=700,
                     nfold=5,
                     callbacks=[
                         xgb.callback.print_evaluation(show_stdv=False),
                         xgb.callback.early_stop(3)
                     ])
        print(res)
Пример #51
0
    def train_classifier_use_feature_selection(self):
        """Train an SGD classifier on percentile-selected features.

        Works on the module-level ``x_train`` / ``y_train`` and ``x_test`` /
        ``y_test`` data and on the module-level settings ``_score_func``,
        ``_percentile``, ``_loss``, ``_penalty``, ``_alpha`` and ``_n_iter``.
        Prints the 10-fold cross-validation scores (``f1_weighted``) on the
        selected training features as well as the score, classification
        report and confusion matrix on the test data.

        Returns:
            The fitted ``SGDClassifier``.
        """
        # ---- feature selection ------------------------------------------
        selector = SelectPercentile(score_func=_score_func,
                                    percentile=_percentile)

        print("Fitting data with feature selection ...")
        selector.fit(x_train, y_train)

        # how many features survive the selection
        x_features = selector.transform(x_train)
        selected_shape = str(x_features.shape)
        print("Shape of array after feature selection is " + selected_shape)

        clf = SGDClassifier(loss=_loss,
                            penalty=_penalty,
                            alpha=_alpha,
                            n_iter=_n_iter,
                            random_state=42)
        clf = clf.fit(x_features, y_train)

        # boolean mask of the selected features (not used further here)
        feature_boolean = selector.get_support(indices=False)

        # ---- cross validation on the training data ----------------------
        scores = cross_val_score(clf,
                                 x_features,
                                 y_train,
                                 cv=10,
                                 scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # mean of the 10 fold scores together with twice their standard
        # deviation
        mean_score = scores.mean()
        spread = scores.std() * 2
        print("Cross validation accuracy: %0.2f (+/- %0.2f)" %
              (mean_score, spread))

        # ---- evaluation on the test data --------------------------------
        # the test data has to go through the same feature selection
        x_test_selector = selector.transform(x_test)
        test_shape = str(x_test_selector.shape)
        print("Shape of array for test data after feature selection is " +
              test_shape)

        y_predicted = clf.predict(x_test_selector)

        # mean accuracy on the given test data and labels
        test_score = clf.score(x_test_selector, y_test)
        print("Classifier score on test data is: %0.2f " % test_score)

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        return clf
Пример #52
0
# Demo: univariate feature selection on the breast cancer data with 50 added
# noise columns.
cancer = load_breast_cancer()

import numpy as np  # used to generate the noise that is appended

# Reproducible noise: 50 normally distributed columns, one row per sample.
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
# The original features come first, followed by the 50 noise columns.
X_w_noise = np.hstack([cancer.data, noise])

X_train, X_test, y_train, y_test = train_test_split(X_w_noise,
                                                    cancer.target,
                                                    random_state=0,
                                                    test_size=0.5)

select = SelectPercentile(percentile=50)  # keep only 50% of the features
select.fit(X_train, y_train)
X_train_selected = select.transform(X_train)

print("X_train.shape : {}".format(X_train.shape))
print("X_train_selected.shape : {}".format(X_train_selected.shape))
mask = select.get_support()  # marks which features were selected and which were not
print(mask)  # per the original author: mostly the original (non-noise) features get selected
import matplotlib.pyplot as plt

plt.matshow(mask.reshape(1, -1), cmap="gray")  # white = selected, black = not selected
plt.xlabel("feature num")

# Compare the performance with all features against the selected features.
from sklearn.linear_model import LogisticRegression

X_test_selected = select.transform(X_test)
Пример #53
0
class LocalRegression(object):
    """Per-sample regression fitted on the geographically nearest training rows.

    For every row passed to ``predict`` the model asks a
    ``geoNN.GeoNNFinder`` built on the training locations for ``k``
    neighbours of that row's location, fits every configured regressor on
    those neighbours only and averages the regressors' predictions.  Rows
    whose location contains a null value receive the mean of the training
    response instead.

    methods:
        fit( data, response, location_data )
        predict( data, location_data)

    """

    def __init__(self,
                 k=200,
                 feature_selection=False,
                 regressor=LinearRegression,
                 verbose=False,
                 params=None,
                 response_f=identity,
                 inv_response_f=identity):
        """Configure the local model.

        k: number of neighbours requested from the neighbour finder.
        feature_selection: when true, a ``SelectPercentile(f_regression,
            percentile=50)`` is re-fitted on every neighbourhood before the
            regressors are trained.
        regressor: an estimator class, or a list/tuple of estimator classes;
            each one is instantiated here.
        verbose: when true, ``predict`` prints the row index every 100 rows.
        params: keyword arguments for the estimator class.  With several
            classes it must be indexable by position (``params[i]`` holds
            the keyword arguments of ``regressor[i]``).  ``None`` or an
            empty container means "no keyword arguments".
        response_f: applied to the neighbours' response before fitting.
        inv_response_f: applied to the whole prediction vector at the end.
        """
        self.k = k
        self.response_f = response_f
        self.inv_response_f = inv_response_f
        # Per-coefficient counter of how often a fitted coefficient was
        # (almost) zero; sized for at most 10000 coefficients.
        self.zero_coef_ = np.zeros(10000)
        self.verbose = verbose
        self.feature_selection = feature_selection
        # `percentile` is passed by keyword: recent scikit-learn releases
        # reject it as a positional argument.
        self.selector = SelectPercentile(f_regression, percentile=50)
        # If regressor is a sequence of classes, instantiate all of them.
        if isinstance(regressor, (list, tuple)):
            per_model = params if params else [{}] * len(regressor)
            self.regressor = [
                factory(**per_model[i])
                for i, factory in enumerate(regressor)
            ]
        else:
            self.regressor = [regressor(**(params or {}))]

    def __str__(self):
        # self.regressor is always a list of instances, so the name is built
        # from the instances' class names.
        names = "+".join(type(reg).__name__ for reg in self.regressor)
        return "%dK%sLocalRegression" % (self.k, names.replace(" ", ""))

    @staticmethod
    def _as_array(obj):
        """Return ``obj.values`` when that attribute exists, else ``obj``."""
        return getattr(obj, "values", obj)

    def _count_zero_coefs(self, reg):
        """Best-effort update of ``zero_coef_`` from ``reg.coef_``.

        Regressors without ``coef_`` (or with more coefficients than
        ``zero_coef_`` can hold) are skipped silently.
        """
        try:
            near_zero = np.nonzero(abs(reg.coef_) < .000001)[0]
            self.zero_coef_[near_zero] += 1
        except (AttributeError, IndexError, TypeError, ValueError):
            pass

    def fit(self, data, response, location_data):
        """Store the training set and build the neighbour finder.

        Each argument may be an object exposing ``.values`` (e.g. a pandas
        container) or a plain array.  No regressor is trained here; the
        regressors are fitted per row inside ``predict``.

        Returns ``self``.
        """
        self.data_ = self._as_array(data)
        self.response_ = self._as_array(response)
        self.location_data_ = self._as_array(location_data)

        self.gnn = geoNN.GeoNNFinder(self.location_data_)

        return self

    def predict(self, data, location_data, weights=None):
        """Predict one value per row of ``data``.

        data: feature rows, same column layout as the training data.
        location_data: one location per row of ``data``; the first two
            columns are handed to the neighbour finder.
        weights: accepted for interface compatibility; currently unused.

        Returns a 1-d array of length ``data.shape[0]``.
        Raises ValueError when the two inputs have different lengths.
        """
        if location_data.shape[0] != data.shape[0]:
            raise ValueError(
                "length of first argument does not equal length of second argument."
            )

        n = location_data.shape[0]
        data = self._as_array(data)
        location_data = self._as_array(location_data)
        prediction = np.zeros(n)

        for i in range(n):
            if self.verbose and i % 100 == 0:
                print(i)

            location = location_data[i, :]
            if np.any(pandas.isnull(location)):
                # No usable location: fall back to the training mean.
                # NOTE(review): this mean is taken on the raw response but
                # still goes through inv_response_f below -- confirm that is
                # intended when response_f is not the identity.
                prediction[i] = self.response_.mean()
                continue

            # presumably the indices of the k nearest training rows -- verify
            # against geoNN.GeoNNFinder.find
            inx = self.gnn.find(location[0], location[1], self.k)
            sub_data = self.data_[inx, :]
            sub_response = self.response_f(self.response_[inx, :])
            # Keep the query row 2-d (1 x n_features): scikit-learn
            # estimators expect a 2-d array even for a single sample.
            to_predict = data[i:i + 1, :]
            if self.feature_selection:
                sub_data = self.selector.fit_transform(sub_data, sub_response)
                to_predict = self.selector.transform(to_predict)

            for reg in self.regressor:
                reg.fit(sub_data, sub_response)
            # Only the last regressor contributes to the zero-coefficient
            # statistics.
            if self.regressor:
                self._count_zero_coefs(self.regressor[-1])

            # take the average over all regressors
            prediction[i] = np.mean(
                [reg.predict(to_predict) for reg in self.regressor])

        prediction = self.inv_response_f(prediction)
        # Pull out-of-range predictions back inside [0, 100]; the replacement
        # values 99 and 4 are kept as they were.
        prediction[prediction > 100] = 99
        prediction[prediction < 0] = 4
        return prediction

    def naive_n_estimators(self, i, argweights_sorted):
        """returns the deci-percentile plus 1, i.e. if the weight is the 86th percentile, return 9"""
        n = int(float(argweights_sorted[i]) / len(argweights_sorted) * 10) + 1
        return n

    def trivial_n_estimators(self, i, argweights_sorted):
        """Always use a single estimator, whatever the weight."""
        return 1
Пример #54
0
print(data_frame)
# Silence NumPy's divide-by-zero / invalid-value floating-point warnings for
# the rest of the script (presumably raised while scoring features -- verify).
numpy.seterr(divide='ignore', invalid='ignore')

# Train/Test Split
x_train, x_test, y_train, y_test = train_test_split(data_frame,
                                                    target,
                                                    random_state=0)
print(f'\nTrain data shape: {x_train.shape}')
print(f'Test data shape: {x_test.shape}')
print(f'Target shape {y_test.shape}')

# Feature Selection: keep the best-scoring 50% of the features. The selector
# is fitted on the training split only and then applied to both splits.
selection = SelectPercentile(percentile=50)
selection.fit(x_train, y_train)
x_train_compressed = selection.transform(x_train)
print(f'\nTrain shape after selection: {x_train_compressed.shape}')
# One boolean per input column: True where the column was kept.
selection_status = list(selection.get_support())
print(f'Selection Status: {selection_status} Length: {len(selection_status)}')
x_test_compressed = selection.transform(x_test)

# Printing Selected Column Names: pair every column name with its keep/drop
# flag instead of tracking the position with a manual counter.
selected_columns = [
    column_name
    for column_name, is_selected in zip(data_column_names, selection_status)
    if is_selected
]
print(f'Selected Columns: {selected_columns} Length: {len(selected_columns)}')

# Applying Linear Regression
Пример #55
0
# In[40]:


# Accuracy over the test set for the classifier trained on all the features.
# `test` is densified with .toarray() (presumably a SciPy sparse matrix --
# verify) and the labels are flattened and cast to int64 before scoring.
accuracy = clf.score(test.toarray(), np.asarray(y_test.flatten(), dtype=np.int64))
print("Accuracy of test set: ", accuracy)


# In[56]:


# Selecting the top-p percentile features, scored with f_classif.
# NOTE(review): SelectPercentile's `percentile` is expressed on a 0-100
# scale, so p = 0.1 keeps the top 0.1% of the features, not 10% -- confirm
# that this is intended.
p = 0.1 
select = SelectPercentile(f_classif, percentile=p)
# The selector is fitted on the training data only.
select.fit(train, y_train)
train_select = select.transform(train)


# In[59]:


# Batch-wise training on the selected features.
n = len(corpus)
batch = 1000
clf2 = GaussianNB()
# Reduce the test data with the selector fitted above.
test_select = select.transform(test)

start = time.time()
# Only the int(n/batch) complete batches are visited; a trailing partial
# batch of n % batch items is not covered by this range.
for i in range(int(n/batch)):
    # s and e are the start and end offsets of batch i.
    s = i*batch
    e = (i+1)*batch
Пример #56
0
        label=r'Univariate score ($-Log(p_{value})$)', color='darkorange',
        edgecolor='black')

# #############################################################################
# Compare to the weights of an SVM

# Styling shared by both bar series.
bar_style = dict(width=.2, edgecolor='black')

# Linear SVM on every feature: a feature's weight is the sum of its squared
# coefficients, rescaled so that the largest weight equals 1.
clf = svm.SVC(kernel='linear')
clf.fit(X, y)
svm_weights = (clf.coef_ ** 2).sum(axis=0)
svm_weights = svm_weights / svm_weights.max()

plt.bar(X_indices - .25, svm_weights, label='SVM weight', color='navy',
        **bar_style)

# Same model, trained only on the features kept by the selector.
X_selected = selector.transform(X)
clf_selected = svm.SVC(kernel='linear')
clf_selected.fit(X_selected, y)
svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
svm_weights_selected = svm_weights_selected / svm_weights_selected.max()

# Draw each bar at the original index of the feature it belongs to.
selected_indices = X_indices[selector.get_support()]
plt.bar(selected_indices - .05, svm_weights_selected,
        label='SVM weights after selection', color='c', **bar_style)

plt.title("Comparing feature selection")
plt.xlabel('Feature number')
plt.yticks(())
plt.axis('tight')
plt.legend(loc='upper right')
plt.show()
Пример #57
0
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import cross_val_score
import pylab as pl
# Use 5-fold cross-validation on the training set to find a suitable amount
# of feature selection: try every odd percentile from 1 to 99.
percentiles = range(1, 100, 2)
results = []
for pct in percentiles:
    fs = SelectPercentile(score_func=chi2, percentile=pct)
    x_train_fs = fs.fit_transform(x_train, y_train)
    # 5-fold validation, so `scores` holds five values; keep their mean,
    # rounded to 4 decimals.
    scores = cross_val_score(dt, x_train_fs, y_train, cv=5)
    results.append(round(scores.mean(), 4))
results = np.array(results)
print(results)

# Percentile with the highest mean cross-validation score.
best_percentile = percentiles[results.argmax()]
print("the Optimal Number of Features is %d" % best_percentile)

pl.plot(percentiles, results)
pl.xlabel("percentile of features")
pl.ylabel("acc")
pl.show()

# Retrain with the best percentile found above and score on the test set.
fs = SelectPercentile(score_func=chi2, percentile=best_percentile)
x_train_fs = fs.fit_transform(x_train, y_train)
selectedFeatures = np.array(vec.feature_names_)[fs.get_support()]
dt.fit(x_train_fs, y_train)
x_test_fs = fs.transform(x_test)
test_score = dt.score(x_test_fs, y_test)
print("the score of DT with filtering features is ", test_score)

# Summary of a grid search (`grid`, `X_test`) defined elsewhere in the file.
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))

#. ILLUSTRATING INFORMATION LEAKAGE
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

# Purely random data: 100 samples, 10000 features and a target drawn
# independently of the features.
rnd = np.random.RandomState(seed=0)
X = rnd.normal(size=(100, 10000))
y = rnd.normal(size=(100, ))

# Feature selection fitted on ALL the data, outside cross-validation.
select = SelectPercentile(score_func=f_regression, percentile=5)
select.fit(X, y)
X_selected = select.transform(X)
print("X_selected.shape: {}".format(X_selected.shape))

# Cross-validating only the ridge model on the pre-selected features.
ridge_only_scores = cross_val_score(Ridge(), X_selected, y, cv=5)
print("Cross-validation accuracy (cv only on ridge): {:.2f}".format(
    ridge_only_scores.mean()))

# Selection and ridge wrapped in one pipeline, so the selector is re-fitted
# inside every cross-validation fold.
pipe = Pipeline([
    ("select", SelectPercentile(score_func=f_regression, percentile=5)),
    ("ridge", Ridge()),
])
pipeline_scores = cross_val_score(pipe, X, y, cv=5)
print("Cross-validation accuracy (pipeline): {:.2f}".format(
    pipeline_scores.mean()))

#CONVENIENT PIPELINE INTERFACE WITH MAKE_PIPELINE
import numpy as np
from sklearn.svm import SVC
Пример #59
0
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
from sklearn.linear_model import LogisticRegression

# Breast-cancer features extended with 50 columns of Gaussian noise.
cancer = load_breast_cancer()
rng = np.random.RandomState(42)
n_samples = cancer.data.shape[0]
noise = rng.normal(size=(n_samples, 50))
X_w_noise = np.concatenate([cancer.data, noise], axis=1)

# Half of the samples for training, half for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise, cancer.target, random_state=0, test_size=0.5)

# Keep the best-scoring 50% of the features, judged on the training split.
select = SelectPercentile(percentile=50).fit(X_train, y_train)
X_train_selected = select.transform(X_train)

print("X_train.shape: {}".format(X_train.shape))
print("X_train_selected.shape: {}".format(X_train_selected.shape))

# Boolean mask of the kept columns, printed and drawn as a one-row image.
mask = select.get_support()
print(mask)
plt.matshow(mask[np.newaxis, :], cmap="gray_r")
plt.show()
Пример #60
0
# Standardize: the scaler is fitted on the training data only and then
# applied to both the training and the test data.
scaler = StandardScaler().fit(X_train_raw)
X_train_standardized = scaler.transform(X_train_raw)
X_test_standardized = scaler.transform(X_test_raw)

# Univariate selection by mutual information: keep the top 60% of the
# features (drop the lowest-scoring 40%), again fitted on training data only.
MI_selector = SelectPercentile(mutual_info_classif, percentile=60)
MI_selector.fit(X_train_standardized, y_train.values.ravel())

# Reduce both splits with the fitted selector.
X_train_MI = MI_selector.transform(X_train_standardized)
X_test_MI = MI_selector.transform(X_test_standardized)

# Summary
print("Feature Selection Results - Univariate Feature Selection")
print("Filter Result:")
print("Number of features: ", X_train_MI.shape[1])

# Horizontal bar chart of the ten highest mutual-information scores,
# labelled with the original column names.
plt.figure(figsize=(10, 8), dpi=60)
feat_scores = pd.Series(MI_selector.scores_, index=X_train_raw.columns)
top_feat = feat_scores.nlargest(10)
top_feat.plot(kind='barh')
plt.title("Feature Ranking by Mutual Information Score")