Example #1
def build_dict_feature_imdb(double_features):
    sentences_train = []

    for ff in tqdm.tqdm(glob.glob(os.path.join(path_train_pos, '*.txt')),
                        desc="train pos"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_train.append(f.readline().strip())

    for ff in tqdm.tqdm(glob.glob(os.path.join(path_train_neg, '*.txt')),
                        desc="train neg"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_train.append(f.readline().strip())

    sentences_test = []
    for ff in tqdm.tqdm(glob.glob(os.path.join(path_test_pos, '*.txt')),
                        desc="test pos"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_test.append(f.readline().strip())

    for ff in tqdm.tqdm(glob.glob(os.path.join(path_test_neg, '*.txt')),
                        desc="test neg"):
        with io.open(ff, 'r', encoding='utf-8') as f:
            sentences_test.append(f.readline().strip())

    if model == "svm":
        X_train, vectorizer_fitted = build_dic_svm(sentences_train,
                                                   double_features)
        X_test, _ = build_dic_svm(sentences_test, double_features,
                                  vectorizer_fitted)
        n = X_train.shape[0] // 2  # first half positive, second half negative
        y_train = [1] * n + [0] * n
        y_test = [1] * n + [0] * n  # train and test splits have the same size here

    elif model == "cnn" or model == "lstm":
        X_train, tokenizer = build_dic_nn(sentences=sentences_train,
                                          double_features=double_features)

        X_test, _ = build_dic_nn(sentences=sentences_test,
                                 double_features=double_features,
                                 tokenizer=tokenizer)

        n = len(X_train) // 2  # first half positive, second half negative
        y_train = [1] * n + [0] * n
        y_test = [1] * n + [0] * n

    if feature_selection:
        print("Doing feature selection")
        if hashing_trick:
            fselect = SelectKBest(chi2, k=200000)
        else:
            if negation:
                fselect = SelectKBest(chi2, k=200000)
            else:
                fselect = SelectKBest(chi2, k=200000)

        X_train = fselect.fit_transform(X_train, y_train)

        X_test = fselect.transform(X_test)

    return X_train, X_test, y_train, y_test
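For context, a minimal usage sketch: the function reads several module-level globals (the IMDB directory paths, model, feature_selection, hashing_trick, negation) and the helper builders defined elsewhere in the script, so the values below are illustrative assumptions rather than part of the original code.

# Hypothetical setup: these module-level names are expected by the function above;
# the paths follow the standard aclImdb layout, the flag values are examples only.
path_train_pos = "aclImdb/train/pos"
path_train_neg = "aclImdb/train/neg"
path_test_pos = "aclImdb/test/pos"
path_test_neg = "aclImdb/test/neg"
model = "svm"               # or "cnn" / "lstm"
feature_selection = True
hashing_trick = False
negation = False

X_train, X_test, y_train, y_test = build_dict_feature_imdb(double_features=False)
print(X_train.shape, len(y_train))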
Example #2
def apply_feature_selection(X_train, y_train, X_test, features):
    if CONFIG['preprocessing']['use_feature_selection'] == 'random_forest':
        clf = RandomForestClassifier()
        clf = clf.fit(X_train.toarray(), y_train)
        features_scores = [(feature, score) for (score, feature) in sorted(
            zip(clf.feature_importances_, features), reverse=True)]
        selected_features = features_scores[:CONFIG['preprocessing']
                                            ['top_features_to_select']]
        selected_indices = np.searchsorted(features,
                                           [f[0] for f in selected_features])
        X_train = X_train[:, selected_indices]
        X_test = X_test[:, selected_indices]
        return X_train, y_train, X_test, selected_features
    if CONFIG['preprocessing']['use_feature_selection'] == 'chi2':
        algorithm = chi2
    elif CONFIG['preprocessing']['use_feature_selection'] == 'ANOVA':
        algorithm = f_classif
    else:
        raise ValueError("No implementation for " +
                         str(CONFIG['preprocessing']['use_feature_selection']))
    feature_selector = SelectKBest(
        algorithm, k=CONFIG['preprocessing']['top_features_to_select'])
    X_train = feature_selector.fit_transform(X_train, y_train)
    X_test = feature_selector.transform(X_test)
    features = [
        (feature, score)
        for (score, feature
             ) in sorted(zip(feature_selector.scores_, features), reverse=True)
    ]
    selected_features = features[:CONFIG['preprocessing']
                                 ['top_features_to_select']]
    return X_train, y_train, X_test, selected_features
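A rough invocation sketch for the chi2 branch; the CONFIG keys mirror the ones used above, while the toy corpus and values are made up for illustration.

# Toy illustration of the chi2 branch (hypothetical CONFIG values and data).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif  # used by the function above

CONFIG = {'preprocessing': {'use_feature_selection': 'chi2',
                            'top_features_to_select': 2}}

docs_train = ["good movie", "bad movie", "great plot", "awful plot"]
docs_test = ["good plot", "bad movie"]
y_train = [1, 0, 1, 0]

vec = CountVectorizer()
X_train = vec.fit_transform(docs_train)
X_test = vec.transform(docs_test)
features = vec.get_feature_names_out()  # get_feature_names() on older sklearn

X_train, y_train, X_test, selected = apply_feature_selection(
    X_train, y_train, X_test, features)
print(selected)  # [(feature, chi2 score), ...] for the top 2 features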
Example #3
def extract(max_gram, feat_dims, save_model=False):
    print "extract feature"

    vectorizer = TfidfVectorizer( min_df=2, max_df=0.95, max_features=None, 
            ngram_range=(1, max_gram), sublinear_tf = True )

    vectorizer = vectorizer.fit(reviews_train + reviews_unsup)
    feats_train_ori = vectorizer.transform(reviews_train)
    feats_test_ori = vectorizer.transform(reviews_test)
    print "size of orginal train features", feats_train_ori.shape

    for feat_dim in feat_dims:
        print "perform feature selection"

        fselect = SelectKBest(chi2 , k=feat_dim)
        feats_train = fselect.fit_transform(feats_train_ori, labels_train)
        feats_test = fselect.transform(feats_test_ori)

        print "save features"
        np.savez("feats/%d_%d.npz" % (max_gram, feat_dim), 
                feats_train=feats_train, feats_test=feats_test, 
                labels_train=labels_train, labels_test=labels_test)

        if save_model:
            print "save models"
            with open("models/vectorizer_%d.pkl" % max_gram, "wb") as fout:
                pickle.dump(vectorizer, fout, -1)

            with open("models/fselect_%d_%d.pkl" % (max_gram, feat_dim), "wb") as fout:
                pickle.dump(fselect, fout, -1)
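If save_model was set, the pickled vectorizer/selector pair can later be reloaded to featurize new text. A sketch, assuming the files written above exist (here for max_gram=2, feat_dim=10000):

# Reload a saved vectorizer/selector pair and featurize new reviews
# (assumes the pickle files produced above; the exact file names are examples).
import pickle

with open("models/vectorizer_2.pkl", "rb") as fin:
    vectorizer = pickle.load(fin)
with open("models/fselect_2_10000.pkl", "rb") as fin:
    fselect = pickle.load(fin)

new_reviews = ["a surprisingly good film", "dull and far too long"]
feats = fselect.transform(vectorizer.transform(new_reviews))
print(feats.shape)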
Example #4
def build_dict_feature_spd(double_features):
    sentences_pos = []

    ff = os.path.join(dataset_path_spd, 'rt-polarity_utf8.pos')

    with io.open(ff, 'r', encoding='UTF-8') as f:
        for line in tqdm.tqdm(f, desc="sentences pos"):
            # time.sleep(0.001)
            sentences_pos.append(line)

    sentences_neg = []
    ff = os.path.join(dataset_path_spd, 'rt-polarity_utf8.neg')
    with io.open(ff, 'r', encoding='UTF-8') as f:
        for line in tqdm.tqdm(f, desc="sentences neg"):
            # time.sleep(0.001)
            sentences_neg.append(line)

    sentences = sentences_pos + sentences_neg

    y = [1] * (len(sentences_pos)) + [0] * (len(sentences_neg))

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.2, random_state=58)

    if model == "svm":
        X_train, vectorizer = build_dic_svm(sentences_train, double_features)
        X_test, _ = build_dic_svm(sentences_test, double_features, vectorizer)
    elif model == "cnn" or model == "lstm":
        X_train, tokenizer = build_dic_nn(sentences=sentences_train,
                                          double_features=double_features)
        X_test, _ = build_dic_nn(sentences=sentences_test,
                                 double_features=double_features,
                                 tokenizer=tokenizer)

    if feature_selection:
        print("Doing feature selection")
        if hashing_trick:
            fselect = SelectKBest(chi2, k=9500)
        else:
            if negation:
                fselect = SelectKBest(chi2, k=9500)
            else:
                fselect = SelectKBest(chi2, k=8500)

        X_train = fselect.fit_transform(X_train, y_train)

        X_test = fselect.transform(X_test)

    return X_train, X_test, y_train, y_test
Example #5
def reduce_dim(vec, num_dim, method, label=None):
    """
    Dimension reduction. Two approaches are provided.
    SVD: Truncated SVD projects feature vectors onto a lower-dimensional subspace.
    chi2: Chi-square independence test examines the pairwise dependence of features and labels.
    """

    print "Performing dimension reduction"

    # Reduce the dimensions using truncated SVD or Chi-Square independence test
    if method == "SVD":
        svd = TruncatedSVD(n_components=num_dim)
        vec = svd.fit_transform(vec)
        # test = svd.transform(vec)
    elif method == "chi2" or method == "f_classif":
        fselect = SelectKBest((chi2 if method == "chi2" else f_classif), k=num_dim)
        vec = fselect.fit_transform(vec, label)
        # test = fselect.transform(vec)

    return vec
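A quick sketch of calling both branches on toy data; chi2 needs labels and non-negative features, SVD needs neither.

# Toy illustration of both reduction modes (hypothetical data).
import numpy as np

X = np.abs(np.random.RandomState(0).randn(20, 50))  # non-negative, as chi2 requires
y = [0, 1] * 10

X_svd = reduce_dim(X, num_dim=5, method="SVD")
X_chi2 = reduce_dim(X, num_dim=5, method="chi2", label=y)
print(X_svd.shape)   # (20, 5)
print(X_chi2.shape)  # (20, 5)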
Example #6
features = vectorizer.transform(opinions)
features_test = vectorizer.transform(opinions_test)


# In[13]:

print "Reducing dimension..."

from sklearn.feature_selection import SelectKBest, chi2, f_classif

fselect = SelectKBest(chi2, k=10000)


# In[14]:

train_data_features = fselect.fit_transform(features, article["trend"])
test_data_features = fselect.transform(features_test)


# # Train the model

# In[128]:

print "Training..."

model1 = MultinomialNB(alpha=0.0005)
model1.fit(train_data_features, article["trend"])

model2 = SGDClassifier(loss="modified_huber", n_iter=5, random_state=0, shuffle=True)
model2.fit(train_data_features, article["trend"])
Example #7
    stemmer = english_stemmer #PorterStemmer()
    for word in txt:
        b.append(stemmer.stem(word))

    # 5. Return a list of words
    return(b)

clean_train_reviews=[]
for txt in train['text']:
    clean_train_reviews.append("".join(cleanData(txt,True,True,True)))
    
clean_test_reviews=[]
for txt in test['text']:
    clean_test_reviews.append("".join(cleanData(txt,True,True,True)))


vectorizer = TfidfVectorizer( min_df=2, max_df=0.95, max_features = 200000, ngram_range = ( 1, 4 ),sublinear_tf = True )
vectorizer = vectorizer.fit(clean_train_reviews)
train_features = vectorizer.transform(clean_train_reviews)
test_features = vectorizer.transform(clean_test_reviews)

fselect = SelectKBest(chi2 , k=10000)
train_features = fselect.fit_transform(train_features, train["author"])
test_features = fselect.transform(test_features)

model1 = MultinomialNB(alpha=0.001)
model1.fit( train_features, train["author"] )

pred_1 = model1.predict( test_features.toarray() )

print(pred_1)
Example #8
    clean_test_reviews.append( " ".join( KaggleWord2VecUtility.review_to_wordlist( review )))

print "Vectorizing..."

vectorizer = TfidfVectorizer( min_df=2, max_df=0.95, max_features = 200000, ngram_range = ( 1, 4 ),
                              sublinear_tf = True )

vectorizer = vectorizer.fit(clean_train_reviews + unlabeled_clean_train_reviews)
train_data_features = vectorizer.transform( clean_train_reviews )
test_data_features = vectorizer.transform( clean_test_reviews )

print "Reducing dimension..."

from sklearn.feature_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2 , k=70000)
train_data_features = fselect.fit_transform(train_data_features, train["sentiment"])
test_data_features = fselect.transform(test_data_features)

print "Training..."

model1 = MultinomialNB(alpha=0.0005)
model1.fit( train_data_features, train["sentiment"] )

model2 = SGDClassifier(loss='modified_huber', n_iter=5, random_state=0, shuffle=True)
model2.fit( train_data_features, train["sentiment"] )

p1 = model1.predict_proba( test_data_features )[:,1]
p2 = model2.predict_proba( test_data_features )[:,1]

print "Writing results..."
Example #9
do_feature_elimination = False
if do_feature_elimination:
    estimator =  RandomForestClassifier(n_estimators=2000, criterion='entropy', max_depth=None, 
                                 min_samples_split=16, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                 max_features='auto', max_leaf_nodes=None, bootstrap=False, oob_score=False, 
                                 n_jobs=10, random_state=None, verbose=0, warm_start=False, class_weight=None)
    selector = RFECV(estimator, step=1, cv=5, scoring='log_loss')
    X_train = selector.fit_transform(X_train, train_labels)
    print 'after feature elimination', X_train.shape
    X_test = selector.transform(X_test)
    
do_feature_selection = False
if do_feature_selection:
    ch2 = SelectKBest(chi2, k=4000)
    X_train = ch2.fit_transform(X_train, train_labels)
    X_test = ch2.transform(X_test)

do_pca = False

if do_pca:
    k = 100
    add_pca_to_original = True
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    pca = PCA(n_components=k, copy=True, whiten=False)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    if add_pca_to_original:
        X_train = np.hstack((X_train, X_train_pca))
        X_test = np.hstack((X_test, X_test_pca))
Example #10
def main():

    os.chdir("/Users/[email protected]/Desktop/workspace/sentiment.analysis")


    ##################### Initialization #####################

    write_to_csv = False
    tune_parameter = False
    Mix = True

    # term_vector_type = {"TFIDF", "Binary", "Int", "Word2vec", "Word2vec_pretrained"}
    # {"TFIDF", "Int", "Binary"}: Bag-of-words model with {tf-idf, word counts, presence/absence} representation
    # {"Word2vec", "Word2vec_pretrained"}: Google word2vec representation {without, with} pre-trained models
    # Specify model_name if there's a pre-trained model to be loaded
    #vector_type = "TFIDF"
    vector_type = 'Word2vec_pretrained'

    #model_name = "selftrainBad.bin"

    model_name = "wiki.fr.vec"


    # model_type = {"bin", "reg"}
    # Specify whether pre-trained word2vec model is binary
    #model_type = "bin"
       
    # Parameters for word2vec
    # num_features need to be identical with the pre-trained model
    num_features = 300    # Word vector dimensionality                      
    min_word_count = 5   # Minimum word count to be included for training                      
    num_workers = 4       # Number of threads to run in parallel
    context = 4         # Context window size                                                                                    
    downsampling = 1e-3   # Downsample setting for frequent words

    # training_model = {"RF", "NB", "SVM", "BT", "no"}
    training_model = "SVM"

    # feature scaling = {"standard", "signed", "unsigned", "no"}
    # Note: Scaling is needed for SVM
    scaling = "no"

    # dimension reduction = {"SVD", "chi2", "no"}
    # Note: For NB models, we cannot perform truncated SVD as it will make input negative
    # chi2 is feature selection based on the chi2 independence test
    dim_reduce = "no"
    num_dim = 200

    ##################### End of Initialization #####################

    print('parameter settings: ')
    print('vector_type:' + vector_type)
    print('training_model: ' + training_model)
    print('scaling: ' + scaling)
    print('dim_reduce: ' + dim_reduce )

    ########################### Main Program ###########################

    train_list = []
    test_list_t = []
    test_list_h = []
    test_list_c = []
    word2vec_input = []
    train_list2 = []
    pred = []

    language = 'french'

    train_language = 'german'
    test_language = 'french'

    trainFile = train_language + 'TrainData_100k.csv'
    trainFile2 = test_language + 'TrainData_100k.csv' ##

    testFile_t = test_language + 'TestData_cftwt.csv'
    testFile_h = test_language + 'TestData_cfdata.csv'
    testFile_c = test_language + 'TestData_deft.csv'
    #unlabFile = 'frenchUnlab.csv'

    train_data = pd.read_csv("data/" + trainFile, header=0, delimiter=",", quoting=0 )#, encoding='utf-8')
    if Mix == True:
        train_data2 = pd.read_csv("data/" + trainFile2, header=0, delimiter=",", quoting=0 )

    test_data_t = pd.read_csv("data/" + testFile_t, header=0, delimiter=",", quoting=0)# , encoding='utf-8')
    test_data_h = pd.read_csv("data/" + testFile_h, header=0, delimiter=",", quoting=0)# , encoding='utf-8')
    test_data_c = pd.read_csv("data/" + testFile_c, header=0, delimiter=",", quoting=0)# , encoding='utf-8')
   # unlab_train_data = pd.read_csv("data/" + unlabFile, header=0, delimiter=",", quoting=0)# , encoding='utf-8')


    if vector_type == "Word2vec":
        unlab_train_data = pd.read_csv("data/frenchUnlabeledTrainData.csv", header=0, delimiter=",", quoting=0)
        tokenizer = nltk.data.load('tokenizers/punkt/'+ language+'.pickle')
        logging.basicConfig(format='%(asctime)s: %(message)s', level=logging.INFO)

    ground_truth_t = test_data_t.sentiment
    ground_truth_h = test_data_h.sentiment
    ground_truth_c = test_data_c.sentiment
    # Extract words from reviews
    # xrange is faster when iterating
    if vector_type == "Word2vec" or vector_type == "Word2vec_pretrained":
        
        for i in xrange(0, len(train_data.review)):
            
            if vector_type == "Word2vec":
                # Decode utf-8 coding first
                word2vec_input.extend(review_to_doublelist(train_data.review[i].decode("utf-8"), language, tokenizer ))
                
           # print train_data.id[i]
            train_list.append(clean_review(train_data.review[i], language, output_format="list" ))
            #if i%1000 == 0:
                #print "Cleaning training review", i

        if Mix == True:
            for i in xrange(0, len(train_data2.review)):
                        
               # print train_data.id[i]
                train_list2.append(clean_review(train_data2.review[i], language, output_format="list" ))
                #if i%1000 == 0:
                    #print "Cleaning training review", i

           
        if vector_type == "Word2vec":                
            for i in xrange(0, len(unlab_train_data.review)):
                #print unlab_train_data.review[i]
                word2vec_input.extend(review_to_doublelist(unlab_train_data.review[i].decode("utf-8"), language, tokenizer))
                #if i%1000 == 0:
                    #print "Cleaning unlabeled training review", i
        
        for i in xrange(0, len(test_data_t.review)):
            test_list_t.append(clean_review(test_data_t.review[i], language, output_format="list"))
            #if i%1000 == 0:
                #print "Cleaning test review", i  
        for i in xrange(0, len(test_data_h.review)):
            test_list_h.append(clean_review(test_data_h.review[i], language, output_format="list"))
            #if i%1000 == 0:
                #print "Cleaning test review", i   
        for i in xrange(0, len(test_data_c.review)):
            test_list_c.append(clean_review(test_data_c.review[i], language, output_format="list"))
            #if i%1000 == 0:
                #print "Cleaning test review", i        

    elif vector_type != "no": 
        for i in xrange(0, len(train_data.review)):
            
            # Append raw texts rather than lists as Count/TFIDF vectorizers take raw texts as inputs
            train_list.append(clean_review(train_data.review[i], language) )
            #if i%1000 == 0:
               # print "Cleaning training review", i

        for i in xrange(0, len(test_data.review)):
            
            # Append raw texts rather than lists as Count/TFIDF vectorizers take raw texts as inputs
            test_list.append(clean_review(test_data.review[i], language))
            #if i%1000 == 0:
            #    print "Cleaning test review", i


    # Generate vectors from words
    if vector_type == "Word2vec_pretrained" or vector_type == "Word2vec":
        
        if vector_type == "Word2vec_pretrained":
            print "Loading the pre-trained model"
            if model_name.endswith(".bin"):
                #model = word2vec.Word2Vec.load_word2vec_format(model_name, binary=True)
                model = gensim.models.KeyedVectors.load_word2vec_format(model_name, binary=True , unicode_errors='ignore')
            else:
                #model = gensim.models.KeyedVectors.load_word2vec_format(model_name, binary=False , unicode_errors='ignore') 
                train_model = gensim.models.KeyedVectors.load_word2vec_format('wiki.multi.'+ train_language +'.vec', binary=False , unicode_errors='ignore') 
                test_model = gensim.models.KeyedVectors.load_word2vec_format('wiki.multi.'+ test_language +'.vec', binary=False , unicode_errors='ignore') 

        if vector_type == "Word2vec":
            print "Training word2vec word vectors"
            model = word2vec.Word2Vec(word2vec_input, workers=num_workers, \
                                    size=num_features, min_count = min_word_count, \
                                    window = context, sample = downsampling)
        
            # If no further training and only query is needed, this trims unnecessary memory
            model.init_sims(replace=True)
        
            # Save the model for later use
            word_vectors = model.wv
            model.save(model_name)
        
        print "Vectorizing training review"
        train_vec = gen_review_vecs(train_list, train_model, num_features)
        if Mix == True:
            train_vec2 = gen_review_vecs(train_list2, test_model, num_features)
            train_vec = np.append(train_vec , train_vec2 , axis = 0)
            #train_vec = np.concatenate((train_vec, train_vec2) , axis = 0)

        print "Vectorizing test review"
        test_vec_c = gen_review_vecs(test_list_c,test_model, num_features)
        test_vec_h = gen_review_vecs(test_list_h,test_model, num_features)
        test_vec_t = gen_review_vecs(test_list_t,test_model, num_features)
        
        
    elif vector_type != "no": 
        if vector_type == "TFIDF":
            # Unit of gram is "word", only top 5000/10000 words are extracted
            count_vec = TfidfVectorizer(analyzer="word", max_features=10000, ngram_range=(1,2), sublinear_tf=True)
            
        elif vector_type == "Binary" or vector_type == "Int":       
            count_vec = CountVectorizer(analyzer="word", max_features=10000, \
                                        binary = (vector_type == "Binary"), \
                                        ngram_range=(1,2))
        
        # Return a scipy sparse term-document matrix
        print "Vectorizing input texts"
        train_vec = count_vec.fit_transform(train_list)
        test_vec_h = count_vec.transform(test_list_h)
        test_vec_t = count_vec.transform(test_list_t)
        test_vec_c = count_vec.transform(test_list_c)


    # Dimension Reduction
    if dim_reduce == "SVD":
        print "Performing dimension reduction"
        svd = TruncatedSVD(n_components = num_dim)
        train_vec = svd.fit_transform(train_vec)
        test_vec_h = svd.transform(test_vec_h)
        test_vec_t = svd.transform(test_vec_t)
        test_vec_c = svd.transform(test_vec_c)
        print "Explained variance ratio =", svd.explained_variance_ratio_.sum()

    elif dim_reduce == "chi2":
        print "Performing feature selection based on chi2 independence test"
        fselect = SelectKBest(chi2, k=num_dim)
        train_vec = fselect.fit_transform(train_vec, train_data.sentiment)
        test_vec = fselect.transform(test_vec)

    # Transform into numpy arrays
    if "numpy.ndarray" not in str(type(train_vec)):
        train_vec = train_vec.toarray()
        test_vec_h = test_vec_h.toarray()  
        test_vec_t = test_vec_t.toarray()  
        test_vec_c = test_vec_c.toarray()  


    # Feature Scaling
    if scaling != "no":

        if scaling == "standard":
            scaler = preprocessing.StandardScaler()
        else: 
            if scaling == "unsigned":
                scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
            elif scaling == "signed":
                scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
        
        print "Scaling vectors"
        train_vec = scaler.fit_transform(train_vec)
        test_vec_t = scaler.transform(test_vec_t)
        test_vec_h = scaler.transform(test_vec_h)
        test_vec_c = scaler.transform(test_vec_c)
        
        
    # Model training 
    if training_model == "RF" or training_model == "BT":
        
        # Initialize the Random Forest or bagged tree based on the model chosen
        rfc = RFC(n_estimators = 100, oob_score = True, \
                  max_features = (None if training_model=="BT" else "auto"))
        print "Training %s" % ("Random Forest" if training_model=="RF" else "bagged tree")
        rfc = rfc.fit(train_vec, train_data.sentiment)
        print "OOB Score =", rfc.oob_score_
        pred = rfc.predict(test_vec)
        
    elif training_model == "NB":
        nb = naive_bayes.MultinomialNB()
        cv_score = cross_val_score(nb, train_vec, train_data.sentiment, cv=10)
        print "Training Naive Bayes"
        print "CV Score = ", cv_score.mean()
        nb = nb.fit(train_vec, train_data.sentiment)
        pred = nb.predict(test_vec)
        
    elif training_model == "SVM":
        svc = svm.LinearSVC()
        #svc = svm.SVC(kernel = 'linear', probability = True)  # seems to take a very long time to train
        print 'complete 0'
        param = {'C': [1e15,1e13,1e11,1e9,1e7,1e5,1e3,1e1,1e-1,1e-3,1e-5]}
        print "Training SVM"

        

        if tune_parameter == True:
            svc = GridSearchCV(estimator=svc, param_grid = param, cv=10)

        # Wrap the SVM in CalibratedClassifierCV to enable probability estimates
        svc = CalibratedClassifierCV(svc)

        #print 'complete 1'

        sentiment_array = []
        for sent in train_data.sentiment:
            sentiment_array.append(sent)
        if Mix == True:
            for sent in train_data2.sentiment:
                sentiment_array.append(sent)

        svc = svc.fit(train_vec, sentiment_array)
        #svc = svc.fit(train_vec, train_data.sentiment)

        print 'complete 2'
        #pred_t = svc.predict(test_vec_t)
        #pred_h = svc.predict(test_vec_h)
        #pred_c = svc.predict(test_vec_c)

        #pred_proba_t = svc.predict_proba(test_vec_t)

        #pred1 = svc.predict_proba(test_vec)
        #print(pred1)
        #print(pred_proba_t)
        print('Accuracy on "cftwt.csv" dataset:')
        evaluate_on_testdata(test_vec_t, svc , ground_truth_t)
        print('Accuracy on "cfdata.csv" dataset:')
        evaluate_on_testdata(test_vec_h, svc , ground_truth_h)
        print('Accuracy on "deft.csv" dataset:')
        evaluate_on_testdata(test_vec_c, svc , ground_truth_c)
        print('training dataset is : ')
        if Mix:
            print "used Mixed datasets"
        print trainFile

        if tune_parameter == True:
            print "Optimized parameters:", svc.best_estimator_ #print the best parameter when using GridSearchCV
            print "Best CV score:", svc.best_score_

        #filename =vector_type+ 'finalized_model.pkl'
        #s = pickle.dump(svc, open(filename, 'wb'))
        
    # Output the results
    if write_to_csv:
        output = pd.DataFrame(data = {"id": test_data.id, "sentiment": pred})
        output.to_csv("data/" + vector_type +"submission.csv", index=False)
Example #11
    clean_test_reviews.append(" ".join(review_to_wordlist(review)))

# In[ ]:

vectorizer = TfidfVectorizer(min_df=2,
                             max_df=0.95,
                             max_features=200000,
                             ngram_range=(1, 4),
                             sublinear_tf=True)

vectorizer = vectorizer.fit(clean_train_reviews)
train_features = vectorizer.transform(clean_train_reviews)

test_features = vectorizer.transform(clean_test_reviews)
fselect = SelectKBest(chi2, k=10000)
train_features = fselect.fit_transform(train_features, train["Rating"])
test_features = fselect.transform(test_features)

# # Machine learning

# In[ ]:

classifiers = [
    ('RandomForestClassifierG',
     RandomForestClassifier(n_jobs=-1, criterion='gini')),
    ('RandomForestClassifierE',
     RandomForestClassifier(n_jobs=-1, criterion='entropy')),
    ('AdaBoostClassifier', AdaBoostClassifier()),
    ('ExtraTreesClassifier', ExtraTreesClassifier(n_jobs=-1)),
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
    ('LogisticRegression', LogisticRegression()),
Example #12
vectorizer = TfidfVectorizer(min_df=2,
                             max_df=0.95,
                             max_features=200000,
                             ngram_range=(1, 4),
                             sublinear_tf=True)

vectorizer = vectorizer.fit(clean_train_reviews +
                            unlabeled_clean_train_reviews)
train_data_features = vectorizer.transform(clean_train_reviews)
test_data_features = vectorizer.transform(clean_test_reviews)

print "Reducing dimension..."

from sklearn.feature_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2, k=70000)
train_data_features = fselect.fit_transform(train_data_features,
                                            train["sentiment"])
test_data_features = fselect.transform(test_data_features)

print "Training..."

model1 = MultinomialNB(alpha=0.0005)
model1.fit(train_data_features, train["sentiment"])

model2 = SGDClassifier(loss='modified_huber',
                       n_iter=5,
                       random_state=0,
                       shuffle=True)
model2.fit(train_data_features, train["sentiment"])

p1 = model1.predict_proba(test_data_features)[:, 1]
p2 = model2.predict_proba(test_data_features)[:, 1]
Example #13
    predsFM = model.predict(sparse_merge_test)
    print('[{}] Predict FM completed'.format(time.time() - start_time))
else:
    for i in range(rounds):
        model.fit(sparse_merge_train, y_train)
        predsFM = model.predict(sparse_merge_test)
        print('[{}] Iteration {}/{} -- RMSLE: {}'.format(time.time() - start_time, i + 1, rounds, rmse(predsFM, y_test)))

del model
gc.collect()
if not SUBMIT_MODE:
    print("FM_FTRL dev RMSLE:", rmse(predsFM, y_test))


fselect = SelectKBest(f_regression, k=48000)
train_features = fselect.fit_transform(sparse_merge_train, y_train)
test_features = fselect.transform(sparse_merge_test)
print('[{}] Select best completed'.format(time.time() - start_time))


del sparse_merge_train
del sparse_merge_test
gc.collect()
print('[{}] Garbage collection'.format(time.time() - start_time))


tv = TfidfVectorizer(max_features=250000,
                     ngram_range=(1, 3),
                     stop_words=None)
X_name_train = tv.fit_transform(df_train['name'])
print('[{}] Finished TFIDF vectorize `name` (1/2)'.format(time.time() - start_time))
Example #14
                                       bootstrap=False,
                                       oob_score=False,
                                       n_jobs=10,
                                       random_state=None,
                                       verbose=0,
                                       warm_start=False,
                                       class_weight=None)
    selector = RFECV(estimator, step=1, cv=5, scoring='log_loss')
    X_train = selector.fit_transform(X_train, train_labels)
    print 'after feature elimination', X_train.shape
    X_test = selector.transform(X_test)

do_feature_selection = False
if do_feature_selection:
    ch2 = SelectKBest(chi2, k=4000)
    X_train = ch2.fit_transform(X_train, train_labels)
    X_test = ch2.transform(X_test)

do_pca = False

if do_pca:
    k = 100
    add_pca_to_original = True
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    pca = PCA(n_components=k, copy=True, whiten=False)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    if add_pca_to_original:
        X_train = np.hstack((X_train, X_train_pca))
        X_test = np.hstack((X_test, X_test_pca))
Example #15
corpus = []
for question in df[0]:
    corpus.append(" ".join(text_to_wordlist(question, remove_stopwords=False)))

vectorizer = TfidfVectorizer(min_df=2,
                             max_df=0.95,
                             max_features=5000,
                             ngram_range=(1, 4),
                             sublinear_tf=True)

vectorizer = vectorizer.fit(corpus)

train_features = vectorizer.transform(corpus)

fselect = SelectKBest(chi2, k=1000)
train_features = fselect.fit_transform(train_features, cat)

# using XGboost to train our model
params = {
    'objective': 'multi:softmax',
    'eval_metric': 'merror',
    'eta': 0.025,
    'max_depth': 9,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 5,
    'num_class': 5,
    'silent': 1
}

import xgboost as xgb
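The snippet defines the booster parameters but stops before training; a sketch of the training call, assuming `cat` holds integer class labels 0-4 (matching num_class=5):

# Sketch of the training step (assumes `cat` contains integer labels 0-4
# and `train_features` is the selected sparse matrix from above).
dtrain = xgb.DMatrix(train_features, label=cat)
bst = xgb.train(params, dtrain, num_boost_round=200)  # round count is a guess
pred = bst.predict(dtrain)  # multi:softmax returns predicted class indices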
Example #16
	idx_start += N_test
 
print X_train.shape, y_train.shape
print X_test.shape, y_test.shape

print "start classification"

# vectorization
vectorizer = TfidfVectorizer(strip_accents="unicode", ngram_range=(1,1))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# feature reduction
ch2 = SelectKBest(chi2, k="all")
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

# training
clf = LinearSVC()
clf.fit(X_train, y_train)

if validation_mode == "train":
	X_test = X_train
	y_test = y_train

# predict categories
predicted = clf.predict(X_test)

print numpy.mean(predicted == y_test)
print metrics.classification_report(y_test, predicted)
Example #17
    train_vec = count_vec.fit_transform(train_list)
    test_vec = count_vec.transform(test_list)


# Dimension Reduction
if dim_reduce == "SVD":
    print "Performing dimension reduction"
    svd = TruncatedSVD(n_components = num_dim)
    train_vec = svd.fit_transform(train_vec)
    test_vec = svd.transform(test_vec)
    print "Explained variance ratio =", svd.explained_variance_ratio_.sum()

elif dim_reduce == "chi2":
    print "Performing feature selection based on chi2 independence test"
    fselect = SelectKBest(chi2, k=num_dim)
    train_vec = fselect.fit_transform(train_vec, train_data.sentiment)
    test_vec = fselect.transform(test_vec)

# Transform into numpy arrays
if "numpy.ndarray" not in str(type(train_vec)):
    train_vec = train_vec.toarray()
    test_vec = test_vec.toarray()  


# Feature Scaling
if scaling != "no":

    if scaling == "standard":
        scaler = preprocessing.StandardScaler()
    else: 
        if scaling == "unsigned":
Example #18
    test_vec = count_vec.transform(test_list)

# Dimension Reduction
print("Start Dimension Reduction...")

if dim_reduce == "SVD":
    print("Performing dimension reduction")
    svd = TruncatedSVD(n_components=num_dim)
    train_vec = svd.fit_transform(train_vec)
    test_vec = svd.transform(test_vec)
    print("Explained variance ratio =", svd.explained_variance_ratio_.sum())

elif dim_reduce == "chi2":
    print("Performing feature selection based on chi2 independence test")
    fselect = SelectKBest(chi2, k=num_dim)
    train_vec = fselect.fit_transform(train_vec, train_data_y)
    test_vec = fselect.transform(test_vec)

# Transform into numpy arrays
if "numpy.ndarray" not in str(type(train_vec)):
    train_vec = train_vec.toarray()
    test_vec = test_vec.toarray()

# Feature Scaling
if scaling != "no":

    if scaling == "standard":
        scaler = preprocessing.StandardScaler()
    else:
        if scaling == "unsigned":
            scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
Example #19
    print_step('Importing Data 3/13')
    tfidf_train2, tfidf_test2 = load_cache('text_tfidf')

    print_step('Importing Data 4/13')
    tfidf_train3, tfidf_test3 = load_cache('text_char_tfidf')

    print_step('Importing Data 5/13')
    train = hstack((tfidf_train2, tfidf_train3)).tocsr()
    print_step('Importing Data 6/13')
    test = hstack((tfidf_test2, tfidf_test3)).tocsr()
    print(train.shape)
    print(test.shape)

    print_step('SelectKBest 1/2')
    fselect = SelectKBest(f_regression, k=100000)
    train = fselect.fit_transform(train, target)
    print_step('SelectKBest 2/2')
    test = fselect.transform(test)
    print(train.shape)
    print(test.shape)

    print_step('Importing Data 7/13')
    train = hstack((tfidf_train, train)).tocsr()
    print_step('Importing Data 8/13')
    test = hstack((tfidf_test, test)).tocsr()
    print(train.shape)
    print(test.shape)

    print_step('GC')
    del tfidf_test
    del tfidf_test2
Example #20
def dimensionality_reduction(train_vec, test_vec, y_train_data):
    print("Performing feature selection based on chi2 independence test")
    fselect = SelectKBest(chi2, k=4500)
    train_vec = fselect.fit_transform(train_vec, y_train_data)
    test_vec = fselect.transform(test_vec)
    return train_vec, test_vec
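A minimal call sketch; the hard-coded k=4500 means the inputs must have at least 4500 columns, so the toy matrices below are deliberately wide.

# Toy sketch with synthetic sparse data wide enough for the fixed k=4500.
import numpy as np
import scipy.sparse as sp

rng = np.random.RandomState(0)
train_vec = sp.random(200, 5000, density=0.2, random_state=rng)  # non-negative values
test_vec = sp.random(50, 5000, density=0.2, random_state=rng)
y_train_data = rng.randint(0, 2, size=200)

train_vec, test_vec = dimensionality_reduction(train_vec, test_vec, y_train_data)
print(train_vec.shape, test_vec.shape)  # (200, 4500) (50, 4500)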
Example #21
    print("Vectorizing input texts")
    train_vec = count_vec.fit_transform(train_list)
    test_vec = count_vec.transform(test_list)

# Dimension Reduction
if dim_reduce == "SVD":
    print("performing dimension reduction")
    svd = TruncatedSVD(n_components=num_dim)
    train_vec = svd.fit_transform(train_vec)
    test_vec = svd.transform(test_vec)
    print("Explained variance ratio =", svd.explained_variance_ratio_.sum())

elif dim_reduce == "chi2":
    print("performing feature selection based on chi2 independce test")
    fselect = SelectKBest(chi2, k=num_dim)
    train_vec = fselect.fit_transform(train_vec, train_data.sentiment)
    test_vec = fselect.transform(test_vec)

# Transform into numpy arrays
if "numpy.ndarray" not in str(type(train_vec)):
    train_vec = train_vec.toarray()
    test_vec = test_vec.toarray()

# Feature Scaling
if scaling != "no":
    if scaler == "standard":
        scaler = preprocessing.StandardScaler()
    else:
        if scaling == "unsigned":
            scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
        elif scaling == "signed":
Example #22
from sklearn import datasets

from sklearn.feature_selection import SelectKBest, chi2

iris = datasets.load_iris()

k_best0 = SelectKBest(score_func=chi2, k=2)
fit = k_best0.fit(iris.data, iris.target)
print(fit.scores_)

features = fit.transform(iris.data)
print(features)

k_best1 = SelectKBest(score_func=chi2, k=4)
newX = k_best1.fit_transform(iris.data, iris.target)
print(newX)
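As a follow-up, get_support() maps the kept columns back onto the iris feature names:

# Follow-up sketch: recover which of the four iris features were kept.
import numpy as np

mask = k_best0.get_support()               # boolean mask over the 4 columns
print(np.array(iris.feature_names)[mask])  # names of the 2 selected features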