def getData(train_path, test_path, typenum=0):
    """Load train/test data and build three feature representations.

    Parameters
    ----------
    train_path, test_path : str
        Paths to the JSON data files, parsed by dp.form_matrix.
    typenum : int
        Forwarded to dp.form_matrix as its `type` argument.

    Returns
    -------
    (train_X, test_X, train_y, test_y), where train_X / test_X are lists of
    four matrices: [numeric features, tag counts, tf-idf text, all stacked].
    """
    start_time = time.time()
    train1, train2, train3 = dp.form_matrix(train_path, type=typenum)
    test1, test2, test3 = dp.form_matrix(test_path, type=typenum)

    # Dataset 1: numeric row features (row[1]) with column index 1 dropped;
    # labels live in row[0].
    train_X1 = np.array([row[1][0:1] + row[1][2:] for row in train1])
    test_X1 = np.array([row[1][0:1] + row[1][2:] for row in test1])
    train_y = np.array([row[0] for row in train1])
    test_y = np.array([row[0] for row in test1])
    print("---Finish loading the first data")

    # Dataset 2: bag-of-tags occurrence counts, vectorized into a dense matrix.
    dicVectorizer = DictVectorizer(sparse=False)
    train_X2 = dicVectorizer.fit_transform(_tag_counts(train2))
    test_X2 = dicVectorizer.transform(_tag_counts(test2))
    print("---Finish loading the second data")

    # Dataset 3: tf-idf over lemmatized uni/bi-grams, chi^2-reduced to 20000 dims.
    train_text, train_y3 = getTextAndLabel(train3)
    test_text, test_y3 = getTextAndLabel(test3)
    vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',
                                 lowercase=True, sublinear_tf=True,
                                 tokenizer=LemmaTokenizer(), ngram_range=(1, 2))
    vectorizer.fit(train_text)
    train_X3 = vectorizer.transform(train_text)
    test_X3 = vectorizer.transform(test_text)
    # Fixed: was a Python 2 `print` statement; parenthesized for consistency
    # with every other print call in this function.
    print(train_X3.shape)
    ch2, train_X3, test_X3 = fs.chisq(train_X3, train_y3, test_X3, 20000)

    # Dense horizontal stack of all three feature sets.
    train_Xtot = np.hstack((train_X1, train_X2, train_X3.toarray()))
    test_Xtot = np.hstack((test_X1, test_X2, test_X3.toarray()))
    print("---Finish loading the third data")
    print("Finish loading data : --- %s seconds ---" % (time.time() - start_time))

    train_X = [train_X1, train_X2, train_X3, train_Xtot]
    test_X = [test_X1, test_X2, test_X3, test_Xtot]
    return train_X, test_X, train_y, test_y


def _tag_counts(rows):
    """Return one {tag: occurrence count} dict per row (tags are in row[1])."""
    counts = []
    for item in rows:
        dic = {}
        for tag in item[1]:
            dic[tag] = dic.get(tag, 0) + 1
        counts.append(dic)
    return counts
class LemmaTokenizer(object):
    """Callable tokenizer for TfidfVectorizer: word-tokenize, then WordNet-lemmatize."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


# train: category, text
# text: id, text
train_path = "./data/data2/train.json"
test_path = "./data/data2/test.json"
train = dp.form_matrix(train_path, type=1)
test = dp.form_matrix(test_path, type=1)
train_text, train_y = misc.getTextAndLabel(train)
test_text, test_y = misc.getTextAndLabel(test)

##### vectorization
# Fixed: the two earlier TfidfVectorizer assignments were dead code — each was
# immediately overwritten, so only the last configuration ever took effect.
# Kept here as reference alongside the originally-commented variant:
# vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', sublinear_tf=True)
# vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', lowercase=True, sublinear_tf=True, tokenizer=LemmaTokenizer())
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',
                             lowercase=True, sublinear_tf=True,
                             tokenizer=LemmaTokenizer(), ngram_range=(1, 2))
vectorizer.fit(train_text)
train_X = vectorizer.transform(train_text)
test_X = vectorizer.transform(test_text)
class LemmaTokenizer(object):
    # Callable tokenizer for use as TfidfVectorizer's `tokenizer`:
    # word-tokenize the document, then WordNet-lemmatize each token.
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
# train: category, text
# text: id, text
train_path = "./data/data2/train.json"
test_path = "./data/data2/test.json"
train = dp.form_matrix(train_path, type=2)
test = dp.form_matrix(test_path, type=2)
# Flatten each row into [label, tags...] and dump both splits to CSV.
# NOTE(review): "wb" file mode for csv.writer implies Python 2; under
# Python 3 this should be open(..., "w", newline="") — confirm interpreter.
train1 = [[row[0]] + row[1] for row in train]
writer = csv.writer(open("train1.csv", "wb"))
writer.writerows(train1)
test1 = [[row[0]] + row[1] for row in test]
writer = csv.writer(open("test1.csv", "wb"))
writer.writerows(test1)
train_text, train_y = misc.getTextAndLabel(train)
test_text, test_y = misc.getTextAndLabel(test)
for i, item in enumerate(train):
# NOTE(review): the body of this loop continues beyond this chunk of the file.
from sklearn import cross_validation from sklearn.grid_search import GridSearchCV from sklearn.metrics import f1_score from sklearn import svm import numpy as np import csv import data_process as dp from scipy import sparse from sklearn.feature_extraction import DictVectorizer from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import Imputer train_path = "./data/train.json" test_path = "./data/test.json" train1, train2, trian3 = dp.form_matrix(train_path, type=0) teat1, test2, test3 = dp.form_matrix(test_path, type=0) train_X1 = [ row[1][0:1]+row[1][2:] for row in train1] train_y = [ row[0] for row in train1] test_X1 = [ row[1][0:1]+row[1][2:] for row in test1] test_y1 = [ row[0] for row in test1] train_X2 = [] for item in train2: dic = {} for tag in item[1]: if tag not in dic: dic[tag] = 1 else:
import data_process as dp
import json
import random
import numpy
import time
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Rows come back as (label, tag-list); build bag-of-tags count features.
mtx = dp.form_matrix('./data/train.json', type=2)
X_train = []
y_train = []
for item in mtx:
    # Counter produces exactly the {tag: count} mapping the manual loop built
    # (it is a dict subclass, so DictVectorizer consumes it unchanged).
    X_train.append(Counter(item[1]))
    y_train.append(item[0])

v = DictVectorizer(sparse=False)
X_train = v.fit_transform(X_train)
#############SVM : 0.71
import data_process as dp
import json
import random
import numpy
import time
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Rows come back as (label, tag-list); build bag-of-tags count features.
mtx = dp.form_matrix('./data/train.json', type=2)
X_train = []
y_train = []
for item in mtx:
    # dict.get with a default collapses the if/else membership test into one
    # lookup — same {tag: count} mapping as the original loop.
    dic = {}
    for tag in item[1]:
        dic[tag] = dic.get(tag, 0) + 1
    X_train.append(dic)
    y_train.append(item[0])

v = DictVectorizer(sparse=False)
X_train = v.fit_transform(X_train)