Example #1
# Imports needed by this snippet. dp (data_process), fs (a feature-selection
# helper), getTextAndLabel, and LemmaTokenizer are project-local; sketches of
# the helpers appear after the later examples.
import time
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import data_process as dp

def getData(train_path, test_path, typenum=0):
   start_time = time.time()
   train1, train2, train3 = dp.form_matrix(train_path, type=typenum)
   test1, test2, test3 = dp.form_matrix(test_path, type=typenum)
   
   ## dataset 1
   train_X1 = np.array([ row[1][0:1]+row[1][2:]  for row in train1])
   test_X1 = np.array([ row[1][0:1]+row[1][2:]  for row in test1])
   train_y = np.array([ row[0]  for row in train1])
   test_y = np.array([ row[0]  for row in test1])
   print("---Finish loading the first data")

   ## dataset 2
   train_X2 = []
   test_X2 = []
   for item in train2:
      dic = {}
      for tag in item[1]:
         dic[tag] = 1 if  tag not in dic else dic[tag]+1
      train_X2.append(dic)

   for item in test2:
      dic = {}
      for tag in item[1]:
         dic[tag] = 1 if  tag not in dic else dic[tag]+1
      test_X2.append(dic)

   dicVectorizer = DictVectorizer(sparse=False)
   train_X2 = dicVectorizer.fit_transform(train_X2)
   test_X2 = dicVectorizer.transform(test_X2)
   print("---Finish loading the second data")
   ## dataset 3
   train_text, train_y3 = getTextAndLabel(train3)
   test_text, test_y3 = getTextAndLabel(test3)
   vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', lowercase=True, sublinear_tf=True, tokenizer=LemmaTokenizer(), ngram_range=(1,2))

   vectorizer.fit(train_text)
   train_X3 = vectorizer.transform(train_text)
   test_X3 = vectorizer.transform(test_text)
   print(train_X3.shape)
   ch2, train_X3, test_X3 = fs.chisq(train_X3, train_y3, test_X3, 20000)
   train_Xtot = np.hstack((train_X1, train_X2, train_X3.toarray()))
   test_Xtot = np.hstack((test_X1, test_X2, test_X3.toarray()))
   print("---Finish loading the third data")
   print("Finish loading data : --- %s seconds ---" % (time.time() - start_time))

   train_X = [train_X1, train_X2, train_X3, train_Xtot]
   test_X = [test_X1, test_X2, test_X3, test_Xtot]
   return train_X, test_X, train_y, test_y
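fs.chisq is a project-local feature-selection helper whose source is not shown in these examples. Judging from its call signature and the sparse matrices it returns (they are densified with .toarray() above), it plausibly wraps scikit-learn's SelectKBest with the chi2 score function. A minimal sketch under that assumption:

# Hypothetical sketch of the project-local fs.chisq helper (the real
# implementation is not shown here): keep the k features with the highest
# chi-squared scores, fitting on the training split only.
from sklearn.feature_selection import SelectKBest, chi2

def chisq(train_X, train_y, test_X, k):
    ch2 = SelectKBest(chi2, k=k)
    train_X = ch2.fit_transform(train_X, train_y)
    test_X = ch2.transform(test_X)
    return ch2, train_X, test_X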
Example #2
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import data_process as dp
import misc  # project-local: provides getTextAndLabel (sketched below)


class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]



# train: category, text
# text: id, text

train_path = "./data/data2/train.json"
test_path = "./data/data2/test.json"

train = dp.form_matrix(train_path, type=1)
test = dp.form_matrix(test_path, type=1)


train_text, train_y = misc.getTextAndLabel(train)
test_text, test_y = misc.getTextAndLabel(test)



##### vectorization
# vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', sublinear_tf=True)
# vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', lowercase=True, sublinear_tf=True, tokenizer=LemmaTokenizer())
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', lowercase=True, sublinear_tf=True, tokenizer=LemmaTokenizer(), ngram_range=(1,2))
vectorizer.fit(train_text)
train_X = vectorizer.transform(train_text)
test_X = vectorizer.transform(test_text)
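On a toy two-document corpus (my own illustration, not part of the original dataset), the lemmatizing tokenizer maps inflected forms such as "apples" onto the same feature as "apple" before TF-IDF weighting. Note that word_tokenize and WordNetLemmatizer need the NLTK tokenizer and WordNet data packages to be downloaded first:

# Toy illustration (not from the original data); requires
# nltk.download('punkt') and nltk.download('wordnet') beforehand.
docs = ["I like apples", "an apple a day"]
demo = TfidfVectorizer(tokenizer=LemmaTokenizer(), lowercase=True)
demo.fit(docs)
print(demo.get_feature_names_out())  # 'apples' and 'apple' share one column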
Example #3
import csv
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import data_process as dp
import misc  # project-local: provides getTextAndLabel (sketched below)


class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


# train: category, text
# text: id, text

train_path = "./data/data2/train.json"
test_path = "./data/data2/test.json"

train = dp.form_matrix(train_path, type=2)
test = dp.form_matrix(test_path, type=2)

train1 = [[row[0]] + row[1] for row in train]

writer = csv.writer(open("train1.csv", "wb"))
writer.writerows(train1)

test1 = [[row[0]] + row[1] for row in test]
writer = csv.writer(open("test1.csv", "wb"))
writer.writerows(test1)

train_text, train_y = misc.getTextAndLabel(train)
test_text, test_y = misc.getTextAndLabel(test)
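misc.getTextAndLabel is another project-local helper that is not shown in these examples. Given the "# train: category, text" row layout noted above, it presumably just splits each row into its text and its label; a minimal sketch under that assumption:

# Hypothetical sketch of misc.getTextAndLabel; assumes each row is
# (category, text) as the comment above describes.
def getTextAndLabel(rows):
    texts = [row[1] for row in rows]
    labels = [row[0] for row in rows]
    return texts, labels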

for i, item in enumerate(train):
    pass  # the original snippet is truncated here

Example #4
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn import svm
import numpy as np
import csv
import data_process as dp
from scipy import sparse
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

train_path = "./data/train.json"
test_path = "./data/test.json"

train1, train2, train3 = dp.form_matrix(train_path, type=0)
test1, test2, test3 = dp.form_matrix(test_path, type=0)

train_X1 = [ row[1][0:1]+row[1][2:]  for row in train1]
train_y = [ row[0]  for row in train1]
test_X1 = [ row[1][0:1]+row[1][2:]  for row in test1]
test_y = [ row[0]  for row in test1]


train_X2 = []
for item in train2:
    dic = {}
    for tag in item[1]:
        if tag not in dic:
            dic[tag] = 1
        else:
            dic[tag] += 1
    train_X2.append(dic)
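The manual counting loop here (and in the next example) builds a tag-frequency dict per record. collections.Counter from the standard library does the same thing more idiomatically, and DictVectorizer accepts the resulting mappings directly:

# Equivalent, more idiomatic tag counting with the standard library.
from collections import Counter

train_X2 = [dict(Counter(item[1])) for item in train2]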
Example #5
import data_process as dp
import json
import random
import numpy
import time
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

mtx = dp.form_matrix('./data/train.json', type=2)
X_train = []
y_train = []
for item in mtx:
    dic = {}
    for tag in item[1]:
        if tag not in dic:
            dic[tag] = 1
        else:
            dic[tag] += 1
    X_train.append(dic)
    y_train.append(item[0])

v = DictVectorizer(sparse=False)
X_train = v.fit_transform(X_train)

#############SVM : 0.71
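The snippet stops at the "SVM : 0.71" note, so the modeling code itself is missing. Given the imports above, a plausible (hypothetical) continuation is a cross-validated linear SVM; the 0.71 presumably came from a run of this kind, but the sketch below is a reconstruction, not the original code:

# Hypothetical continuation matching the imports above: 5-fold
# cross-validated accuracy for a linear SVM on the vectorized tags.
clf = svm.LinearSVC()
scores = cross_val_score(clf, X_train, y_train, cv=5)
print("SVM accuracy: %.2f" % scores.mean())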