def x_2func(traindir, testdir, top_k=100, num_classes=10):
    """Pick the highest-scoring tokens of every category from the stored 'token_x' table.

    Reads the per-category score table saved under 'token_x', keeps the
    ``top_k`` best-scoring tokens of each category, and stores both the
    per-category lists ('x_category') and their union ('tokens_all_x')
    through ``nb.write``.

    Args:
        traindir: Unused; kept so existing callers keep working.
        testdir: Unused; kept so existing callers keep working.
        top_k: Number of tokens kept per category (previously fixed at 100).
        num_classes: Number of category indices, 0 .. num_classes - 1
            (previously fixed at 10).
    """
    # The 'token_x' table comes from a one-off step that is currently disabled:
    #     train = nb.read('train_nb_eventmodel')
    #     sta = nb.sta_count(train)
    #     category_tokens = get_category_tokens()
    #     token_x = x_2(train, sta, category_tokens)
    #     nb.write(token_x, 'token_x')
    tokens_x = nb.read('token_x')

    x_category = {}
    tokens_all_x = set()
    for i in range(num_classes):
        # Token names ordered from highest to lowest score for this category.
        ranked = sorted(tokens_x[i], key=tokens_x[i].get, reverse=True)
        x_category[i] = ranked[:top_k]
        # Disabled alternative to the fixed top_k cut: walk `ranked` and keep
        # every token whose score is > 10.83, stopping at the first one below.
        # NOTE(review): 10.83 is presumably a chi-square critical value - confirm.
        print(len(x_category[i]))
        tokens_all_x.update(x_category[i])

    print(len(tokens_all_x))
    nb.write(x_category, 'x_category')
    nb.write(tokens_all_x, 'tokens_all_x')
def logistic_l1():
    """Fit scikit-learn logistic regression on the full token set and score the test set.

    Feature vectors are built by ``vec.func2`` (training) and ``vec.func3``
    (test) from the token list stored under 'tokens'. The predictions are
    paired with the test file names and passed to ``nb.sta_result`` together
    with the converted 'category_nb_eventmodel' mapping, the stored 'result'
    object, and the output path './data/logistic_l1.csv'.

    This function used to be defined twice in a row with identical logic;
    only one definition is kept.
    """
    traindir = './data/training'
    testdir = './data/test'
    tokens = list(nb.read('tokens'))

    # The third value returned by vec.func2 is not needed here.
    train_x, train_y, _ = vec.func2(traindir, tokens)
    train_x = np.array(train_x)
    train_y = np.array(train_y)
    print(train_x.shape)

    # NOTE(review): the function name and output file say "l1", but the model
    # is built with the L2 penalty - confirm which one is intended.
    clf = LogisticRegression(penalty='l2')
    clf.fit(train_x, train_y)

    # The first value returned by vec.func3 is discarded; the 'result' object
    # used for evaluation is read from storage below.
    _, test_x, test_file = vec.func3(testdir, tokens, nb.read('category'))
    test_x = np.array(test_x)
    print(test_x.shape)

    # One row per test document: (file name, predicted label).
    predict = np.column_stack((np.array(test_file),
                               np.array(clf.predict(test_x))))

    category_convert = nb.convert(nb.read('category_nb_eventmodel'))
    result = nb.read('result')
    path = './data/logistic_l1.csv'
    nb.sta_result(predict, category_convert, result, path)
def logistic_own(num_classes=10, alpha=0.0001, max_iter=100):
    """Hand-written one-vs-rest logistic regression trained by batch gradient descent.

    Loads 'train_x', 'train_y', 'test_x' and 'test_file' through ``nb.read``,
    trains one binary classifier per class label, and assigns every test
    sample to the class whose classifier gives it the highest linear score.
    The (file name, predicted label) pairs are then passed to
    ``nb.sta_result``.

    The previous version overwrote a sample's label whenever any classifier
    scored it above zero, so the last positive class always won and samples
    with no positive score silently fell back to class 0. Taking the argmax
    over all per-class scores fixes both problems.

    This function used to be defined twice in a row with identical logic;
    only one definition is kept.

    Args:
        num_classes: Number of class labels, 0 .. num_classes - 1
            (previously fixed at 10).
        alpha: Gradient-descent step size (previously fixed at 0.0001).
        max_iter: Gradient-descent iterations per classifier (previously
            fixed at 100).
    """
    train_x = nb.read('train_x')
    train_y = nb.read('train_y')
    test_x = nb.read('test_x')
    test_file = nb.read('test_file')

    m, n = train_x.shape
    # Prepend a column of ones so weight[0] plays the role of the intercept.
    train_x = np.mat(np.column_stack((np.ones((m, 1)), train_x)))
    test_x = np.mat(np.column_stack((np.ones((len(test_x), 1)), test_x)))
    train_y = np.mat(train_y).transpose()

    # scores[s, c] = linear score of test sample s under the classifier for class c.
    scores = np.zeros((test_x.shape[0], num_classes))

    # Multi-class prediction is implemented with several binary classifiers.
    for label in range(num_classes):
        # 1 where the training label equals `label`, 0 elsewhere.
        binary_y = np.mat((train_y == label).astype(int))
        weight = np.mat(np.ones((n + 1, 1)))
        for _ in range(max_iter):
            h = sigmoid(train_x * weight)
            # The cost cannot be computed by plugging straight into
            #   1.0/m * (-y' * log2(h) - (1 - y)' * log2(1 - h));
            # special cases have to be checked, hence the calj helper.
            # NOTE(review): the cost is not used afterwards; the call is kept
            # because calj is defined outside this view.
            cost = calj(binary_y, h, m)
            error = h - binary_y
            weight -= alpha * (train_x.transpose() * error)
        scores[:, label] = np.asarray(test_x * weight).ravel()

    predict = scores.argmax(axis=1).astype(int)
    # One row per test document: (file name, predicted label).
    predict = np.column_stack((np.array(test_file), predict))

    category_convert = nb.convert(nb.read('category_nb_eventmodel'))
    result = nb.read('result')
    # NOTE(review): same output file as the other logistic_* functions in this
    # file, so each run replaces the previous report - confirm this is intended.
    path = './data/logistic_l1.csv'
    nb.sta_result(predict, category_convert, result, path)
def logistic_x():
    """Fit L1-penalised scikit-learn logistic regression on the data from ``preprocess()``.

    The predictions are paired with the test file names and passed to
    ``nb.sta_result`` together with the converted 'category_nb_eventmodel'
    mapping, the stored 'result' object, and the output path.

    This function used to be defined twice in a row with identical logic;
    only one definition is kept.
    """
    # The third and fourth values returned by preprocess() are not needed:
    # the category mapping and 'result' object are read from storage below.
    train_x, train_y, _, _, test_x, test_file = preprocess()

    # liblinear is named explicitly: it supports the L1 penalty, whereas newer
    # scikit-learn releases default to a solver that rejects penalty='l1'.
    clf = LogisticRegression(penalty='l1', solver='liblinear')
    clf.fit(train_x, train_y)

    # One row per test document: (file name, predicted label).
    predict = np.column_stack((test_file, np.array(clf.predict(test_x))))

    category_convert = nb.convert(nb.read('category_nb_eventmodel'))
    result = nb.read('result')
    # NOTE(review): same output file as the other logistic_* functions in this
    # file, so each run replaces the previous report - confirm this is intended.
    path = './data/logistic_l1.csv'
    nb.sta_result(predict, category_convert, result, path)
def sto_logistic(num_classes=10, alpha=0.001, max_iter=5000):
    """Hand-written one-vs-rest logistic regression trained by stochastic gradient descent.

    Loads 'train_x', 'train_y', 'test_x' and 'test_file' through ``nb.read``,
    trains one binary classifier per class label (each update uses a single
    randomly drawn training row), and assigns every test sample to the class
    whose classifier gives it the highest linear score. The
    (file name, predicted label) pairs are then passed to ``nb.sta_result``.

    The previous version overwrote a sample's label whenever any classifier
    scored it above zero, so the last positive class always won and samples
    with no positive score silently fell back to class 0. Taking the argmax
    over all per-class scores fixes both problems.

    This function used to be defined twice in a row with identical logic;
    only one definition is kept.

    Args:
        num_classes: Number of class labels, 0 .. num_classes - 1
            (previously fixed at 10).
        alpha: Step size of each update (previously fixed at 0.001).
        max_iter: Number of single-sample updates per classifier (previously
            fixed at 5000).
    """
    train_x = nb.read('train_x')
    train_y = nb.read('train_y')
    test_x = nb.read('test_x')
    test_file = nb.read('test_file')

    m, n = train_x.shape
    # Prepend a column of ones so weight[0] plays the role of the intercept.
    train_x = np.mat(np.column_stack((np.ones((m, 1)), train_x)))
    test_x = np.mat(np.column_stack((np.ones((len(test_x), 1)), test_x)))
    train_y = np.mat(train_y).transpose()

    # scores[s, c] = linear score of test sample s under the classifier for class c.
    scores = np.zeros((test_x.shape[0], num_classes))

    # Multi-class prediction is implemented with several binary classifiers.
    for label in range(num_classes):
        # 1 where the training label equals `label`, 0 elsewhere.
        binary_y = np.mat((train_y == label).astype(int))
        weight = np.mat(np.ones((n + 1, 1)))
        for _ in range(max_iter):
            row = random.randrange(m)
            sample = train_x[row]
            error = sigmoid(sample * weight) - binary_y[row]
            weight -= alpha * (sample.transpose() * error)
        scores[:, label] = np.asarray(test_x * weight).ravel()

    predict = scores.argmax(axis=1).astype(int)
    # One row per test document: (file name, predicted label).
    predict = np.column_stack((np.array(test_file), predict))

    category_convert = nb.convert(nb.read('category_nb_eventmodel'))
    result = nb.read('result')
    # NOTE(review): same output file as the other logistic_* functions in this
    # file, so each run replaces the previous report - confirm this is intended.
    path = './data/logistic_l1.csv'
    nb.sta_result(predict, category_convert, result, path)
def gifunc(top_k=100, num_classes=10):
    """Score all tokens with ``gi`` and store the ``top_k`` best ones.

    Recomputes the token scores from the 'train_nb_eventmodel' data, saves
    them under 'tokens_gi', then stores the selected tokens per category
    ('gi_category') and as one set ('tokens_all_gi') through ``nb.write``.

    Args:
        top_k: Number of tokens kept (previously fixed at 100).
        num_classes: Number of category indices, 0 .. num_classes - 1
            (previously fixed at 10).
    """
    train = nb.read('train_nb_eventmodel')
    sta = nb.sta_count(train)
    all_tokens = get_all_tokens()
    nb.write(gi(train, sta, all_tokens), 'tokens_gi')

    # Read the table back, as before, so the selection works on the stored form.
    tokens_gi = nb.read('tokens_gi')

    # The ranking does not depend on the category index, so it is computed
    # once instead of once per category.
    ranked = sorted(tokens_gi, key=tokens_gi.get, reverse=True)
    top_tokens = ranked[:top_k]

    # NOTE(review): every category therefore receives the same token list;
    # confirm the scores are meant to be global and not indexed per category.
    gi_category = {}
    tokens_all_gi = set()
    for i in range(num_classes):
        gi_category[i] = list(top_tokens)
        tokens_all_gi.update(gi_category[i])

    print(tokens_all_gi)
    nb.write(gi_category, 'gi_category')
    nb.write(tokens_all_gi, 'tokens_all_gi')
def dffunc(top_k=200, num_classes=10):
    """Store the ``top_k`` best tokens from the saved 'tokens_df' score table.

    Reads the table saved under 'tokens_df', ranks its tokens by score, and
    stores the selection per category ('df_category') and as one set
    ('tokens_all_df') through ``nb.write``.

    Args:
        top_k: Number of tokens kept (previously fixed at 200).
        num_classes: Number of category indices, 0 .. num_classes - 1
            (previously fixed at 10).
    """
    # Document frequency is used here first; it needs, for a given category,
    # the number of documents in which term t occurs.
    #
    # The 'tokens_df' table comes from a one-off step that is currently disabled:
    #     train = nb.read('train_nb_eventmodel')
    #     sta = nb.sta_count(train)
    #     all_tokens = get_all_tokens()
    #     tokens_df = df(train, sta, all_tokens)
    #     nb.write(tokens_df, 'tokens_df')
    tokens_df = nb.read('tokens_df')

    # The ranking does not depend on the category index, so it is computed
    # once instead of once per category.
    ranked = sorted(tokens_df, key=tokens_df.get, reverse=True)
    top_tokens = ranked[:top_k]

    # NOTE(review): every category therefore receives the same token list,
    # although the comment above talks about per-category counts - confirm
    # whether 'tokens_df' should be indexed by category.
    df_category = {}
    tokens_all_df = set()
    for i in range(num_classes):
        df_category[i] = list(top_tokens)
        tokens_all_df.update(df_category[i])

    print(tokens_all_df)
    nb.write(df_category, 'df_category')
    nb.write(tokens_all_df, 'tokens_all_df')
def mi_func(): ''' train = nb.read('train_nb_eventmodel') sta = nb.sta_count(train) category_tokens = get_category_tokens() token_mi = mi(train,sta,category_tokens) nb.write(token_mi,'token_mi') ''' category = nb.read('category_nb_eventmodel') token_mi = nb.read('token_mi') category_convert = nb.convert(category) tokens_all_mi = [] mi_category = {} for i in range(10): tokens = sorted(token_mi[i],key = token_mi[i].get,reverse = True) mi_category[i] = tokens[:500] tokens_all_mi = set(tokens_all_mi)|set(mi_category[i]) print len(tokens_all_mi) nb.write(mi_category,'mi_category') nb.write(tokens_all_mi,'tokens_all_mi') '''