def main(): parser = OptionParser() parser.add_option("-f", "--filename", type="string", dest="file", default="DataBaseDB", help="NEED CORRECT PARAMETERS") (options, args) = parser.parse_args() if options.file == "error": print "hello world" sys.exit(1) task_file = options.file t_path = config("../conf/dp.conf") data_path = t_path["data_path"] L_list = [] task_list, c_dict = get_task(data_path) item_list, l_dict = c_to_l(c_dict, min_sup, 1) i_len = 1 while l_dict != {}: L_list.append(l_dict) i_len += 1 com_list = list(itertools.combinations(item_list, i_len)) l_dict = get_ldict(task_list, com_list, min_sup, i_len) get_rules(L_list)
def main(): parser = OptionParser() parser.add_option("-f","--filename",type="string",dest="file",default="DataBaseDB",help="NEED CORRECT PARAMETERS") (options,args) = parser.parse_args() if options.file == "error": print "hello world" sys.exit(1) task_file = options.file t_path = config("../conf/dp.conf") data_path = t_path["data_path"] L_list = [] task_list,c_dict = get_task(data_path) item_list,l_dict = c_to_l(c_dict,min_sup,1) i_len = 1 while l_dict != {}: L_list.append(l_dict) i_len += 1 com_list = list(itertools.combinations(item_list,i_len)) l_dict = get_ldict(task_list,com_list,min_sup,i_len) get_rules(L_list)
def main(options):
    dp = config("../conf/dp.conf")
    # 1. merge
    if options.merge == True:
        print "merging the train and test sets"
        merge(dp)
    elif options.split == True:
        print "splitting the merged data"
        sp(dp, options.tp)
    else:
        print "error: no such option"
        sys.exit(1)
def main(): parser = OptionParser() parser.add_option("-m", "--model", dest="model", \ help=u"选择模型:可选择的有LR,RF,NB", metavar="your_model",default="LR") parser.add_option("-t","--tokenize",dest="tokenize",action="store_true",\ help=u"选择是否进行tokenize,tokenize会得到稍微高一点的准确率,但是效率会慢很多,默认是true",\ metavar="your_tokenize",default=False) parser.add_option("-n","--nontext",dest="nontext",action="store_true",\ help=u"选择是否利用非文本特征,默认是false",default=False) parser.add_option("-l","--LSA",dest="LSA",action="store_true",\ help=u"选择是否LSA,注意当选用非LR模型的时候,LSA是必须默认开着的,这个在后来我会强制一下逻辑,现在没写",\ default=False) parser.add_option("-s","--fselect",dest="fs",action="store_true", help=u"选择是否进行特征选择,默认是否,加上-s后会进行选择",default=False) parser.add_option("-p","--topic",dest="topic",action="store_true",\ help=u"选择是否读取主题分布,默认是否,加上-tp后会进行读取",default=False) parser.add_option("-c","--combine",dest="combine",action="store_true",\ help=u"选择是否进行模型融合,默认是否",default=False) (options, args) = parser.parse_args() print options #读入配置文件 dp = config("../conf/dp.conf") #读入数据 print "读取数据集" train,test,y,label,train_nontext,test_nontext = data(dp,options.tokenize) print "train 大小",len(train) print "test 大小",len(test) print "读取主题" total_topic = topic(dp) train_topic = total_topic[:len(train)] test_topic = total_topic[len(train):] print "train 大小",len(train_topic) print "test 大小",len(test_topic) if options.combine==False: result = train_model(train,test,y,options,train_topic,test_topic,train_nontext,test_nontext) print "产生结果" gen_submission(dp,result,label) else: combine_model(dp,train,test,y,label,train_topic,test_topic,train_nontext,test_nontext)
#coding=utf-8
from word import word
from read_conf import config
from nlp import NLP
import numpy as np
import os
from sklearn import linear_model
from logistic_nd import LogisticRegression

data_conf = config('../conf/dp.conf')
tr_data_path = data_conf['train_path']
te_data_path = data_conf['test_path']
cat_dict = {'acq': 0, 'corn': 1, 'crude': 2, 'earn': 3, 'grain': 4,
            'interest': 5, 'money-fx': 6, 'ship': 7, 'trade': 8, 'wheat': 9}
nlp = NLP()

def get_doc_num(path):
    docs_dict = {'doc_sum': 0}
    doc_dir = os.listdir(path)
    for doc_cat in doc_dir:
        file_list = os.listdir(path + doc_cat)
        docs_dict[cat_dict[doc_cat]] = len(file_list)
        docs_dict['doc_sum'] += docs_dict[cat_dict[doc_cat]]
    return docs_dict

def get_voc_set():
    word_dict = {}
    word_no = 0
    doc_dir = os.listdir(tr_data_path)
    for doc_cat in doc_dir:
# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
# Automatic keyword extraction from individual documents.
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory. John Wiley and Sons, Ltd.
import re
import operator
import nltk
from read_conf import config

dp = config("../conf/dp.conf")

def is_number(s):
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False

def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    for line in open(stop_word_file):
        if line.strip()[0:1] != "#":
            for word in line.split():  # in case more than one per line
                stop_words.append(word)
    return stop_words
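# A quick usage sketch for the two helpers above. The "stopword_path" key is an assumption
# borrowed from the other scripts in this repo that read the same dp.conf; substitute the
# key your config actually uses.
stops = load_stop_words(dp["stopword_path"])
print len(stops), "stop words loaded"
print is_number("3.14"), is_number("wheat")  # -> True False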
import csv
import os

from read_conf import config

cat_dic = {'acq': 0, 'corn': 1, 'crude': 2, 'earn': 3, 'grain': 4,
           'interest': 5, 'money-fx': 6, 'ship': 7, 'trade': 8, 'wheat': 9}

t_path = config("../conf/dp.conf")
train_path = t_path["train_path"]
test_path = t_path["test_path"]
wordset_path = t_path["wordset_path"]
stopword = stop_set(t_path["stopword_path"])  # stop_set is defined elsewhere in the project
pattern = r'''[a-zA-Z]+'''

def get_num():
    num_set = {}
    doc_num = 0
    doc_dir = os.listdir(train_path)
    for dd in doc_dir:
        f_list = os.listdir(train_path + dd)
        num_set[cat_dic[dd]] = len(f_list)
#coding=utf-8
from read_conf import config
import jieba
import re
import os
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

t_path = config('../conf/dp.conf')

def get_stop_list(path):
    stop_list = []
    with open(path, 'rb') as infile:
        lines = infile.readlines()
        line_num = 1
        for line in lines:
            if line_num < 278:
                stop_list.append(line.rstrip().decode('gbk'))
                line_num += 1
            else:
                break
        infile.close()
    stop_list.append(' ')
    return stop_list

def get_word_pinyin_dict(path):
    word_dict = {}
    with open(path, 'rb') as infile:
        lines = infile.readlines()
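# A small smoke test for the loader above. The 'stopword_path' key is hypothetical here;
# any GBK-encoded stop-word file configured in dp.conf would do. The jieba call simply
# confirms the segmenter works; the sentence is just an example string.
stops = get_stop_list(t_path['stopword_path'])
print len(stops)

# jieba.cut returns a generator of unicode tokens
print "/".join(jieba.cut(u"今天天气不错")).encode("utf-8")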
#coding=utf-8
'''
My own little wrapper around the NLP bits.
'''
from nltk import regexp_tokenize
from nltk.stem import WordNetLemmatizer
#import textblob
#from textblob.tokenizers import SentenceTokenizer as sent_tok
#from textblob.tokenizers import WordTokenizer as word_tok
from read_conf import config

stopwords = open(config('../conf/dp.conf')['stopword_path'])
stopwords = stopwords.readlines()
stopwords = [item.strip() for item in stopwords]

pattern = r'''[a-zA-Z]+'''

class NLP(object):
    def __init__(self):
        #self.__wordnetlem = WordNetLemmatizer()
        #self.__stokenizer = sent_tok()
        #self.__wtokenizer = word_tok()
        self.__stopwords = set(stopwords)

    def word_tokenize(self, document):
        tokens = regexp_tokenize(document, pattern)
        tokens = [item.lower() for item in tokens]
        # filter against the stop-word set built in __init__
        tokens = [item for item in tokens if item not in self.__stopwords]
        return tokens
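# A quick check of the tokenizer above; the exact output depends on the stop-word file
# pointed to by dp.conf, and the sentence is just an example.
nlp = NLP()
print nlp.word_tokenize("The grain shipments, 100 of them, were delayed.")
# non-alphabetic tokens are dropped by the [a-zA-Z]+ pattern; common words go if they are
# in the stop list, e.g. ['grain', 'shipments', 'delayed']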
#coding: utf-8
from read_conf import config

def corpus(file_dir, name):
    f = open(file_dir)
    result = f.readlines()
    if name == "my":
        result = [i.split(":")[0] for i in result]
    else:
        result = [i.split()[0] for i in result]
    return set(result)

if __name__ == '__main__':
    conf = config("lda.conf")
    my_dir = conf["words_dir"]
    blei_dir = "/home/lavi/publishrepo/lda/ap/vocab.txt"
    my_corpus = corpus(my_dir, "my")
    blei_corpus = corpus(blei_dir, "blei")
    common = 0
    for word in my_corpus:
        if word in blei_corpus:
            common += 1
    print "my:%s" % (1.0 * common / len(my_corpus))
    print "blei:%s" % (1.0 * common / len(blei_corpus))
#coding: utf-8
'''
author: yaoming
This file is unrelated to the main algorithm; I only use it to count the distinct
school IDs and teacher IDs to see whether they are worth adding as features.
'''
from read_conf import config
import csv, sys
import numpy as np

def count_different(conf, col_num_list):
    teacher_id, school_id1, school_id2 = [], [], []
    with open(conf["project"], 'r') as pf:
        reader = csv.reader(pf)
        for line in reader:
            teacher_id.append(line[col_num_list[0]])
            school_id1.append(line[col_num_list[1]])
            school_id2.append(line[col_num_list[2]])
    print "teacher id"
    print len(set(teacher_id))
    print "school id1"
    print len(set(school_id1))
    print "school id2"
    print len(set(school_id2))

if __name__ == "__main__":
    dp_conf = config("../conf/dp.conf")
    col_list = [1, 2, 3]
    count_different(dp_conf, col_list)
'''
Loading of the data and configuration.
'''
from csv import DictReader
from read_conf import config
from item import item
from optparse import OptionParser
import unittest
import pickle
import sys

rawconf_dir = '../conf/raw_data.conf'
dbconf_dir = '../conf/db.conf'
raw_conf = config(rawconf_dir)
db_conf = config(dbconf_dir)

def get_raw_conf():
    return raw_conf

def get_db_conf():
    return db_conf

def get_one_item(op, data_path):
    infile = open(data_path, 'rb')
    for idx, row in enumerate(DictReader(infile)):
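# get_one_item walks the CSV through DictReader, which hands back one dict per row keyed
# by the header line. A tiny self-contained illustration; the column names and values here
# are made up, not from the project's data.
from csv import DictReader
from StringIO import StringIO

sample = StringIO("projectid,teacher_id\np1,t1\np2,t2\n")  # hypothetical two-column file
for idx, row in enumerate(DictReader(sample)):
    print idx, row["projectid"], row["teacher_id"]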
# coding: utf-8 """ author:yaoming 这个文件和主体算法无关,只是我用来计算学校ID和老师ID的set的个数来看是否有价值将这两条作为feature加进去 """ from read_conf import config import csv, sys import numpy as np def count_different(conf, col_num_list): teacher_id, school_id1, school_id2 = [], [], [] with open(conf["project"], "r") as pf: reader = csv.reader(pf) for line in reader: teacher_id.append(line[col_num_list[0]]) school_id1.append(line[col_num_list[1]]) school_id2.append(line[col_num_list[2]]) print "teacher id" print len(set(teacher_id)) print "school id1" print len(set(school_id1)) print "school id2" print len(set(school_id2)) if __name__ == "__main__": dp_conf = config("../conf/dp.conf") col_list = [1, 2, 3] count_different(dp_conf, col_list)
s = open(conf["trans_dir"]) t = open(conf["reduction_trans_dir"],"w") reader = csv.reader(s) a = 0 for line in reader: if a == 0: a += 1 continue if line[3] in category or line[4] in company or line[5] in brand: write_str = ','.join(line) t.write(write_str+"\n") if a % 10000 == 0: print a a += 1 if __name__ == '__main__': print "hello" data_position_conf = config("../conf/data_position.conf") offer = extract_offer(data_position_conf) reduct_transactions(data_position_conf,offer)
print "cross validation",np.mean(cross_validation.cross_val_score(clf,train,y,cv=3,scoring='roc_auc',n_jobs=3)) elif ctype == "predict": clf.fit(train,y) predict = clf.predict_proba(test)[:,1] f = open(conf["result_essay"],"w") f.write("projectid,is_exciting\n") for it in range(len(test_id)): f.write("%s,%s\n"%(test_id[it],predict[it])) if __name__ == '__main__': print "hello" #读取数据文件的conf文件,获取地址 dp = config("../conf/dp.conf") if len(sys.argv)!=2: print "usage python essay_bench.py <usage>" print "usage:split=> train test essay split" print "usage:get_y=> get well writen y" print "usage:train=> fit train and predict test" sys.exit(1) if sys.argv[1] == "split": #step 1: 先把train和test分开 #sub step 1: 先获取test文件所有的id test_id = get_test_id(dp) #sub step 2: 读取所有的essay文件,在此分割,并且将其写入train和test文件中
import pickle
import os
import nltk
import csv
from read_conf import config
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.decomposition import PCA

cat_dic = {'acq': 0, 'corn': 1, 'crude': 2, 'earn': 3, 'grain': 4,
           'interest': 5, 'money-fx': 6, 'ship': 7, 'trade': 8, 'wheat': 9}

t_path = config("../conf/dp.conf")
train_path = t_path["train_path"]
test_path = t_path["test_path"]

def handle_doc(word_set, rs_path):
    doc_dir = os.listdir(rs_path)
    doc_matrix = []
    doc_cat = []
    for docs in doc_dir:
        files = os.listdir(rs_path + docs)
        print "start to handle the --> " + docs
        for file_d in files:
            d_path = rs_path + docs + '/' + file_d  # get the single file path
            with open(d_path, 'rb') as text_file:
                str_tmp = ''
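# For orientation, this is roughly how the imported vectorizer, transformer and classifier
# are typically chained once handle_doc has finished filling doc_matrix (raw document strings)
# and doc_cat (their category labels); the parameters below are illustrative, not the
# project's actual settings.
vectorizer = CountVectorizer(stop_words='english')  # illustrative setting
counts = vectorizer.fit_transform(doc_matrix)       # term counts per document
tfidf = TfidfTransformer().fit_transform(counts)    # reweight counts by tf-idf
clf = linear_model.LogisticRegression()
clf.fit(tfidf, doc_cat)                             # train on the labelled documents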