def run_create_sentences(self, article_path, participle_title_path, sentences_path) : articles = self.read_article(article_path) titles = self.read_participle_title(participle_title_path) # remove duplications processor = Unique() indexs_unique = [titles[index]['id'] for index in processor.unique( \ [article['participle_title'] for article in titles])] indexs_dict = dict().fromkeys(set(indexs_unique)) remained_articles = [article for article in articles if article['id'] in indexs_dict] print 'remove duplications finished ...' # create sentences segmentor = ContentSegementor() sentences = list() length = len(remained_articles) - 1 for idx, article in enumerate(remained_articles) : segmented_content = segmentor.segement(article['content']) sentences.extend([[sentence] for sentence in segmented_content]) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) file_operator = TextFileOperator() file_operator.writing(sentences, sentences_path) print 'writing sentences finished ...'
def run_feature_select( self, article_market_path, pos_path, punc_path, klword_path, feature_path, feature_market_path ): loader = PickleMarket() pos_selector = selector.PosExtractor(pos_path, w=15, combined=False) token_selector = selector.TokenExtractor(punc_path) word_selector = selector.WordExtractor(klword_path, weight=1) articles = loader.load_market(article_market_path) length = len(articles) - 1 for idx, article in enumerate(articles): article["features"] = list() article["features"].extend(pos_selector.extract_feature_windows(article["participle_content"])) article["features"].extend( token_selector.extract_feature( article["title"], article["content"], article["participle_title"], article["participle_content"] ) ) article["features"].extend( word_selector.extract_feature(article["participle_title"], article["participle_content"]) ) print "finish rate is %.2f%%\r" % (100.0 * idx / length), print "finish rate is %.2f%%\r" % (100.0 * idx / length) featuresets = [pos_selector.names + token_selector.names + word_selector.names] featuresets.extend([[article["id"]] + article["features"] + [article["label"]] for article in articles]) file_operator = TextFileOperator() file_operator.writing(featuresets, feature_path) loader.dump_market(featuresets, feature_market_path) print "finish"
def read_article(self, article_path) : """ Read source article. Each row is an article. Colunm[0] is the id of article. Column[1:] is the attributes of article. """ file_operator = TextFileOperator() data_list = file_operator.reading(article_path) entry_list = data_list[0] source_list = [] length = len(data_list[1:]) - 1 for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : article = dict() article['id'] = data[0] article['url'] = data[1] article['title'] = data[2] article['content'] = data[3] article['participle_title'] = [Word(word) for word in data[4].split(' ')] article['participle_content'] = [Word(word) for word in data[5].split(' ')] article['label'] = data[6] source_list.append(article) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) return source_list
def run_convert_sentences(self, sentences_path, sentences_market_path) : file_operator = TextFileOperator() sentences = file_operator.reading(sentences_path) sentences = [[word.split('<:>')[0] for word in sentence] for sentence in sentences] loader = PickleMarket() loader.dump_market(sentences, sentences_market_path) print 'converting sentences finished ...'
def read_participle(self, articles, participle_path) : """ Read participle title. Each row is an article. Colunm[0] is the id of article. Column[1] is the word of participle title. Column[2] is the word of participle content. """ file_operator = TextFileOperator() data_list = file_operator.reading(participle_path) entry_list = data_list[0] article_dict = dict() length = len(data_list[1:]) - 1 for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : article = dict() article['id'] = data[0] article['participle_title'] = [Word(word) for word in data[1].split(' ')] article['participle_content'] = [Word(word) for word in data[2].split(' ')] article_dict[article['id']] = article if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) length = len(articles) - 1 for idx, article in enumerate(articles) : if article['id'] in article_dict : article['participle_title'] = article_dict[article['id']]['participle_title'] article['participle_content'] = article_dict[article['id']]['participle_content'] if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) return articles
def run_classify(self, train_path, test_path, train_set, test_set, output_path) : loader = PickleMarket() # read train feature_names = loader.load_market(train_path)[0] train_articles = list() for type in train_set.split('#') : path = train_path.replace(u'car', type) train_articles.extend(loader.load_market(path)[1:]) train_dataset = np.array([np.array(article[1:30000], dtype=float) for article in train_articles]) print train_dataset.shape train_label = np.array([np.array(int(article[-1])) for article in train_articles]) # read test test_articles = list() for type in test_set.split('#') : path = test_path.replace(u'car', type) test_articles.extend(loader.load_market(path)[1:]) test_dataset = np.array([np.array(article[1:30000]) for article in test_articles]) print test_dataset.shape test_label = np.array([np.array(int(article[-1])) for article in test_articles]) # train cls classifier = LrClassifier() train_dataset = classifier.normalize(train_dataset, method='mapminmax') test_dataset = classifier.normalize(test_dataset, method='mapminmax') classifier.training(train_dataset, train_label, c=10, kernel='linear') # test cls test_prob = classifier.testing(test_dataset, type='prob') test_class = classifier.testing(test_dataset, type='label') evls, fprs, tprs = classifier.evaluation(test_label, test_prob, test_class) print 'performance is', evls ftprs = [[fpr, tprs[idx]] for idx, fpr in enumerate(fprs)] file_operator = TextFileOperator() file_operator.writing(ftprs, output_path) print 'finish'
def read_sentences(self, source_path, type='all') : """ Read participle sentences. Each row is a sentence. """ file_operator = TextFileOperator() data_list = file_operator.reading(source_path) entry_list = data_list[0] sentences = list() length = len(data_list[1:]) - 1 if type == 'all' : for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : sentence = [Word(word, sp_char=':').to_string() for word in data[0].split(' ')] sentences.append(sentence) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) elif type == 'name' : for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : sentence = [Word(word, sp_char=':').name for word in data[0].split(' ')] sentences.append(sentence) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) return sentences
def read_article(self, article_path) : """ Read source article. Each row is an article. Colunm[0] is the id of article. Column[1:] is the attributes of article. """ file_operator = TextFileOperator() data_list = file_operator.reading(article_path) entry_list = data_list[0] source_list = [] length = len(data_list[1:]) - 1 for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : article = dict() article['id'] = data[0] article['url'] = data[1] article['pub_time'] = data[2] article['title'] = data[3] article['content'] = data[4] article['n_zan'] = data[5] article['n_forward'] = data[6] article['n_click'] = data[7] article['n_collect'] = data[8] article['read_time'] = data[9] article['finish_rate'] = data[10] source_list.append(article) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) return source_list
def _read_dictionary(self, pos_path):
    """ Build a lookup dict from the first column of the file at pos_path.

    Every row with at least one field contributes its first field as a key
    mapped to 0; rows without fields are ignored.
    """
    rows = TextFileOperator().reading(pos_path)
    return dict((row[0], 0) for row in rows if len(row) > 0)
def run_tag_sentences(self, tag_tree_path, sentences_market_path, tags_path, untag_sentence_path) : file_operator = TextFileOperator() sentences = [u'技能贴 | 黑色打底裤的10种正确穿搭方式', u'春季男鞋韩版潮流行英伦男士休闲鞋'] cmd_list = file_operator.reading(tag_tree_path) tag_tree = TagTree(cmd_list) robot = Robot() tags, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:1000]) print 'finish'
def run_robot(self, tag_tree_path, sentences_market_path, tags_path) : robot = Robot() loader = PickleMarket() file_operator = TextFileOperator() cmd_list = file_operator.reading(tag_tree_path) tag_tree = TagTree(cmd_list) sentences = loader.load_market(sentences_market_path) tags = loader.load_market(tags_path) print 'start' string = raw_input().decode('gb18030') # string = u'我想要毛衣' sentences = robot.question_and_answer(string, sentences, tags, tag_tree)
def run_create_word2vec(self, sentences_path, word_embedding_path, word_embedding_market_path) : loader = PickleMarket() sentences = list() for type in [u'_car']:#, u'_finance', u'_web'] : sentences.extend(loader.load_market(sentences_path + type)) print 'import finish ...' embeddor = WordEmbed() print sentences[0] model = embeddor.word_to_vector(type='create', sentences=sentences[0:100], path=word_embedding_market_path) data_list = embeddor.get_word2vec_model(model) file_operator = TextFileOperator() file_operator.writing(data_list, word_embedding_path) print 'create word2vec finished ...'
def run_tag_sentences(self, tag_tree_path, sentences_market_path, tags_martket_path, dict_market_path) : file_operator = TextFileOperator() loader = PickleMarket() sentences = loader.load_market(sentences_market_path) cmd_list = file_operator.reading(tag_tree_path) tag_tree = TagTree(cmd_list) robot = Robot() tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:]) loader = JsonMarket() loader.dump_market(tags, tags_martket_path) loader.dump_market(tag_tree.dict_tuple, dict_market_path) print '%.2f%% article >= 1 tags, number is, %d.' \ % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences)) \ % len([tag for tag in tags_show if len(tag) >= 1])
def run_test(self, tag_tree_path, sentences_market_path, tags_path, \ tags_martket_path, untag_sentence_path) : file_operator = TextFileOperator() loader = PickleMarket() sentences = loader.load_market(sentences_market_path) cmd_list = file_operator.reading(tag_tree_path) tag_tree = TagTree(cmd_list) robot = Robot() tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:]) loader = JsonMarket() self.write_tags(sentences, tags_show, tags_path) loader.dump_market(tags, tags_martket_path) file_operator.writing(untag_sentences, untag_sentence_path) # loader.dump_market(untag_sentences, sentences_market_path) # print '%.2f%% article >= 2 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences)) print '%.2f%% article >= 3 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences))
def read_sentences(self, sentences_path) : """ Read participle sentences. Each row is a sentence. """ file_operator = TextFileOperator() data_list = file_operator.reading(sentences_path) entry_list = data_list[0] sentences = list() length = len(data_list[1:]) - 1 for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : sentence = data[0].upper() sentences.append(sentence) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) return sentences
def _read_dictionary(self, split_path):
    """ Build a lookup dict from the first column of the file at split_path.

    A new TextFileOperator is stored on self.file_operator and used for
    reading. Every row with at least one field contributes its first field
    as a key mapped to None; repeated keys are ignored.
    """
    self.file_operator = TextFileOperator()
    rows = self.file_operator.reading(split_path)
    split_dict = {}
    for row in rows:
        if len(row) < 1:
            continue
        split_dict.setdefault(row[0], None)
    return split_dict
def write_article(self, articles, article_path) : """ Write source article. Each row is an article. Colunm[0] is the id of article. Column[1:] is the attributes of article. """ data_list = list() entry_list = ['id', 'url', 'title', 'content'] data_list.append(entry_list) length = len(articles) - 1 for idx, article in enumerate(articles) : data = [article['id'], article['url'], article['title'], article['content']] data_list.append(data) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) file_operator = TextFileOperator() file_operator.writing(data_list, article_path)
def read_participle_title(self, title_path) : """ Read participle title. Each row is an article. Colunm[0] is the id of article. Column[1:] is the word of participle title. """ file_operator = TextFileOperator() data_list = file_operator.reading(title_path) entry_list = data_list[0] source_list = list() length = len(data_list[1:]) - 1 for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : article = dict() article['id'] = data[0] article['participle_title'] = [Word(word) for word in data[1].split(' ')] source_list.append(article) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) return source_list
def write_tags(self, sentences, tags, tags_path) : """ Read participle sentences. Each row is a sentence. Each column is a <attribute, value> pair. """ file_operator = TextFileOperator() data_list = list() data_list.append(['sentence', 'tag']) length = len(tags) - 1 for idx, term in enumerate(tags) : if len(term) >= 2 : data = list() data.append(sentences[idx][1]) tag_str = '' for attr, value in term : tag_str += u'<' + attr + u',' + value + u'>' + ' ' data.append(tag_str) data_list.append(data) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) file_operator.writing(data_list, tags_path)
def run_feature_select(self, article_market_path, pos_path, punc_path, klword_path, \ feature_path, feature_market_path) : loader = PickleMarket() pos_selector = selector.PosExtractor(pos_path, w=5, combined=True) token_selector = selector.TokenExtractor(punc_path) word_selector = selector.WordExtractor(klword_path, weight=1) articles = loader.load_market(article_market_path) length = len(articles) - 1 for idx, article in enumerate(articles) : article['features'] = list() article['features'].extend(pos_selector.extract_feature(article['participle_content'])) article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \ article['participle_title'], \ article['participle_content'])) article['features'].extend(word_selector.extract_feature(article['participle_title'], \ article['participle_content'])) print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) featuresets = [[article['id']] + article['features'] + [article['label']] for article in articles] file_operator = TextFileOperator() file_operator.writing(featuresets, feature_path) loader.dump_market(featuresets, feature_market_path) print 'finish'
def run_optimize_params(
    self, train_article_market_path, test_article_market_path, pos_path, punc_path, klword_path, logger_path
):
    """ Grid-search feature and SVM parameters across domains and log results.

    For every combination of POS-extractor settings (w, combined) and
    word-extractor weight, features are extracted for the train and test
    articles of each domain; the article markets are located by replacing
    u"all" in the given paths with the domain name. Then, for every
    (kernel, c, norm) combination, an SvmClassifier is trained on each
    single domain and tested on each domain, followed by a classifier
    trained on all domains merged and tested on each domain. Element [1] of
    every evaluation result is collected; one log row per combination is
    appended and the whole log is written to logger_path.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost - confirm loop boundaries against the original file.
    NOTE(review): `100.0 * idx / length` divides by zero when a domain holds
    exactly one article, and `idx` is unbound when it holds none.
    """
    loader = PickleMarket()
    logger = list()
    # Header row: six parameter columns, nine train_test single-domain
    # columns and three merged-train columns.
    logger.append(
        [
            "w",
            "combined",
            "weight",
            "kernel",
            "c",
            "norm",
            "car_car",
            "car_finance",
            "car_web",
            "finance_car",
            "finance_finance",
            "finance_web",
            "web_car",
            "web_fiannce",
            "web_web",
            "merge_car",
            "merge_finance",
            "merge_web",
        ]
    )
    domains = [u"car", u"finance", u"web"]
    wset = [5, 10, 15, 20]
    combinedset = [True, False]
    weightset = [1, 2, 5]
    kernelset = ["linear", "poly", "rbf"]
    # Each entry of cset is itself a list of values; it is passed whole to
    # classifier.training(cset=...) and only its first value is logged.
    cset = [range(10, 100, 10), range(100, 1000, 100)]
    normset = ["mapminmax", "zscore"]
    token_selector = selector.TokenExtractor(punc_path)
    for w in wset:
        for combined in combinedset:
            pos_selector = selector.PosExtractor(pos_path, w=w, combined=combined)
            for weight in weightset:
                word_selector = selector.WordExtractor(klword_path, weight=weight)
                # One feature table per domain, filled in below.
                train_featuresets, test_featuresets = list(), list()
                for step in range(len(domains)):
                    train_featuresets.append(list())
                    test_featuresets.append(list())
                for index, domain in enumerate(domains):
                    train_articles = loader.load_market(train_article_market_path.replace(u"all", domain))
                    test_articles = loader.load_market(test_article_market_path.replace(u"all", domain))
                    # train
                    length = len(train_articles) - 1
                    for idx, article in enumerate(train_articles):
                        article["features"] = list()
                        article["features"].extend(pos_selector.extract_feature(article["participle_content"]))
                        article["features"].extend(
                            token_selector.extract_feature(
                                article["title"],
                                article["content"],
                                article["participle_title"],
                                article["participle_content"],
                            )
                        )
                        article["features"].extend(
                            word_selector.extract_feature(
                                article["participle_title"], article["participle_content"]
                            )
                        )
                        print "finish rate is %.2f%%\r" % (100.0 * idx / length),
                    print "finish rate is %.2f%%\r" % (100.0 * idx / length)
                    # Row layout: [id] + features + [label].
                    train_featuresets[index] = [
                        [article["id"]] + article["features"] + [article["label"]] for article in train_articles
                    ]
                    # test
                    length = len(test_articles) - 1
                    for idx, article in enumerate(test_articles):
                        article["features"] = list()
                        article["features"].extend(pos_selector.extract_feature(article["participle_content"]))
                        article["features"].extend(
                            token_selector.extract_feature(
                                article["title"],
                                article["content"],
                                article["participle_title"],
                                article["participle_content"],
                            )
                        )
                        article["features"].extend(
                            word_selector.extract_feature(
                                article["participle_title"], article["participle_content"]
                            )
                        )
                        print "finish rate is %.2f%%\r" % (100.0 * idx / length),
                    print "finish rate is %.2f%%\r" % (100.0 * idx / length)
                    test_featuresets[index] = [
                        [article["id"]] + article["features"] + [article["label"]] for article in test_articles
                    ]
                for kernel in kernelset:
                    for c in cset:
                        for norm in normset:
                            evl = list()
                            # Single-domain training: every train domain against every test domain.
                            for train_idx in range(0, len(domains)):
                                for test_idx in range(0, len(domains)):
                                    # [1:-1] drops the id (first) and the label (last).
                                    train_dataset = np.array(
                                        [np.array(article[1:-1]) for article in train_featuresets[train_idx]]
                                    )
                                    train_label = np.array(
                                        [np.array(int(article[-1])) for article in train_featuresets[train_idx]]
                                    )
                                    test_dataset = np.array(
                                        [np.array(article[1:-1]) for article in test_featuresets[test_idx]]
                                    )
                                    test_label = np.array(
                                        [np.array(int(article[-1])) for article in test_featuresets[test_idx]]
                                    )
                                    classifier = SvmClassifier()
                                    train_dataset = classifier.normalize(train_dataset, method=norm)
                                    test_dataset = classifier.normalize(test_dataset, method=norm)
                                    classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                    test_prob = classifier.testing(test_dataset, type="prob")
                                    test_class = classifier.testing(test_dataset, type="label")
                                    evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                            print "single finished ..."
                            # merge
                            articles = list()
                            for train_idx in range(0, len(domains)):
                                articles.extend(train_featuresets[train_idx])
                            train_dataset = np.array([np.array(article[1:-1]) for article in articles])
                            train_label = np.array([np.array(int(article[-1])) for article in articles])
                            for test_idx in range(0, len(domains)):
                                test_dataset = np.array(
                                    [np.array(article[1:-1]) for article in test_featuresets[test_idx]]
                                )
                                test_label = np.array(
                                    [np.array(int(article[-1])) for article in test_featuresets[test_idx]]
                                )
                                classifier = SvmClassifier()
                                # NOTE(review): train_dataset is re-assigned with its own
                                # normalized version on every pass of this loop - confirm
                                # that repeated normalization is harmless.
                                train_dataset = classifier.normalize(train_dataset, method=norm)
                                test_dataset = classifier.normalize(test_dataset, method=norm)
                                classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                test_prob = classifier.testing(test_dataset, type="prob")
                                test_class = classifier.testing(test_dataset, type="label")
                                evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                            print "merge finished ..."
                            print "performance is", 1.0 * sum(evl) / len(evl)
                            log = [w, combined, weight, kernel, c[0], norm]
                            log.extend(evl)
                            logger.append(log)
    file_operator = TextFileOperator()
    file_operator.writing(logger, logger_path)
    print "finish"
def run_optimize_params(self, train_article_market_path, test_article_market_path, \
                        pos_path, punc_path, klword_path, logger_path) :
    """ Grid-search feature and SVM parameters across domains and log results.

    For every combination of POS-extractor settings (w, combined) and
    word-extractor weight, features are extracted for the train and test
    articles of each domain; the article markets are located by replacing
    u'all' in the given paths with the domain name. Then, for every
    (kernel, c, norm) combination, an SvmClassifier is trained on each
    single domain and tested on each domain, followed by a classifier
    trained on all domains merged and tested on each domain. Element [1] of
    every evaluation result is collected; one log row per combination is
    appended and the whole log is written to logger_path.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost - confirm loop boundaries against the original file.
    NOTE(review): `100.0*idx/length` divides by zero when a domain holds
    exactly one article, and `idx` is unbound when it holds none.
    """
    loader = PickleMarket()
    logger = list()
    # Header row: six parameter columns, nine train_test single-domain
    # columns and three merged-train columns.
    logger.append(['w', 'combined', 'weight', 'kernel', 'c', 'norm', 'car_car', \
                    'car_finance', 'car_web', 'finance_car', 'finance_finance', 'finance_web', \
                    'web_car', 'web_fiannce', 'web_web', 'merge_car', 'merge_finance', 'merge_web'])
    domains = [u'car', u'finance', u'web']
    wset = [5, 10, 15, 20]
    combinedset = [True, False]
    weightset = [1, 2, 5]
    kernelset = ['linear', 'poly', 'rbf']
    # Each entry of cset is itself a list of values; it is passed whole to
    # classifier.training(cset=...) and only its first value is logged.
    cset = [range(10, 100, 10), range(100, 1000, 100)]
    normset = ['mapminmax', 'zscore']
    token_selector = selector.TokenExtractor(punc_path)
    for w in wset :
        for combined in combinedset :
            pos_selector = selector.PosExtractor(pos_path, w=w, combined=combined)
            for weight in weightset :
                word_selector = selector.WordExtractor(klword_path, weight=weight)
                # One feature table per domain, filled in below.
                train_featuresets, test_featuresets = list(), list()
                for step in range(len(domains)) :
                    train_featuresets.append(list())
                    test_featuresets.append(list())
                for index, domain in enumerate(domains) :
                    train_articles = loader.load_market(train_article_market_path.replace(u'all', domain))
                    test_articles = loader.load_market(test_article_market_path.replace(u'all', domain))
                    # train
                    length = len(train_articles) - 1
                    for idx, article in enumerate(train_articles) :
                        article['features'] = list()
                        article['features'].extend(pos_selector.extract_feature(article['participle_content']))
                        article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \
                                                                                  article['participle_title'], \
                                                                                  article['participle_content']))
                        article['features'].extend(word_selector.extract_feature(article['participle_title'], \
                                                                                 article['participle_content']))
                        print 'finish rate is %.2f%%\r' % (100.0*idx/length),
                    print 'finish rate is %.2f%%\r' % (100.0*idx/length)
                    # Row layout: [id] + features + [label].
                    train_featuresets[index] = [[article['id']] + article['features'] + [article['label']] for article in train_articles]
                    # test
                    length = len(test_articles) - 1
                    for idx, article in enumerate(test_articles) :
                        article['features'] = list()
                        article['features'].extend(pos_selector.extract_feature(article['participle_content']))
                        article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \
                                                                                  article['participle_title'], \
                                                                                  article['participle_content']))
                        article['features'].extend(word_selector.extract_feature(article['participle_title'], \
                                                                                 article['participle_content']))
                        print 'finish rate is %.2f%%\r' % (100.0*idx/length),
                    print 'finish rate is %.2f%%\r' % (100.0*idx/length)
                    test_featuresets[index] = [[article['id']] + article['features'] + [article['label']] for article in test_articles]
                for kernel in kernelset :
                    for c in cset :
                        for norm in normset :
                            evl = list()
                            # Single-domain training: every train domain against every test domain.
                            for train_idx in range(0, len(domains)) :
                                for test_idx in range(0, len(domains)) :
                                    # [1:-1] drops the id (first) and the label (last).
                                    train_dataset = np.array([np.array(article[1:-1]) for article in train_featuresets[train_idx]])
                                    train_label = np.array([np.array(int(article[-1])) for article in train_featuresets[train_idx]])
                                    test_dataset = np.array([np.array(article[1:-1]) for article in test_featuresets[test_idx]])
                                    test_label = np.array([np.array(int(article[-1])) for article in test_featuresets[test_idx]])
                                    classifier = SvmClassifier()
                                    train_dataset = classifier.normalize(train_dataset, method=norm)
                                    test_dataset = classifier.normalize(test_dataset, method=norm)
                                    classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                    test_prob = classifier.testing(test_dataset, type='prob')
                                    test_class = classifier.testing(test_dataset, type='label')
                                    evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                            print 'single finished ...'
                            # merge
                            articles = list()
                            for train_idx in range(0, len(domains)) :
                                articles.extend(train_featuresets[train_idx])
                            train_dataset = np.array([np.array(article[1:-1]) for article in articles])
                            train_label = np.array([np.array(int(article[-1])) for article in articles])
                            for test_idx in range(0, len(domains)) :
                                test_dataset = np.array([np.array(article[1:-1]) for article in test_featuresets[test_idx]])
                                test_label = np.array([np.array(int(article[-1])) for article in test_featuresets[test_idx]])
                                classifier = SvmClassifier()
                                # NOTE(review): train_dataset is re-assigned with its own
                                # normalized version on every pass of this loop - confirm
                                # that repeated normalization is harmless.
                                train_dataset = classifier.normalize(train_dataset, method=norm)
                                test_dataset = classifier.normalize(test_dataset, method=norm)
                                classifier.training(train_dataset, train_label, cset=c, kernel=kernel)
                                test_prob = classifier.testing(test_dataset, type='prob')
                                test_class = classifier.testing(test_dataset, type='label')
                                evl.append(classifier.evaluation(test_label, test_prob, test_class)[1])
                            print 'merge finished ...'
                            print 'performance is', 1.0*sum(evl)/len(evl)
                            log = [w, combined, weight, kernel, c[0], norm]
                            log.extend(evl)
                            logger.append(log)
    file_operator = TextFileOperator()
    file_operator.writing(logger, logger_path)
    print 'finish'