def run_feature_select(self, article_market_path, dictionary_path, \ feature_market_path) : loader = PickleMarket() articles = loader.load_market(article_market_path) [word2id, id2word] = loader.load_market(dictionary_path) dim = len(word2id) featuresets = list() length = len(articles) - 1 for idx, article in enumerate(articles) : feature = [0] * dim for word in article['participle_title'] : word = word.to_string() if word in word2id : feature[word2id[word]] += 1 for word in article['participle_content'] : word = word.to_string() if word in word2id : feature[word2id[word]] += 1 featuresets.append([article['id']] + feature + [article['label']]) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) file_operator = TextFileOperator() # file_operator.writing(featuresets, feature_path) loader.dump_market(featuresets, feature_market_path) print 'finish'
def run_convert_sentences(self, sentences_path, sentences_market_path) : file_operator = TextFileOperator() sentences = file_operator.reading(sentences_path) sentences = [[word.split('<:>')[0] for word in sentence] for sentence in sentences] loader = PickleMarket() loader.dump_market(sentences, sentences_market_path) print 'converting sentences finished ...'
def run_create_dictionary(self, article_market_path, dictionary_path, dict_set) : loader = PickleMarket() word2id, id2word = dict(), dict() index = 0 for type in dict_set.split('#') : path = article_market_path.replace(u'car', type) articles = loader.load_market(path) length = len(articles) - 1 for idx, article in enumerate(articles) : for word in article['participle_title'] : word = word.to_string() if word not in word2id : word2id[word] = index id2word[index] = word index += 1 for word in article['participle_content'] : word = word.to_string() if word not in word2id : word2id[word] = index id2word[index] = word index += 1 if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) loader = PickleMarket() loader.dump_market([word2id, id2word], dictionary_path)
def run_feature_select( self, article_market_path, pos_path, punc_path, klword_path, feature_path, feature_market_path ): loader = PickleMarket() pos_selector = selector.PosExtractor(pos_path, w=15, combined=False) token_selector = selector.TokenExtractor(punc_path) word_selector = selector.WordExtractor(klword_path, weight=1) articles = loader.load_market(article_market_path) length = len(articles) - 1 for idx, article in enumerate(articles): article["features"] = list() article["features"].extend(pos_selector.extract_feature_windows(article["participle_content"])) article["features"].extend( token_selector.extract_feature( article["title"], article["content"], article["participle_title"], article["participle_content"] ) ) article["features"].extend( word_selector.extract_feature(article["participle_title"], article["participle_content"]) ) print "finish rate is %.2f%%\r" % (100.0 * idx / length), print "finish rate is %.2f%%\r" % (100.0 * idx / length) featuresets = [pos_selector.names + token_selector.names + word_selector.names] featuresets.extend([[article["id"]] + article["features"] + [article["label"]] for article in articles]) file_operator = TextFileOperator() file_operator.writing(featuresets, feature_path) loader.dump_market(featuresets, feature_market_path) print "finish"
def run_tag_sentences(self, tag_tree_path, sentences_market_path, tags_martket_path, dict_market_path) : file_operator = TextFileOperator() loader = PickleMarket() sentences = loader.load_market(sentences_market_path) cmd_list = file_operator.reading(tag_tree_path) tag_tree = TagTree(cmd_list) robot = Robot() tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:]) loader = JsonMarket() loader.dump_market(tags, tags_martket_path) loader.dump_market(tag_tree.dict_tuple, dict_market_path) print '%.2f%% article >= 1 tags, number is, %d.' \ % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences)) \ % len([tag for tag in tags_show if len(tag) >= 1])
def run_test(self, tag_tree_path, sentences_market_path, tags_path, \ tags_martket_path, untag_sentence_path) : file_operator = TextFileOperator() loader = PickleMarket() sentences = loader.load_market(sentences_market_path) cmd_list = file_operator.reading(tag_tree_path) tag_tree = TagTree(cmd_list) robot = Robot() tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:]) loader = JsonMarket() self.write_tags(sentences, tags_show, tags_path) loader.dump_market(tags, tags_martket_path) file_operator.writing(untag_sentences, untag_sentence_path) # loader.dump_market(untag_sentences, sentences_market_path) # print '%.2f%% article >= 2 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences)) print '%.2f%% article >= 3 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences))
def run_feature_select(self, article_market_path, pos_path, punc_path, klword_path, \ feature_path, feature_market_path) : loader = PickleMarket() pos_selector = selector.PosExtractor(pos_path, w=5, combined=True) token_selector = selector.TokenExtractor(punc_path) word_selector = selector.WordExtractor(klword_path, weight=1) articles = loader.load_market(article_market_path) length = len(articles) - 1 for idx, article in enumerate(articles) : article['features'] = list() article['features'].extend(pos_selector.extract_feature(article['participle_content'])) article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \ article['participle_title'], \ article['participle_content'])) article['features'].extend(word_selector.extract_feature(article['participle_title'], \ article['participle_content'])) print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) featuresets = [[article['id']] + article['features'] + [article['label']] for article in articles] file_operator = TextFileOperator() file_operator.writing(featuresets, feature_path) loader.dump_market(featuresets, feature_market_path) print 'finish'
def run_convert_sentences(self, sentences_path, sentences_market_path):
    """Read sentences via ``self.read_sentences`` and persist them as a
    pickle market.

    Fix: dropped an unused ``TextFileOperator`` instance the original
    constructed but never touched.
    """
    sentences = self.read_sentences(sentences_path)
    loader = PickleMarket()
    loader.dump_market(sentences, sentences_market_path)
def run_convert_article(self, article_path, article_market_path) : articles = self.read_article(article_path) # articles = self.read_participle(articles, participle_path) loader = PickleMarket() loader.dump_market(articles, article_market_path) print 'finish.'