Example #1
 def run_feature_select(self, article_market_path, dictionary_path, \
                        feature_market_path) :
     loader = PickleMarket()
     articles = loader.load_market(article_market_path)
     [word2id, id2word] = loader.load_market(dictionary_path)
     dim = len(word2id)
     featuresets = list()
     length = len(articles) - 1
     for idx, article in enumerate(articles) :
         feature = [0] * dim
         for word in article['participle_title'] :
             word = word.to_string()
             if word in word2id :
                 feature[word2id[word]] += 1
         for word in article['participle_content'] :
             word = word.to_string()
             if word in word2id :
                 feature[word2id[word]] += 1
         featuresets.append([article['id']] + feature + [article['label']])
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     file_operator = TextFileOperator()
     # file_operator.writing(featuresets, feature_path)
     loader.dump_market(featuresets, feature_market_path)
     print 'finish'
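
The loop above turns every article into one bag-of-words row: the article id, one count per dictionary word (title and content both contribute), and the label. Below is a minimal self-contained sketch of that counting step; build_feature_row, the plain-string tokens and the toy word2id mapping are illustrative stand-ins, not the project's real objects (which expose to_string()).

 def build_feature_row(article, word2id):
     # One counter slot per dictionary entry, in word2id order.
     feature = [0] * len(word2id)
     for word in article['participle_title'] + article['participle_content']:
         if word in word2id:
             feature[word2id[word]] += 1
     # Same row layout as in the example: id, counts, label.
     return [article['id']] + feature + [article['label']]

 word2id = {'engine': 0, 'price': 1, 'seat': 2}
 article = {'id': 42, 'label': 1,
            'participle_title': ['engine', 'price'],
            'participle_content': ['price', 'price', 'seat']}
 print(build_feature_row(article, word2id))   # [42, 1, 3, 1, 1]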
Example #2
 def run_convert_sentences(self, sentences_path, sentences_market_path) :
     file_operator = TextFileOperator()
     sentences = file_operator.reading(sentences_path)
     sentences = [[word.split('<:>')[0] for word in sentence] for sentence in sentences]
     loader = PickleMarket()
     loader.dump_market(sentences, sentences_market_path)
     print 'converting sentences finished ...'
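
The list comprehension above implies that each raw token carries extra annotation after a '<:>' separator and that only the part before it is kept. A tiny stand-in (the token contents are made up; the '<:>' convention comes from the code):

 tokens = ['engine<:>n', 'price<:>n', 'cheap<:>a']
 words = [tok.split('<:>')[0] for tok in tokens]
 print(words)   # ['engine', 'price', 'cheap'] -- bare words, annotations dropped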
Example #3
 def run_create_dictionary(self, article_market_path, dictionary_path, dict_set) :
     loader = PickleMarket()
     word2id, id2word = dict(), dict()
     index = 0
     # dict_set is a '#'-separated list of article types; substitute each into the market path.
     for article_type in dict_set.split('#') :
         path = article_market_path.replace(u'car', article_type)
         articles = loader.load_market(path)
         length = len(articles) - 1
         for idx, article in enumerate(articles) :
             for word in article['participle_title'] :
                 word = word.to_string()
                 if word not in word2id :
                     word2id[word] = index
                     id2word[index] = word
                     index += 1
             for word in article['participle_content'] :
                 word = word.to_string()
                 if word not in word2id :
                     word2id[word] = index
                     id2word[index] = word
                     index += 1
             if idx % 100 == 0 :
                 print 'finish rate is %.2f%%\r' % (100.0*idx/length),
         print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     loader = PickleMarket()
     loader.dump_market([word2id, id2word], dictionary_path)
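
run_create_dictionary assigns ids in first-seen order across every article set listed in dict_set, building the forward (word2id) and reverse (id2word) maps together. A self-contained sketch of that id assignment with plain token lists instead of article objects; add_tokens and the sample tokens are hypothetical:

 def add_tokens(tokens, word2id, id2word):
     # Give every previously unseen token the next free id.
     for word in tokens:
         if word not in word2id:
             idx = len(word2id)
             word2id[word] = idx
             id2word[idx] = word

 word2id, id2word = dict(), dict()
 add_tokens(['engine', 'price', 'engine'], word2id, id2word)
 add_tokens(['seat', 'price'], word2id, id2word)
 print(sorted(word2id.items(), key=lambda item: item[1]))
 # [('engine', 0), ('price', 1), ('seat', 2)]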
Example #4
 def run_feature_select(
     self, article_market_path, pos_path, punc_path, klword_path, feature_path, feature_market_path
 ):
     loader = PickleMarket()
     pos_selector = selector.PosExtractor(pos_path, w=15, combined=False)
     token_selector = selector.TokenExtractor(punc_path)
     word_selector = selector.WordExtractor(klword_path, weight=1)
     articles = loader.load_market(article_market_path)
     length = len(articles) - 1
     for idx, article in enumerate(articles):
         article["features"] = list()
         article["features"].extend(pos_selector.extract_feature_windows(article["participle_content"]))
         article["features"].extend(
             token_selector.extract_feature(
                 article["title"], article["content"], article["participle_title"], article["participle_content"]
             )
         )
         article["features"].extend(
             word_selector.extract_feature(article["participle_title"], article["participle_content"])
         )
         print "finish rate is %.2f%%\r" % (100.0 * idx / length),
     print "finish rate is %.2f%%\r" % (100.0 * idx / length)
     featuresets = [pos_selector.names + token_selector.names + word_selector.names]
     featuresets.extend([[article["id"]] + article["features"] + [article["label"]] for article in articles])
     file_operator = TextFileOperator()
     file_operator.writing(featuresets, feature_path)
     loader.dump_market(featuresets, feature_market_path)
     print "finish"
Example #5
 def run_tag_sentences(self, tag_tree_path, sentences_market_path, tags_martket_path, dict_market_path) :
     file_operator = TextFileOperator()
     loader = PickleMarket()
     sentences = loader.load_market(sentences_market_path)
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     robot = Robot()
     tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:])
     loader = JsonMarket()
     loader.dump_market(tags, tags_martket_path)
     loader.dump_market(tag_tree.dict_tuple, dict_market_path)
     print '%.2f%% article >= 1 tags, number is %d.' \
         % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences),
            len([tag for tag in tags_show if len(tag) >= 1]))
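
The final print reports what share of the input sentences received at least one tag, plus the absolute count. Computed on toy data (tags_show here is just a per-sentence list of tag lists; the sample values are made up):

 sentences = ['s1', 's2', 's3', 's4']
 tags_show = [['brand'], [], ['brand', 'price'], []]
 tagged = len([tags for tags in tags_show if len(tags) >= 1])
 print('%.2f%% article >= 1 tags, number is %d.' % (100.0 * tagged / len(sentences), tagged))
 # 50.00% article >= 1 tags, number is 2.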
Example #6
 def run_test(self, tag_tree_path, sentences_market_path, tags_path, \
     tags_martket_path, untag_sentence_path) :
     file_operator = TextFileOperator()
     loader = PickleMarket()
     sentences = loader.load_market(sentences_market_path)
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     robot = Robot()
     tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:])
     loader = JsonMarket()
     self.write_tags(sentences, tags_show, tags_path)
     loader.dump_market(tags, tags_martket_path)
     file_operator.writing(untag_sentences, untag_sentence_path)
     # loader.dump_market(untag_sentences, sentences_market_path)
     # print '%.2f%% article >= 2 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences))
     print '%.2f%% article >= 1 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences))
Example #7
 def run_feature_select(self, article_market_path, pos_path, punc_path, klword_path, \
     feature_path, feature_market_path) :
     loader = PickleMarket()
     pos_selector = selector.PosExtractor(pos_path, w=5, combined=True)
     token_selector = selector.TokenExtractor(punc_path)
     word_selector = selector.WordExtractor(klword_path, weight=1)
     articles = loader.load_market(article_market_path)
     length = len(articles) - 1
     for idx, article in enumerate(articles) :
         article['features'] = list()
         article['features'].extend(pos_selector.extract_feature(article['participle_content']))
         article['features'].extend(token_selector.extract_feature(article['title'], article['content'], \
                                                                     article['participle_title'], \
                                                                     article['participle_content']))
         article['features'].extend(word_selector.extract_feature(article['participle_title'], \
                                                                     article['participle_content']))
         print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     featuresets = [[article['id']] + article['features'] + [article['label']] for article in articles]
     file_operator = TextFileOperator()
     file_operator.writing(featuresets, feature_path)
     loader.dump_market(featuresets, feature_market_path)
     print 'finish'
Example #8
 def run_convert_sentences(self, sentences_path, sentences_market_path) :
     file_operator = TextFileOperator()
     sentences = self.read_sentences(sentences_path)
     loader = PickleMarket()
     loader.dump_market(sentences, sentences_market_path)
Example #9
 def run_convert_article(self, article_path, article_market_path) :
     articles = self.read_article(article_path)
     # articles = self.read_participle(articles, participle_path)
     loader = PickleMarket()
     loader.dump_market(articles, article_market_path)
     print 'finish.'