示例#1
0
 def generate_rake_keywords(self):
     """Extract RAKE key phrases for every dataset title and cache them.

     If both ``self.keyword_file`` and ``self.rake_score_file`` already
     exist, nothing is recomputed: ``self.load()`` is called and the
     method returns.

     Otherwise the dataset is loaded, a ``Rake`` instance is built from the
     ``'english'`` and ``'german'`` stop-word lists plus a few extra
     tokens, the phrases generated for each title are appended to
     ``self.keywords`` and the phrase scores are stored in
     ``self.rake_scores``. Both results are pickled into ``self.path`` as
     ``keyword.dat`` and ``rake_score.dat``.

     Side effects: may set ``self.path`` (it falls back to
     ``self.dataset.default_path`` when it is ``None``), appends to
     ``self.keywords``, sets ``self.rake_scores``, writes two files and
     prints progress to stdout.
     """
     if os.path.exists(self.keyword_file) and os.path.exists(
             self.rake_score_file):
         print(
             'Already have the files [' + self.keyword_file + ', ' +
             self.rake_score_file + ']', ', directly load them.')
         self.load()
         return

     self.dataset.load()
     if self.path is None:
         self.path = self.dataset.default_path

     # NOTE(review): the cache check above looks at self.keyword_file and
     # self.rake_score_file, while the results are written to
     # <self.path>/keyword.dat and <self.path>/rake_score.dat -- confirm
     # that these refer to the same locations.
     keyword_path = os.path.join(self.path, 'keyword.dat')
     score_path = os.path.join(self.path, 'rake_score.dat')

     stop_words = stopwords.words('english')
     stop_words.extend(stopwords.words('german'))
     stop_words.extend(['via', 'using', 'fr'])
     rake = Rake(stop_words)
     rake.extract_keywords_from_sentences(self.dataset.titles)

     print('generate keywords', end='', flush=True)
     # Build every phrase list BEFORE opening the output file: opening it
     # in 'wb' mode first would truncate it, and a failure part-way through
     # the loop would then leave an empty, unloadable pickle behind.
     # NOTE(review): results are appended to self.keywords, which is
     # presumably an empty list at this point -- verify against __init__.
     for count, title in enumerate(self.dataset.titles, start=1):
         if count % 100000 == 0:
             # One progress dot per 100000 processed titles.
             print('.', end='', flush=True)
         self.keywords.append(
             [' '.join(phrase) for phrase in rake.generate_phrases(title)])
     with open(keyword_path, 'wb') as f:
         pickle.dump(self.keywords, f)

     self.rake_scores = rake.phrase_score
     with open(score_path, 'wb') as f:
         pickle.dump(self.rake_scores, f)
     print('done')