def generate_rake_keywords(self):
    """Build per-title keyword phrases and RAKE phrase scores, then pickle both.

    If both ``self.keyword_file`` and ``self.rake_score_file`` already exist,
    nothing is regenerated: ``self.load()`` is called and the method returns.

    Otherwise the method:
      * calls ``self.dataset.load()``;
      * falls back to ``self.dataset.default_path`` when ``self.path`` is None;
      * builds a ``Rake`` instance from the NLTK English and German stop-word
        lists plus ``'via'``, ``'using'`` and ``'fr'``;
      * appends one list of space-joined phrases per title to
        ``self.keywords`` and pickles it to ``<self.path>/keyword.dat``;
      * stores ``r.phrase_score`` in ``self.rake_scores`` and pickles it to
        ``<self.path>/rake_score.dat``.

    Returns:
        None.
    """
    # NOTE(review): the cache check looks at self.keyword_file /
    # self.rake_score_file while the output goes to files under self.path --
    # presumably these are the same locations; confirm against the class.
    if os.path.exists(self.keyword_file) and os.path.exists(
            self.rake_score_file):
        print(
            'Already have the files [' + self.keyword_file + ', ' +
            self.rake_score_file + ']', ', directly load them.')
        self.load()
        return

    self.dataset.load()
    if self.path is None:
        self.path = self.dataset.default_path

    ger_stop_words = stopwords.words('german')
    stop_words = stopwords.words('english')
    stop_words.extend(ger_stop_words)
    stop_words.extend(['via', 'using', 'fr'])

    r = Rake(stop_words)
    r.extract_keywords_from_sentences(self.dataset.titles)

    # Resolve both output paths before the long loop so a bad self.path is
    # reported immediately rather than after extraction.
    keyword_path = os.path.join(self.path, 'keyword.dat')
    score_path = os.path.join(self.path, 'rake_score.dat')

    print('generate keywords', end='', flush=True)
    for count, title in enumerate(self.dataset.titles, start=1):
        # One progress dot per 100000 titles.
        if count % 100000 == 0:
            print('.', end='', flush=True)
        phrases = r.generate_phrases(title)
        self.keywords.append([' '.join(phrase) for phrase in phrases])

    # Open the keyword file only once extraction has finished: opening it in
    # 'wb' mode up front would truncate it, and a failure in the loop would
    # leave an empty, unloadable pickle behind.
    with open(keyword_path, 'wb') as f:
        pickle.dump(self.keywords, f)

    self.rake_scores = r.phrase_score
    with open(score_path, 'wb') as f:
        pickle.dump(self.rake_scores, f)
    print('done')
import datetime

from rake import Rake
from nltk.corpus import stopwords

# Script: reads every title from dblp_index/title.dat, runs keyword extraction
# over all of them, then writes one line per title to dblp_index/keywords.dat.
# Each output line is a comma-separated list of "phrase:score" entries, where
# the words of a phrase are joined with '_'.

# `datetime` is imported above because this call needs it; the start time is
# recorded here but not used further in this part of the file.
starttime = datetime.datetime.now()

# Stop words: NLTK English + German lists plus a few extra tokens.
ger_stop_words = stopwords.words('german')
stop_words = stopwords.words('english')
stop_words.extend(ger_stop_words)
stop_words.extend(['via', 'using', 'fr'])
r = Rake(stop_words)

# Read the titles once and reuse the list for both passes, instead of
# re-opening the file and rebinding `titles` to the file handle.
# Lines keep their trailing newline, as they did when streamed from the file.
with open(r'dblp_index/title.dat', encoding='utf-8') as f_title:
    titles = list(f_title)
r.extract_keywords_from_sentences(titles)

print('generate keywords', end='', flush=True)
# NOTE(review): append mode means re-running the script adds a second copy of
# every line to keywords.dat -- confirm that is intended.
with open(r'dblp_index/keywords.dat', 'a', encoding='utf-8') as keywords:
    for i, line in enumerate(titles, start=1):
        # One progress dot per 10000 titles.
        if i % 10000 == 0:
            print('.', end='', flush=True)
        # NOTE(review): _generate_phrases is an underscore-prefixed method of
        # Rake; its exact contract is not visible from this file.
        phrases = r._generate_phrases(line)
        phrases_scores = []
        for phrase in phrases:
            true_phrase = '_'.join(phrase)
            score = r.phrase_score[true_phrase]
            phrases_scores.append(true_phrase + ":" + str(score))
        keywords.write(','.join(phrases_scores) + '\n')
print('done')