import collections
import os

from tqdm import tqdm
# KeywordExtractor is provided elsewhere in the project.


def _obtain_candidate_keywords(list_all_dialogs, candi_kw_path, min_kw_freq=1,
                               load_file_if_exists=True):
    r"""Obtain and save the candidate keywords used for extracting keywords.

    Inputs: list_all_dialogs, candi_kw_path, min_kw_freq, load_file_if_exists
        - **list_all_dialogs**: a 'list' of dialogs, where each dialog is a
          'list' of utterance strings
        - **candi_kw_path**: path of the file used to cache the candidate keywords
        - **min_kw_freq**: minimum corpus frequency a candidate keyword must
          reach to be kept
        - **load_file_if_exists**: whether to load the cached candidate
          keywords file instead of recomputing it

    Outputs: candi_keywords
        - **candi_keywords**: a 'list' containing all the candidate keywords
    """
    if load_file_if_exists and os.path.isfile(candi_kw_path):
        with open(candi_kw_path, 'r') as f:
            candi_keywords = [kw.strip() for kw in f.readlines()]
        print('Loading candidate keywords from {}'.format(candi_kw_path))
        print('Total candidate keywords count: ', len(candi_keywords))
        return candi_keywords

    if not list_all_dialogs:
        raise Exception('no dialogs provided for obtaining candidate keywords')

    candi_kw_dir = os.path.dirname(candi_kw_path)
    if candi_kw_dir and not os.path.exists(candi_kw_dir):
        os.makedirs(candi_kw_dir)

    print('Obtaining candidate keywords...')
    # initialization
    candi_keywords = []
    kw_counter = collections.Counter()
    kw_extractor = KeywordExtractor()

    # extract possible keywords
    for dialog in tqdm(list_all_dialogs):
        for utterance in dialog:
            cur_keywords = kw_extractor.candi_extract(utterance)
            kw_counter.update(cur_keywords)
            candi_keywords.extend(cur_keywords)

    # delete the keywords occurring fewer than the specified number of times
    # (indicated by 'min_kw_freq')
    rare_keywords = [kw for kw, freq in kw_counter.most_common()
                     if freq < min_kw_freq]
    candi_keywords = [kw for kw, freq in kw_counter.most_common()
                      if freq >= min_kw_freq]
    # delete keywords consisting of only a single letter
    single_letter_keywords = [kw for kw in candi_keywords if len(kw) < 2]
    candi_keywords = [kw for kw in candi_keywords if len(kw) >= 2]

    # print the information of candidate keywords
    print('rare keywords count: ', len(rare_keywords))
    print('single letter keywords count: ', len(single_letter_keywords))
    print('total candidate keywords count (before cleaning): ', len(kw_counter))
    print('total candidate keywords count (after cleaning): ', len(candi_keywords))

    print('Saving candi_keywords into {}...'.format(candi_kw_path))
    with open(candi_kw_path, 'w') as f:
        for keyword in candi_keywords:
            f.write(keyword + '\n')

    return candi_keywords
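# A minimal usage sketch for _obtain_candidate_keywords, assuming
# KeywordExtractor.candi_extract returns a list of candidate keyword strings
# per utterance; the toy dialogs and cache path below are made up.
if __name__ == '__main__':
    toy_dialogs = [
        ['i love playing football', 'football is great fun'],
        ['do you like music', 'yes , i play the guitar'],
    ]
    candi_kws = _obtain_candidate_keywords(
        toy_dialogs, 'data/candi_keywords.txt',
        min_kw_freq=1, load_file_if_exists=False)
    print(candi_kws[:10])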
def create_tf_idf(file_path):
    # Builds a TF-IDF matrix for one article: tokenize the article's
    # sentences, build a term-frequency matrix, l2-normalize it, then apply
    # inverse-document-frequency weighting.
    reader = TrainingTextReader(file_path)
    keywords = KeywordExtractor(reader.articles[10], 'useless.txt')
    vector_index = Vectorizer(keywords.article_sents_tokened)
    freq_mat = vector_index.frequencyMatrix
    normalized_vector = VectorNormalizer(freq_mat)
    norm_mat = normalized_vector.l2_norm_matrice
    tf_idf = InverseDocumentFrequency(norm_mat)
    return tf_idf.tf_idf_matrice
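# For reference, a self-contained sketch of the same pipeline (term-frequency
# matrix -> l2 normalization -> IDF weighting) built on scikit-learn instead
# of the custom classes above, which are not shown in this excerpt. The
# sample sentences in the usage comment are invented.
from sklearn.feature_extraction.text import TfidfVectorizer

def create_tf_idf_sklearn(sentences):
    # norm='l2' mirrors the VectorNormalizer step; the result corresponds to
    # tf_idf_matrice: one row per sentence, one column per vocabulary term.
    vectorizer = TfidfVectorizer(norm='l2')
    return vectorizer.fit_transform(sentences)

# Example: create_tf_idf_sklearn(['the cat sat', 'the dog barked'])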
def _obtain_candidate_keywords(self, load_file_if_exists=True):
    r"""Obtains and saves the candidate keywords used for extracting keywords.

    Args:
        load_file_if_exists: A 'bool' indicating whether to load the
            candi_keywords file if it exists.

    Returns:
        candi_keywords: A 'list' containing all the candidate keywords.
    """
    candi_keywords_path = '../data/{}/candi_keywords.txt'.format(self.output_data_dir)
    if load_file_if_exists and os.path.isfile(candi_keywords_path):
        with open(candi_keywords_path, 'r') as f:
            candi_keywords = [kw.strip() for kw in f.readlines()]
        print('Loading candidate keywords from {}'.format(candi_keywords_path))
        print('Total candidate keywords count: ', len(candi_keywords))
        return candi_keywords

    print('Obtaining candidate keywords...')
    # Initialization
    candi_keywords = []
    kw_counter = collections.Counter()
    kw_extractor = KeywordExtractor()

    # Extracts possible keywords.
    for dialog in tqdm(self.list_all_dialogs):
        for utterance in dialog:
            cur_keywords = kw_extractor.candi_extract(utterance)
            kw_counter.update(cur_keywords)
            candi_keywords.extend(cur_keywords)

    # Deletes the keywords occurring fewer than 'self.min_kw_freq' times.
    candi_keywords = [kw for kw, freq in kw_counter.most_common()
                      if freq >= self.min_kw_freq]
    # Deletes keywords consisting of only a single letter.
    candi_keywords = [kw for kw in candi_keywords if len(kw) >= 2]

    # Writes candidate keywords into file.
    with open(candi_keywords_path, 'w') as f:
        for keyword in candi_keywords:
            f.write(keyword + '\n')

    return candi_keywords
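# KeywordExtractor.candi_extract is not defined in this excerpt. A minimal
# sketch of one plausible implementation, assuming a POS-tag-based filter
# that keeps content words (nouns, verbs, adjectives); this is an
# illustration, not the project's actual class. Requires the nltk 'punkt'
# and 'averaged_perceptron_tagger' resources to be installed.
import nltk

class SimpleCandidateExtractor:
    _CANDI_POS = ('NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ')

    def candi_extract(self, utterance):
        tokens = nltk.word_tokenize(utterance.lower())
        # Keeps only tokens whose POS tag marks them as content words.
        return [tok for tok, tag in nltk.pos_tag(tokens)
                if tag in self._CANDI_POS]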
def handle(self, *args, **options):
    # Re-extracts keywords for all questions asked in July 2016.
    questions = Question.objects.filter(date__year=2016, date__month=7)
    ke = KeywordExtractor()
    for question in questions:
        question.keywords.clear()
        keywords = ke.get_keywords(question.question)
        print(",".join(keywords))
        for keyword in keywords:
            # get_or_create already persists the Keyword row.
            m, created = Keyword.objects.get_or_create(keyword=keyword)
            question.keywords.add(m)
        question.save()
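# Hypothetical reconstruction of the models the command above relies on (the
# real definitions live elsewhere in the project): Question carries the raw
# text, a date, and a many-to-many relation to Keyword.
from django.db import models

class Keyword(models.Model):
    keyword = models.CharField(max_length=255, unique=True)

class Question(models.Model):
    question = models.TextField()
    date = models.DateField()
    keywords = models.ManyToManyField(Keyword)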
def __init__(self):
    self.keyword_extractor = KeywordExtractor()

    # ID -> human-readable name caches, one per game attribute.
    self.publisher_id_to_name = {}
    self.platform_id_to_name = {}
    self.theme_id_to_name = {}
    self.genre_id_to_name = {}
    self.game_mode_id_to_name = {}
    self.game_keyword_id_to_name = {}

    # Each fetcher resolves one attribute of a game record, filling the
    # corresponding cache as it goes.
    self.fetch_publishers = self.__add_attr_to_game_data(
        'publishers', 'companies', self.publisher_id_to_name)
    self.fetch_platforms = self.__add_attr_to_game_data(
        'platform', 'platforms', self.platform_id_to_name)
    self.fetch_themes = self.__add_attr_to_game_data(
        'themes', 'themes', self.theme_id_to_name)
    self.fetch_genres = self.__add_attr_to_game_data(
        'genres', 'genres', self.genre_id_to_name)
    self.fetch_game_modes = self.__add_attr_to_game_data(
        'game_modes', 'game_modes', self.game_mode_id_to_name)
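# __add_attr_to_game_data is not shown. Given how it is called, it plausibly
# returns a closure that maps attribute IDs on a game record to names,
# memoizing lookups in the supplied cache dict. A hypothetical sketch
# (resolve_name stands in for whatever API lookup the real code performs):
def __add_attr_to_game_data(self, field, endpoint, id_to_name):
    def fetch(game):
        names = []
        for item_id in game.get(field, []):
            if item_id not in id_to_name:
                # Hypothetical helper; the actual lookup is not shown.
                id_to_name[item_id] = self.resolve_name(endpoint, item_id)
            names.append(id_to_name[item_id])
        return names
    return fetch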
def __init__(self, dataset_name, output_data_dir, separator, min_kw_freq,
             context_turns, set_names):
    self.dataset_name = dataset_name
    self.output_data_dir = output_data_dir
    self.separator = separator
    self.min_kw_freq = min_kw_freq
    self.context_turns = context_turns
    self.set_names = set_names
    self._make_data_dir_if_not_exists()
    self._load_raw_dialog_data()

    # Initializes the keyword extractor from the candidate keywords and
    # their IDF statistics.
    candi_keywords = self._obtain_candidate_keywords()
    idf_dict = self._calculate_idf()
    self.kw_extractor = KeywordExtractor(candi_keywords, idf_dict)

    self._obtain_and_save_uttr_kw_mapping()  # uttr_kw_mapping: (utterances -> keywords) mapping
    self._obtain_and_save_vocab()
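# A hypothetical instantiation of the preprocessor above; the class name and
# every argument value here are illustrative, not taken from the source.
processor = DialogDataProcessor(  # assumed class name
    dataset_name='dailydialog',
    output_data_dir='dailydialog_processed',
    separator='__eou__',
    min_kw_freq=2,
    context_turns=2,
    set_names=['train', 'validation', 'test'])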
# (--dataset_name, --dataset_dir, and --idf_path are registered earlier; not shown)
parser.add_argument('--candi_kw_path', type=str,
                    help='path of candidate keywords file')
parser.add_argument('--input_text_path', type=str,
                    help='path of dialog text that needs keyword extraction')
parser.add_argument('--kw_output_path', type=str,
                    help='path of the file the extracted keywords are written to')
args = parser.parse_args()

output_info = 'Start keyword extraction [dataset: {}, file: {}]'.format(
    args.dataset_name, args.input_text_path)
print('-' * len(output_info))
print(output_info)
print('-' * len(output_info))

# initialize keyword extractor: first try the cached candidate-keyword and
# IDF files; if either is missing, recompute both from the raw dataset
try:
    candi_keywords = _obtain_candidate_keywords(None, args.candi_kw_path)
    idf_dict = _calculate_idf(None, args.idf_path)
except Exception as err:
    print('Exception: ', err)
    # load all dialogs of the specific dataset and recompute
    dataset = load_dataset(args.dataset_name, args.dataset_dir)
    candi_keywords = _obtain_candidate_keywords(dataset, args.candi_kw_path)
    idf_dict = _calculate_idf(dataset, args.idf_path)
kw_extractor = KeywordExtractor(candi_keywords, idf_dict)

# load texts that need extracting keywords
texts = load_texts(args.input_text_path)

# extract keywords
extract_keywords(texts, kw_extractor, args.kw_output_path)
print('Done.')
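# Example invocation (script name and all paths are illustrative only):
#   python extract_keywords.py \
#       --dataset_name dailydialog --dataset_dir data/dailydialog \
#       --candi_kw_path data/candi_keywords.txt --idf_path data/idf.txt \
#       --input_text_path data/dialogs.txt --kw_output_path data/keywords.txt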
from keyword_extractor import KeywordExtractor
import argparse

ap = argparse.ArgumentParser()
ap.add_argument("--word2vec", default=None,
                help="path to word2vec pre-trained embeddings")
ap.add_argument("--data", required=True,
                help="path to file from which keywords are to be extracted")
args = ap.parse_args()

with open(args.data, 'r') as data_file:
    lines = data_file.readlines()

extractor = KeywordExtractor(word2vec=args.word2vec)
for text in lines:
    keywords = extractor.extract(text, ratio=0.2, split=True, scores=True)
    for keyword in keywords:
        print(keyword)
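# KeywordExtractor is imported from a local module not shown here. Its
# extract() signature (ratio/split/scores) matches gensim's TextRank keywords
# helper (gensim < 4.0), so a minimal stand-in might look like the sketch
# below; this is a guess, not the project's actual implementation.
from gensim.summarization import keywords as textrank_keywords

class TextRankKeywordExtractor:
    def __init__(self, word2vec=None):
        # The real class presumably uses the embeddings somewhere (e.g. for
        # re-ranking); this sketch only stores the path.
        self.word2vec = word2vec

    def extract(self, text, ratio=0.2, split=True, scores=True):
        # Keeps the top `ratio` fraction of TextRank-ranked candidates and
        # returns (keyword, score) pairs when scores=True.
        return textrank_keywords(text, ratio=ratio, split=split, scores=scores)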
import pickle
import random

import urllib3
from bs4 import BeautifulSoup

# Custom Libs
from article_lister import ArticleLister
from keyword_extractor import KeywordExtractor
from news_db_storer import NewsDBStorer

cache_dir = "./pkl_cache/"

dbstore = NewsDBStorer(db_name="newsarticlesdb",
                       table_name="politician_based_newsarticlestable")
dbstore.set_up_connection()

keyword_xtractor = KeywordExtractor()


class GenericNewsScraper:
    def __init__(self, paper_name="cnn", base_url="https://www.cnn.com/"):
        self.articles = []
        self.base_url = base_url
        self.paper_name = paper_name
        self.art_obj = set()

    # Loads article cache
    def load_articles(self):
        with open(cache_dir + self.paper_name + ".pkl", 'rb') as f:
            cache_obj = pickle.load(f)
        return cache_obj
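    # load_articles has no writer counterpart in this excerpt; a minimal
    # symmetric sketch (hypothetical) for populating the pickle cache:
    def save_articles(self, cache_obj):
        with open(cache_dir + self.paper_name + ".pkl", 'wb') as f:
            pickle.dump(cache_obj, f)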