def load_memories(text_list, n):
    if os.path.exists('/tmp/memories.pickle'):
        with open('/tmp/memories.pickle', 'rb') as f:
            memory_lookup = pickle.load(f)
    else:
        memory_lookup = {}

    memory_size = len(memory_lookup)
    if memory_size == 0:
        # If everything is missing, then parallelize for speed
        sc = create_spark_context()
        memories = sc.parallelize(text_list, 256).map(lambda t: search(t, n=n)).collect()
        for text, mem in zip(text_list, memories):
            memory_lookup[text] = mem
    else:
        # If only some things are missing, use the cache and query what is missing
        memories = []
        for text in text_list:
            if text in memory_lookup:
                memories.append(memory_lookup[text])
            else:
                mem = search(text, n=n)
                memories.append(mem)
                memory_lookup[text] = mem

    if memory_size != len(memory_lookup):
        with open('/tmp/memories.pickle', 'wb') as f:
            pickle.dump(memory_lookup, f)

    return memories

def guess(self, questions: List[QuestionText], max_n_guesses: Optional[int]):
    log.info('Predicting the instance_of attribute for guesses...')
    class_with_probability = self.test_instance_of(questions)
    n_cores = conf['guessers']['ESWikidata']['n_cores']
    sc = create_spark_context(configs=[
        ('spark.executor.cores', n_cores),
        ('spark.executor.memory', '20g')
    ])

    def ir_search(query_class_and_prob):
        query, class_and_prob = query_class_and_prob
        p_class, prob = class_and_prob
        return es_index.search(
            query, p_class, prob, self.confidence_threshold,
            normalize_score_by_length=self.normalize_score_by_length
        )[:max_n_guesses]

    spark_input = list(zip(questions, class_with_probability))
    log.info('Filtering when classification probability > {}'.format(self.confidence_threshold))

    return sc.parallelize(spark_input, 32 * n_cores).map(ir_search).collect()

def create_wikipedia_cache(parsed_wiki_path='data/external/wikipedia/parsed-wiki', output_path=WIKI_LOOKUP_PATH):
    from qanta.spark import create_spark_context

    sc = create_spark_context()
    db = QuestionDatabase()
    questions = list(db.all_questions().values())
    train_questions = [q for q in questions if q.fold == 'guesstrain' or q.fold == 'buzzertrain']
    answers = {q.page for q in train_questions}
    b_answers = sc.broadcast(answers)
    # Paths used in Spark need to be absolute, and the directory must exist
    page_path = os.path.abspath(parsed_wiki_path)
    page_pattern = os.path.join(page_path, '*', '*')

    def parse_page(json_text):
        page = json.loads(json_text)
        return {
            'id': int(page['id']),
            'title': page['title'].replace(' ', '_'),
            'text': page['text'],
            'url': page['url']
        }

    wiki_pages = sc.textFile(page_pattern).map(parse_page).filter(lambda p: p['title'] in b_answers.value).collect()
    wiki_lookup = {p['title']: p for p in wiki_pages}
    with open(output_path, 'w') as f:
        json.dump(wiki_lookup, f)

    return wiki_lookup

def create_wikipedia_cache(parsed_wiki_path='data/external/wikipedia/parsed-wiki', output_path=WIKI_LOOKUP_PATH):
    from qanta.spark import create_spark_context

    sc = create_spark_context()
    db = QantaDatabase()
    train_questions = db.train_questions
    answers = {q.page for q in train_questions}
    b_answers = sc.broadcast(answers)
    # Paths used in Spark need to be absolute, and the directory must exist
    page_path = os.path.abspath(parsed_wiki_path)
    page_pattern = os.path.join(page_path, '*', '*')

    def parse_page(json_text):
        page = json.loads(json_text)
        return {
            'id': int(page['id']),
            'title': page['title'].replace(' ', '_'),
            'text': page['text'],
            'url': page['url']
        }

    wiki_pages = sc.textFile(page_pattern).map(parse_page).filter(lambda p: p['title'] in b_answers.value).collect()
    wiki_lookup = {p['title']: p for p in wiki_pages}
    with open(output_path, 'w') as f:
        json.dump(wiki_lookup, f)

    return wiki_lookup

def create_wikipedia_cache( parsed_wiki_path="data/external/wikipedia/parsed-wiki", output_path=WIKI_LOOKUP_PATH): from qanta.spark import create_spark_context sc = create_spark_context() db = QantaDatabase() train_questions = db.train_questions answers = {q.page for q in train_questions} b_answers = sc.broadcast(answers) # Paths used in spark need to be absolute and it needs to exist page_path = os.path.abspath(parsed_wiki_path) page_pattern = os.path.join(page_path, "*", "*") def parse_page(json_text): page = json.loads(json_text) return { "id": int(page["id"]), "title": page["title"].replace(" ", "_"), "text": page["text"], "url": page["url"], } wiki_pages = (sc.textFile(page_pattern).map(parse_page).filter( lambda p: p["title"] in b_answers.value).collect()) wiki_lookup = {p["title"]: p for p in wiki_pages} with open(output_path, "w") as f: json.dump(wiki_lookup, f) return wiki_lookup
def add_sentences_(questions, parallel=True):
    text_questions = [q['text'] for q in questions]
    sc = create_spark_context()
    if parallel:
        sentence_tokenizations = sc.parallelize(text_questions, 4000).map(nlp).collect()
    else:
        sentence_tokenizations = [nlp(q) for q in text_questions]

    for q, text, tokenization in zip(questions, text_questions, sentence_tokenizations):
        q['tokenizations'] = tokenization
        # Get the 0th sentence, end character tokenization (tuple position 1)
        q['first_sentence'] = text[:tokenization[0][1]]

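# Minimal sketch of the sentence-span contract that add_sentences_ assumes: `nlp` returns a
# list of (start_char, end_char) tuples, one per sentence, so text[:spans[0][1]] recovers the
# first sentence. `nlp_stub` is a hypothetical stand-in for the real `nlp` tokenizer, shown
# only to illustrate the indexing used for q['first_sentence'].
def nlp_stub(text):
    spans = []
    start = 0
    for i, ch in enumerate(text):
        if ch == '.':
            spans.append((start, i + 1))
            start = i + 2
    return spans

example_text = "First sentence. Second sentence."
example_spans = nlp_stub(example_text)
assert example_text[:example_spans[0][1]] == "First sentence."
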
def guess(self, questions: List[QuestionText], max_n_guesses: Optional[int]):
    n_cores = conf['guessers']['ElasticSearch']['n_cores']
    sc = create_spark_context(configs=[
        ('spark.executor.cores', n_cores),
        ('spark.executor.memory', '40g')
    ])
    b_is_human_model = sc.broadcast(self.is_human_model)

    def ir_search(query):
        is_human_model = b_is_human_model.value
        is_human_probability = is_human_model.predict_proba([query])[0][1]
        return es_index.search(query, is_human_probability)[:max_n_guesses]

    return sc.parallelize(questions, 4 * n_cores).map(ir_search).collect()

def create_memory_index():
    dataset = QuizBowlDataset(guesser_train=True)
    training_data = dataset.training_data()
    answers = set(training_data[1])
    cw = CachedWikipedia()
    try:
        Index('mem').delete()
    except Exception:
        # The 'mem' index may not exist yet; ignore the failure and rebuild it below
        pass
    Answer.init()
    all_wiki_pages = [cw[page] for page in answers]
    wiki_pages = [p for p in all_wiki_pages if p.content != '']
    sc = create_spark_context()
    sc.parallelize(wiki_pages, 1000).foreach(index_page)

def guess(self, questions: List[QuestionText], max_n_guesses: Optional[int]):
    def es_search(query):
        return self.index.search(
            query, max_n_guesses,
            normalize_score_by_length=self.normalize_score_by_length,
            wiki_boost=self.wiki_boost,
            qb_boost=self.qb_boost
        )

    if len(questions) > 1:
        sc = create_spark_context(configs=[
            ('spark.executor.cores', self.n_cores),
            ('spark.executor.memory', '20g')
        ])
        return sc.parallelize(questions, 16 * self.n_cores).map(es_search).collect()
    elif len(questions) == 1:
        return [es_search(questions[0])]
    else:
        return []