def start_parsing(title2id_path, replacement_symbol, language, directory, host, cat_name, spliterator):
    """Fetch every page listed in the title index and pass it to both writers.

    The index that is iterated is ``<directory>/<language>.title2id.txt``;
    ``title2id_path`` is only used to build the title-to-id mapping given to
    the writers.  Each non-empty index line is split once on ``spliterator``
    into an id and a title.  The page is requested from
    ``http://<host>/localhost/v3/page/html/<quoted title>`` and its text,
    links, type, categories and sections are written by an
    ``NLPDocumentWriter`` and a ``GraphDataWriter``.

    Progress is appended to ``<directory>/success.txt``; any failure for a
    page is printed and appended to ``<directory>/error.txt`` and processing
    continues with the next line.

    Args:
        title2id_path: Path handed to ``load_title_to_id``.
        replacement_symbol: Forwarded unchanged to both writers.
        language: Key into ``ADAPTERS`` and prefix of the index file name.
        directory: Directory holding the index file and the log files; also
            passed to the writers as ``file2write_path``.
        host: Host part of the page URL.
        cat_name: Forwarded to ``GraphDataWriter``.
        spliterator: Separator between id and title on each index line.
    """
    title2id = load_title_to_id(title2id_path)
    nlp_writer = NLPDocumentWriter(language=language, file2write_path=directory,
                                   title2id=title2id, replacement_symbol=replacement_symbol)
    graphdata_writer = GraphDataWriter(file2write_path=directory, title2id=title2id,
                                       language=language, replacement_symbol=replacement_symbol,
                                       cat_name=cat_name)
    success_path = '{}/success.txt'.format(directory)
    error_path = '{}/error.txt'.format(directory)

    with open('{}/{}.title2id.txt'.format(directory, language), 'r', encoding='utf-8') as r:
        # Stream the index instead of reading it whole into memory.
        for raw_line in r:
            line = raw_line.rstrip('\n')
            if not line:
                # A trailing newline or blank line used to abort the whole run
                # with an unpacking ValueError; there is nothing to fetch here.
                continue
            try:
                id_, title = line.split(spliterator, 1)
            except ValueError as e:
                # Line without the separator: record it and keep going.
                print(e)
                with open(error_path, 'a', encoding='utf-8') as f:
                    f.write(line + " " + str(e) + '\n')
                continue
            try:
                page_name = quote(title)
                # quote() leaves '/' alone by default; it must be escaped so
                # the title stays a single path segment.
                url = 'http://{}/localhost/v3/page/html/{}'.format(host, page_name.replace('/', '%2F'))
                wiki_page = WikiPage(url, ADAPTERS[language])
                text = wiki_page.get_text()
                sections = wiki_page.get_sections()
                links = wiki_page.get_links()
                categories = wiki_page.get_categories()
                wiki_type = wiki_page.get_wiki_page_type()
                wiki_id = wiki_page.get_wiki_id()
                nlp_writer.write(wiki_id, text, links, wiki_type, categories, sections)
                graphdata_writer.write(wiki_id, text, links, wiki_type, categories, sections)
                with open(success_path, 'a', encoding='utf-8') as f:
                    f.write(str(id_)+title+'\n')
            except Exception as e:
                # Best-effort batch job: one bad page must not stop the rest.
                print(e)
                with open(error_path, 'a', encoding='utf-8') as f:
                    f.write(str(id_)+" "+title+" "+str(e)+'\n')
def interrogative(remaining_words):
    """Answer an interrogative sentence given as (word, tag) pairs.

    Leading words whose tag starts with "D" or "V" are removed from
    ``remaining_words`` in place.  The remaining words (except those tagged
    ".") are joined with spaces into a concept, which is looked up with
    ``WikiPage`` and stored in the module-level ``current_concept``.

    Args:
        remaining_words: Mutable list of ``(word, tag)`` pairs.

    Returns:
        The result of ``nested_concept`` when any remaining word is tagged
        "IN"; otherwise the page summary when it is non-empty; otherwise the
        string "I don't know" (also returned when no words remain after the
        leading words are stripped).
    """
    global current_concept

    # Count the leading D*/V* words, bounds-checked so an empty or fully
    # consumed list no longer raises IndexError.
    skip = 0
    while skip < len(remaining_words) and remaining_words[skip][1][:1] in ("D", "V"):
        skip += 1
    # Callers see the stripped list, as they did with the former pop(0) loop.
    del remaining_words[:skip]

    if not remaining_words:
        return "I don't know"

    concept = " ".join(word[0] for word in remaining_words if word[1] != ".")
    # Checked on the pairs directly: going through dict() would drop an "IN"
    # tag when the same word occurs again later with a different tag.
    has_preposition = any(word[1] == "IN" for word in remaining_words)

    current_concept = WikiPage(concept)

    if has_preposition:
        # The concept is a nested concept.
        return nested_concept(remaining_words)
    if len(current_concept.summary) > 0:
        return current_concept.summary
    # TODO: try Urban Dictionary (slang.define_term(concept)) after the wiki
    # lookup fails, returning the first definition if there is one.
    return "I don't know"
def run(self):
    """Build the page for ``self.url`` and hand it to the display queue.

    The page is created as ``WikiPage(spin_yarn(base_url + self.url),
    self.url)`` and enqueued as ``(page, self.parent_id, None)``, waiting at
    most 2 seconds for room in the queue.  A failure to enqueue is reported
    on stdout and otherwise ignored.
    """
    page = WikiPage(spin_yarn(base_url + self.url), self.url)
    try:
        print("Inserting {} ...".format(self.url))
        self.display_que.put((page, self.parent_id, None), block=True, timeout=2)
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must still be able
        # to stop this worker.
        print("Error in inserting {} in queue".format(self.url))
def nested_concept(remaining_words):
    """Search a concept's page for sentences matching a search term.

    After ``remove_extraneous_words`` is applied, the words are split at the
    first word tagged "IN": words before it form the search term, words
    after it form the concept.  Words tagged "IN" belong to neither.

    Args:
        remaining_words: Sequence of ``(word, tag)`` pairs.

    Returns:
        The first four results of ``WikiPage(concept).search(search_term)``
        joined with single spaces.
    """
    remaining_words = remove_extraneous_words(remaining_words)

    in_found = False
    search_words = []
    concept_words = []
    for word in remaining_words:
        if word[1] == "IN":
            in_found = True
        elif in_found:
            concept_words.append(word[0])
        else:
            search_words.append(word[0])

    # Each word is followed by one space (including the last), exactly as the
    # strings were built before, so lookups see the same text.
    concept = "".join(part + " " for part in concept_words)
    search_term = "".join(part + " " for part in search_words)

    # The page is built once; it used to be constructed twice, with the first
    # instance never used.
    matches = WikiPage(concept).search(search_term)
    return " ".join(matches[:4])
def move_forward(self, proposed_move):
    """
    Follow a link from the current wikipage.

    Nothing happens unless proposed_move is one of the current page's links.
    Otherwise the current page is pushed onto wiki_stack, the linked page is
    loaded as the new current page, and score goes up by 1.
    """
    if proposed_move not in self.current_wiki.links:
        return

    self.wiki_stack.append(self.current_wiki)
    target = self.current_wiki.links[proposed_move]
    self.current_wiki = WikiPage(target)
    self.score += 1
def run(self):
    """Enqueue the root page, then start one Worker per outgoing link.

    The root page is created as ``WikiPage(spin_yarn(base_url + self.url),
    self.url)`` and put on the display queue as
    ``(root_page, self.parent_id, self.id)`` with a 2 second timeout; a
    failure to enqueue is reported on stdout and otherwise ignored.

    Workers are then started for at most the first 25 links of the root
    page.  No more than ``self.max_threads`` workers are kept in
    ``self.workers``; when the pool is full the method sleeps for a second
    and calls ``self.clear_workers()`` before retrying.
    """
    max_children = 25

    root_page = WikiPage(spin_yarn(base_url + self.url), self.url)
    try:
        print("Inserting {} ...".format(self.url))
        self.display_que.put((root_page, self.parent_id, self.id), block=True, timeout=2)
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must still be able
        # to stop this thread.
        print("Error in inserting {} in queue".format(self.url))

    # Pages with fewer than 25 links used to raise IndexError here.
    for i in range(min(max_children, len(root_page.links))):
        added = False
        while not added:
            if len(self.workers) < self.max_threads:
                print("RootProcessor spawning new worker")
                new_worker = Worker(root_page.links[i], self.display_que, self.id)
                self.workers.append(new_worker)
                new_worker.start()
                added = True
            else:
                print("Cant insert presently!!")
                print(self.workers)
                time.sleep(1)
                # NOTE(review): presumably drops finished workers so a slot
                # can open up - confirm against clear_workers().
                self.clear_workers()
self.current_wiki.links[proposed_move]) self.score += 1 def move_backward(self): """ Step back to the previous wikipage: when wiki_stack is non-empty, its last entry is popped and becomes current_wiki. Going back costs 0.5 score. """ if len(self.wiki_stack) > 0: self.current_wiki = self.wiki_stack.pop() self.score -= .5 if __name__ == '__main__': GAME_START = raw_input("Enter starting wiki:") GAME_END = raw_input("Enter ending wiki:") game = WikiGame(WikiPage(GAME_START), WikiPage(GAME_END)) while game.current_wiki.links.has_key(GAME_END) != True: print game.score print game.current_wiki.name print "***********************" for link in game.current_wiki.links: print link move = raw_input("Enter next move, or 'back' to pop back") if move == "back": game.move_backward() else: game.move_forward(move) for wiki_page in game.wiki_stack: print wiki_page.name print game.score