def run(self):
    """Consume pages from ``self.queue`` and index them shard by shard.

    The first item is fetched with a blocking ``get()``; afterwards the
    queue is polled with ``get_nowait()``.  An ``int`` taken from the queue
    is treated as the "no more input" marker: from then on the queue is no
    longer read and the loop ends as soon as ``self._getPage()`` returns a
    falsy page.

    Every page returned by ``self._getPage()`` is parsed with
    ``WikiParser.createPageIndex`` and merged into the index returned with
    it.  When ``page.final`` is set, that index is written out with
    ``writeIntermediateIndex()`` and its shard is removed from
    ``self._index_dict``.  Whatever is still in ``self._index_dict`` when
    the loop ends has its words stored in ``self._word_set`` and is written
    out as well.

    When the module-level ``debug`` flag is truthy, average parse ("CAVG")
    and index ("IAVG") times are printed each time a shard is finalised.
    """
    page = self.queue.get()
    self._index_dict = {}
    wiki_parser = parser.WikiParser()
    close_queue = False
    # Accumulated parse time, accumulated index time, pages counted
    # (only maintained when ``debug`` is on).
    ct, it, c = 0, 0, 0

    while True:
        if isinstance(page, int):
            # An int on the queue is the end-of-input marker.
            close_queue = True
        elif page:
            # NOTE(review): once the marker has been seen, ``page`` is no
            # longer refreshed from the queue, so the page just returned by
            # _getPage() reaches this branch again on the next iteration.
            # The original indentation was lost, so confirm this is
            # intended.
            self._insertPage(page)

        page, index = self._getPage()
        if page:
            try:
                if debug:
                    start = time.time()
                    page_index = wiki_parser.createPageIndex(page)
                    ct += time.time() - start
                    start = time.time()
                    index.addPageIndex(page_index)
                    it += time.time() - start
                    c += 1
                else:
                    page_index = wiki_parser.createPageIndex(page)
                    index.addPageIndex(page_index)

                if page.final:
                    if debug:
                        start = time.time()
                        index.writeIntermediateIndex()
                        it += time.time() - start
                        print("CAVG", ct/c, "IAVG", it/c)
                        ct, it, c = 0, 0, 0
                    else:
                        index.writeIntermediateIndex()
                    print("Final", page.id, page.shard_no)
                    self._index_dict.pop(page.shard_no)
                    # Drop the local reference to the finished index.
                    index = None
            except Exception:
                # Best effort: a page that fails to parse or index is
                # skipped so that it cannot stop the worker.
                pass
        elif close_queue:
            break

        if not close_queue:
            try:
                page = self.queue.get_nowait()
            except Exception:
                # Nothing available right now (or the queue failed); keep
                # looping.  ``Exception`` rather than a bare ``except`` so
                # KeyboardInterrupt / SystemExit still propagate.
                page = None

    # Flush every shard that never received its final page.  Each value is
    # indexable and holds the index object at position 1.
    for entry in self._index_dict.values():
        index = entry[1]
        self._word_set[index.shard_no] = index.getWords()
        index.writeIntermediateIndex()
def __init__(self, process_count, index_loc = "./"):
    """Prepare the index directory and the initial creator state.

    Args:
        process_count: Accepted for interface compatibility; not used by
            this constructor (the worker-process start-up that would
            consume it is disabled).
        index_loc: Directory that will hold the index.  It is created,
            including any missing parent directories, when nothing exists
            at that path yet.
    """
    if not os.path.exists(index_loc):
        # makedirs copes with nested paths, and exist_ok avoids a crash if
        # the directory appears between the check and the creation.
        os.makedirs(index_loc, exist_ok=True)
    self.index_loc = index_loc
    self.wiki_parser = parser.WikiParser()
    self.shard_count = 0
    self.index = None
def isValid(self):
    """Check that both the start page and the end page exist.

    Each title is looked up with ``WikiParser.pageExists``; a message is
    printed for every title that is missing.

    Returns:
        True when both pages exist, False otherwise.
    """
    checker = parser.WikiParser()
    all_found = True
    # Start is checked before end; both are always checked so that every
    # missing page gets reported.
    for title in (self.start, self.end):
        if checker.pageExists(title):
            continue
        print("the page " + title + " doesn't exist")
        all_found = False
    return all_found
def randomPlayer(self):
    """Walk from ``self.start`` towards ``self.end`` by following random links.

    The instance is re-initialised with its own start and end first.  On
    every step the links of the current page are fetched; if the goal is
    among them it is taken immediately and "I win" is printed, otherwise a
    link is picked uniformly at random.  Every move is appended to
    ``self.visited`` and increments ``self.score``.  The walk stops once
    ``self.score`` reaches ``self.max_iter``, or when the current page has
    no links at all ("I am stuck" is printed in that case).
    """
    self.__init__(self.start, self.end)
    print("start : " + self.start + ", goal : " + self.end)
    wikiParser = parser.WikiParser()
    while self.current != self.end and self.score < self.max_iter:
        page = wikiParser.getPage(self.current)
        next_words = wikiParser.getLinksFromPage(page)
        if self.end in next_words:
            self.current = self.end
            self.visited.append(self.current)
            self.score += 1
            print("I win")
            return
        if not next_words:
            # Dead end: no outgoing links, so there is nothing to pick from
            # (random.randint(0, -1) would raise ValueError here).
            print("I am stuck")
            return
        # select random
        self.current = random.choice(next_words)
        self.visited.append(self.current)
        self.score += 1
def nlpMeanPlayer(self):
    """Walk from ``self.start`` towards ``self.end`` by semantic similarity.

    The instance is re-initialised with its own start and end first.  On
    every step the links of the current page are fetched and the ones
    already in ``self.visited`` are discarded.  If the goal is among the
    remaining links it is taken immediately; otherwise the link with the
    highest ``spacyTools.getMeanSimilarity`` to the goal is followed (only
    strictly positive similarities qualify).  Every move is appended to
    ``self.visited`` and increments ``self.score``.

    The method returns early after printing a message when the goal is
    found, when no unvisited link is left, or when no link qualifies.  If
    the loop ends without any of these, "I loose" is printed.
    """
    self.__init__(self.start, self.end)
    print("Start : " + self.start + ", Goal : " + self.end)
    wikiParser = parser.WikiParser()
    while self.current != self.end and self.score < self.max_iter:
        page = wikiParser.getPage(self.current)
        links = wikiParser.getLinksFromPage(page)

        # filter to prevent looping.  A new list is built instead of
        # removing items from the list being iterated, which skipped
        # elements and mutated the parser's return value.
        visited = set(self.visited)
        next_words = [word for word in links if word not in visited]
        if not next_words:
            print("I am stuck")
            return

        # The goal test does not depend on the candidate word, so it is
        # done once per page rather than once per link.
        if self.end in next_words:
            self.current = self.end
            self.visited.append(self.current)
            self.score += 1
            print("Found page :", self.end)
            return

        # select most similar
        max_similarity = 0
        max_word = ""
        with warnings.catch_warnings():
            # Warnings raised during the similarity computation are
            # silenced.
            warnings.simplefilter("ignore")
            for word in next_words:
                similarity = spacyTools.getMeanSimilarity(word, self.end)
                if similarity > max_similarity:
                    max_similarity = similarity
                    max_word = word

        if max_word == "":
            print("I am stuck in a page without unvisited links")
            return
        self.visited.append(max_word)
        self.current = max_word
        self.score += 1
    print("I loose")