예제 #1
0
	def run(self):
		"""Consume pages from ``self.queue`` and add them to per-shard indexes.

		The first item is fetched with a blocking ``get()``; later items are
		polled with ``get_nowait()``. An ``int`` item marks the end of input:
		after it arrives the queue is no longer read, and the loop ends once
		``self._getPage()`` returns no page. Every index still held in
		``self._index_dict`` is then flushed and its words are stored in
		``self._word_set``.

		Errors raised while indexing a single page are suppressed so that one
		bad page does not stop the worker; they are printed only when the
		module-level ``debug`` flag is set.
		"""
		page = self.queue.get()
		self._index_dict = {}
		wiki_parser = parser.WikiParser()

		close_queue = False
		# Debug-only accumulators: parse seconds, index seconds, page count.
		ct, it, c = 0, 0, 0
		while True:
			if isinstance(page, int):
				# An int on the queue is the end-of-input marker.
				close_queue = True
			elif page:
				# NOTE(review): once the queue is closed, `page` still holds the
				# page returned by the previous _getPage() call, so it is handed
				# to _insertPage() again here. Presumably _insertPage/_getPage
				# tolerate that -- confirm against their implementation.
				self._insertPage(page)
			page, index = self._getPage()
			if page:
				try:
					if debug:
						start = time.time()
						page_index = wiki_parser.createPageIndex(page)
						ct += time.time() - start
						start = time.time()
						index.addPageIndex(page_index)
						it += time.time() - start
						c += 1
					else:
						page_index = wiki_parser.createPageIndex(page)
						index.addPageIndex(page_index)

					if page.final:
						# Flush this page's index and drop its shard entry.
						if debug:
							start = time.time()
						index.writeIntermediateIndex()
						if debug:
							it += time.time() - start
							print("CAVG", ct/c, "IAVG", it/c)
							ct, it, c = 0, 0, 0
							print("Final", page.id, page.shard_no)
						self._index_dict.pop(page.shard_no)
						index = None
				except Exception as e:
					# Best effort: skip the failing page, but make the failure
					# visible when debugging instead of hiding it completely.
					if debug:
						print("Error", getattr(page, "id", None), repr(e))
			elif close_queue:
				# Input is finished and _getPage() has nothing left to hand out.
				break

			if not close_queue:
				try:
					page = self.queue.get_nowait()
				except Exception:
					# Nothing available right now (or the read failed); retry on
					# the next pass. Unlike a bare ``except``, this lets
					# KeyboardInterrupt and SystemExit propagate.
					page = None

		# Flush whatever indexes never saw their final page.
		for entry in self._index_dict.values():
			index = entry[1]
			self._word_set[index.shard_no] = index.getWords()
			index.writeIntermediateIndex()
예제 #2
0
	def __init__(self, process_count, index_loc = "./"):
		"""Create the index location if needed and set up the initial state.

		Args:
			process_count: Accepted for callers but not used by this
				constructor (worker-process start-up is disabled here).
			index_loc: Path stored as ``self.index_loc``; created as a
				directory, including missing parents, when nothing exists
				at that path yet.
		"""
		if not os.path.exists(index_loc):
			# makedirs handles nested paths, and exist_ok avoids a crash if the
			# directory appears between the check above and this call.
			os.makedirs(index_loc, exist_ok=True)
		self.index_loc = index_loc
		self.wiki_parser = parser.WikiParser()
		self.shard_count = 0
		self.index = None
예제 #3
0
 def isValid(self):
     """Return True when both ``self.start`` and ``self.end`` pages exist.

     Both titles are always checked, start first, and a message is printed
     for each one that the parser reports as missing.
     """
     checker = parser.WikiParser()
     both_exist = True
     for title in (self.start, self.end):
         if checker.pageExists(title):
             continue
         print("the page " + title + " doesn't exist")
         both_exist = False
     return both_exist
예제 #4
0
 def randomPlayer(self):
     """Walk from the start page toward the end page by following random links.

     The player state is first reset by re-running ``__init__`` with the
     current start and end titles. On each step the links of the current
     page are fetched: if the end page is among them the walk jumps there,
     prints "I win" and returns; otherwise a uniformly random link is
     followed. Every step appends to ``self.visited`` and increments
     ``self.score``. The walk stops silently once ``self.score`` reaches
     ``self.max_iter``.

     A page without any links ends the walk with a message instead of
     raising ``ValueError`` from the random draw.
     """
     self.__init__(self.start, self.end)
     print("start : " + self.start + ", goal : " + self.end)
     wikiParser = parser.WikiParser()
     while self.current != self.end and self.score < self.max_iter:
         page = wikiParser.getPage(self.current)
         next_words = wikiParser.getLinksFromPage(page)
         if self.end in next_words:
             self.current = self.end
             self.visited.append(self.current)
             self.score += 1
             print("I win")
             return
         if len(next_words) == 0:
             # Dead end: there is nothing to choose from.
             print("I am stuck")
             return
         # Select a random outgoing link.
         self.current = random.choice(next_words)
         self.visited.append(self.current)
         self.score += 1
예제 #5
0
    def nlpMeanPlayer(self):
        """Walk toward the end page by following the most similar link.

        The player state is first reset by re-running ``__init__`` with the
        current start and end titles. On each step the links of the current
        page are fetched and already-visited titles are discarded. If the end
        page is among the remaining links the walk jumps there and returns.
        Otherwise the link with the highest
        ``spacyTools.getMeanSimilarity(link, self.end)`` is followed; only a
        similarity strictly greater than 0 qualifies.

        The walk returns early with a message when no usable link remains,
        and prints "I loose" when the loop ends without reaching the end page.
        """
        self.__init__(self.start, self.end)
        print("Start : " + self.start + ", Goal : " + self.end)
        wikiParser = parser.WikiParser()
        while self.current != self.end and self.score < self.max_iter:
            page = wikiParser.getPage(self.current)
            links = wikiParser.getLinksFromPage(page)

            # Filter to prevent looping. A new list is built rather than
            # removing items from the list being iterated, which skipped
            # elements and left visited links behind.
            candidates = [word for word in links if word not in self.visited]
            if len(links) > 0 and not candidates:
                print("I am stuck")
                return

            # The goal test does not depend on the individual link, so it is
            # done once up front.
            if self.end in candidates:
                self.current = self.end
                self.visited.append(self.current)
                self.score += 1
                print("Found page :", self.end)
                return

            # Select the most similar remaining link.
            max_similarity = 0
            max_word = ""
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                for word in candidates:
                    similarity = spacyTools.getMeanSimilarity(word, self.end)
                    if similarity > max_similarity:
                        max_similarity = similarity
                        max_word = word

            if max_word == "":
                print("I am stuck in a page without unvisited links")
                return
            self.visited.append(max_word)
            self.current = max_word
            self.score += 1
        print("I loose")