def gen_posting_list(self, offset, size, idf):
    """
    Generate a memory based posting list from a file

    :param offset: start of the posting list in the file
    :param size: size of the posting list
    :param idf: idf of the token
    :return: a memory based posting list with its size
    """
    posting_list = PostingList()
    pl_size = 0
    with open(self.__pl_file, "rb") as file:
        SC.last_query().add_mem_access()
        file.seek(offset)
        if self.__use_vbytes:
            # Variable-byte mode: decode the whole block into a flat list
            # of integers alternating doc_id and term frequency
            bytes_read = file.read(size)
            numbers = VariableByte.decoding(bytes_read)
            for i in range(0, len(numbers), 2):
                doc_id = numbers[i]
                score = idf * (1 + log10(numbers[i + 1]))
                posting_list.add_document(doc_id, score)
                pl_size += 1
        else:
            # Fixed-width mode: each posting is two big-endian 4-byte ints
            read = 0
            while read < size:
                pl_size += 1
                doc_id = int.from_bytes(file.read(4), byteorder='big')
                score = idf * (1 + log10(
                    int.from_bytes(file.read(4), byteorder='big')))
                posting_list.add_document(doc_id, score)
                read += 4 + 4
    return posting_list, pl_size
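
# A minimal, self-contained sketch of classic variable-byte compression,
# assuming the repo's VariableByte class follows the standard scheme
# (7 payload bits per byte, high bit marking the last byte of a number).
# The real class is not shown in this excerpt, so these helper names and
# the exact bit convention are illustrative only.
def vb_encode_number(n):
    # Split n into 7-bit groups, most significant group first
    parts = []
    while True:
        parts.insert(0, n % 128)
        if n < 128:
            break
        n //= 128
    parts[-1] += 128  # flag the terminating byte
    return bytes(parts)


def vb_decode(data):
    # Rebuild the flat list of integers (here: interleaved doc_id, frequency)
    numbers = []
    current = 0
    for byte in data:
        if byte < 128:
            current = current * 128 + byte
        else:
            numbers.append(current * 128 + (byte - 128))
            current = 0
    return numbers


# One posting (doc_id=5, frequency=300) round-trips to [5, 300]
assert vb_decode(vb_encode_number(5) + vb_encode_number(300)) == [5, 300]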
def ordered_access(self):
    """
    Allows access to each element of the list, ordered by score

    :return: a tuple (document_id, score)
    """
    for elem in self.ord_elems:
        if SC.last_query() is not None:
            SC.last_query().add_pl_access()
        yield elem
def alpha_access(self):
    """
    Access documents in alphabetical order

    :return: tuple (doc_id, score)
    """
    for (key, val) in self.rand_elems.items():
        if SC.last_query() is not None:
            SC.last_query().add_pl_access()
        yield (key, val)
def document_score(self, document_id):
    """
    Return the score of a document in log time

    :param document_id: the document
    :return: the score of the document
    """
    if SC.last_query() is not None:
        SC.last_query().add_pl_access()
    if document_id in self.rand_elems:
        return self.rand_elems[document_id]
    return 0
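
# Hypothetical usage of the three access paths above. PostingList's module
# path is not shown in this excerpt, and add_document (seen in
# gen_posting_list) is assumed to keep ord_elems ordered by score and
# rand_elems keyed by doc_id.
pl = PostingList()
pl.add_document(12, 0.82)
pl.add_document(3, 1.47)
for doc_id, score in pl.ordered_access():  # iterated by score
    print(doc_id, score)
for doc_id, score in pl.alpha_access():    # iterated by doc_id
    print(doc_id, score)
print(pl.document_score(3))                # 1.47, found in log time
print(pl.document_score(99))               # 0 for an absent document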
def query(self, query="", algorithm="NAIVE", number_of_results=5):
    """
    Query the inverted file for documents

    :param query: the query
    :param algorithm: the name of the algorithm to use (a key in ALGORITHMS)
    :param number_of_results: the number of results expected
    :return: an array of arrays, each containing
             [doc_id, score, path to the file containing the document]
    """
    SC.new_query(query)
    self.current_status = "Querying - Using {} algorithm".format(algorithm)
    documents = self.ALGORITHMS[algorithm]().execute(
        query, self.inv_file, number_of_results)
    SC.last_query().stop()
    if documents is not None:
        SC.last_query().log(algorithm, number_of_results, len(documents))
    else:
        SC.last_query().log(algorithm, number_of_results, 0)
    results = []
    if documents is not None:
        for document in documents:
            results.append([
                document[0], document[1],
                self.__id_to_filename[document[0]]
            ])
    self.current_status = "Querying - Finished"
    return results
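
# Illustrative query call, assuming an index has already been built with
# indexing() below; the file path and the term "hurricane" are arbitrary
# examples, and "NAIVE" is the default key of the ALGORITHMS registry,
# which is outside this excerpt.
exe = Executable()
exe.indexing(["data/la010189"])  # hypothetical corpus file
for doc_id, score, path in exe.query("hurricane", "NAIVE", 3):
    print("{:8.5f}  {:8}  {}".format(score, doc_id, path))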
def on_indexation_complete(self):
    print("Indexation complete!")
    # When indexation is finished, get the vocabulary from the inverted
    # file to feed the search entry completion
    # Possibly handle an exception here if the vocabulary does not exist
    vocabulary = self.backend.inv_file.get_terms()
    liststore = Gtk.ListStore(str)
    for s in vocabulary:
        liststore.append([s])
    completion = Gtk.EntryCompletion()
    completion.set_model(liststore)
    completion.set_text_column(0)
    entry = self.builder.get_object("search_entry")
    entry.set_completion(completion)
    loading_box = self.builder.get_object("loading_box")
    indexation_statistics_box = self.builder.get_object(
        "indexation_statistics_box")
    query_box = self.builder.get_object("query_box")
    start_indexation_button = self.builder.get_object(
        "start_indexation_button")
    loading_box.set_visible(False)
    indexation_stats = StatsControl.last_indexing()
    indexation_start_time_tofill = self.builder.get_object(
        "indexation_start_time_tofill")
    indexation_start_time_tofill.set_text("{:%H:%M:%S.%f}".format(
        indexation_stats.start_time))
    indexation_end_time_tofill = self.builder.get_object(
        "indexation_end_time_tofill")
    indexation_end_time_tofill.set_text("{:%H:%M:%S.%f}".format(
        indexation_stats.finish_time))
    indexation_total_time_tofill = self.builder.get_object(
        "indexation_total_time_tofill")
    indexation_total_time_tofill.set_text("{}".format(
        indexation_stats.total_time))
    indexation_file_size_tofill = self.builder.get_object(
        "indexation_file_size_tofill")
    indexation_file_size_tofill.set_text(str(indexation_stats.file_size))
    indexation_statistics_box.set_visible(True)
    query_box.set_visible(True)
    start_indexation_button.set_sensitive(True)
def indexing(self,
             files,
             ignore_case=True,
             ignore_stop_words=True,
             stemming=True,
             use_weights=True,
             title_weight=5,
             date_weight=2,
             memory_limit=50,
             use_vbytes=True):
    """
    Launch the indexing of a list of files

    :param files: the paths to the files to index
    :param ignore_case: should case be ignored in the indexing?
    :param ignore_stop_words: should stop words be ignored?
    :param stemming: should we stem the tokens?
    :param use_weights: should we weight words according to their position
                        in the document?
    :param title_weight: weight for words in the title
    :param date_weight: weight for words in the date
    :param memory_limit: limit on the memory before a flush to a temp file
    :param use_vbytes: use variable bytes for the final posting list?
    :return: when the indexing is finished
    """
    SC.new_indexing()
    self.current_status = "Indexing - Starting"
    self.__id_to_filename = SortedDict()
    self.inv_file = InvertedFile(use_vbytes, memory_limit)
    for file in files:
        self.current_status = "Indexing - {}".format(file)
        file_docs = Reader.read_file(file, ignore_case, ignore_stop_words,
                                     stemming, use_weights, title_weight,
                                     date_weight)
        for doc in file_docs:
            self.__id_to_filename[int(doc.doc_id())] = file
            self.inv_file.add_document(doc)
    self.current_status = "Indexing - Making the inverted file"
    self.inv_file.gen_pl_file()
    self.current_status = "Indexing - Saving to pickle file"
    with open(self.PICKLES[0], "wb") as file:
        pickle.dump(self.inv_file, file)
    with open(self.PICKLES[1], "wb") as file:
        pickle.dump(self.__id_to_filename, file)
    self.current_status = "Indexing - Finished - You can query"
    SC.last_indexing().stop()
    SC.last_indexing().log(files, ignore_case, ignore_stop_words, stemming,
                           use_weights, title_weight, date_weight,
                           memory_limit, use_vbytes)
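
# Illustrative call with non-default flags; the file paths are hypothetical
# and the exact effect of each flag is defined by Reader.read_file, which
# is outside this excerpt.
exe = Executable()
exe.indexing(["data/la010189", "data/la010289"],
             ignore_stop_words=False,
             title_weight=10,
             memory_limit=100,
             use_vbytes=False)
print(SC.last_indexing())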
def on_query_complete(self, results):
    print("Query complete!")
    query_stats = StatsControl.last_query()
    loading_box = self.builder.get_object("loading_box")
    loading_box.set_visible(False)
    start_time_tofill = self.builder.get_object("start_time_tofill")
    start_time_tofill.set_text("{:%H:%M:%S.%f}".format(
        query_stats.start_time))
    end_time_tofill = self.builder.get_object("end_time_tofill")
    end_time_tofill.set_text("{:%H:%M:%S.%f}".format(
        query_stats.finish_time))
    total_time_tofill = self.builder.get_object("total_time_tofill")
    total_time_tofill.set_text("{}".format(query_stats.total_time))
    pl_accesses_tofill = self.builder.get_object("pl_accesses_tofill")
    pl_accesses_tofill.set_text(str(query_stats.pl_accesses))
    disk_accesses_tofill = self.builder.get_object("disk_accesses_tofill")
    disk_accesses_tofill.set_text(str(query_stats.memory_accesses))
    results_text = "\t Score |\tDOCID |\t File path \n"
    for result in results:
        results_text += ("\t{:8.5f} |\t{:8} |\t{}".format(
            result[1], result[0], result[2])) + "\n"
    print("results\n" + results_text)
    results_textview = self.builder.get_object("results_textview")
    results_textview_buffer = results_textview.get_buffer()
    results_textview_buffer.set_text(results_text)
    results_box = self.builder.get_object("results_box")
    results_box.set_visible(True)
    start_query_button = self.builder.get_object("start_query_button")
    start_query_button.set_sensitive(True)
    return filelist


file_paths = get_filelist_from_folderpath("latests")
exe = Executable()
algorithm = DEFAULT_ALGORITHM
number_of_results = DEFAULT_NUMBER_OF_RESULTS
memorylimit = 200
exe.indexing(file_paths, memory_limit=memorylimit)
print(SC.last_indexing())
try:
    in_res = int(input("Number of results desired? ").strip())
    number_of_results = in_res
except ValueError:
    print("Non-int value entered, using default {}".format(
        DEFAULT_NUMBER_OF_RESULTS))
print("Algorithm descriptions:")
for (name, desc) in ALGORITHMS_DESC.items():
    print("{}\t- {}".format(name, desc))
in_alg = input("Choose your algorithm: ").strip().upper()
if in_alg not in ALGORITHMS_DESC:
    algorithm = DEFAULT_ALGORITHM
def gen_pl_file(self):
    # Flush the remaining in-memory vocabulary to one last temp file
    self._dump(self.tmp_path, self.tmp_voc)
    del self.tmp_voc
    self.tmp_files_path.append(self.tmp_path)
    tmp_files = []
    tmp_used = []
    for path in self.tmp_files_path:
        tmp_used.append(False)
        tmp_files.append(open(path, "r"))
    tmp_lines = []
    for file in tmp_files:
        tmp_lines.append(file.readline())
    self.vocabulary_of_term = SortedDict()
    self.vectors_of_term = SortedDict()
    offset = 0
    while True:
        # Advance the files whose current line was consumed, then find
        # the smallest term among the current lines of all temp files
        min_term = ''
        min_lists = []
        for i in range(len(tmp_files)):
            if tmp_used[i] and tmp_lines[i] != '':
                tmp_lines[i] = tmp_files[i].readline()
                tmp_used[i] = tmp_lines[i] == ''
            if tmp_lines[i] != '':
                term = tmp_lines[i].split('\t')[0]
                if min_term == '' or term < min_term:
                    min_term = term
                    min_lists = [i]
                elif term == min_term:
                    min_lists.append(i)
        if min_term == '':
            break
        # Concatenate the partial posting lists of the smallest term
        pl_size = 0
        pl_string = ""
        for i in min_lists:
            split = tmp_lines[i].split('\t')
            pl_string = "{}{}".format(pl_string, split[2].replace("\n", ","))
            tmp_used[i] = True
        freq = 0
        term_rdm_index = [0] * RandomIndex.get_n()
        with open(self.__postinglist_file_path, "ab") as file:
            # Values alternate between doc_id and term frequency
            if_doc_id = True
            for val in pl_string.split(","):
                if val != '':
                    if if_doc_id:
                        # Accumulate the random-indexing vector of the doc
                        term_rdm_index = list(
                            map(lambda x, y: x + y, term_rdm_index,
                                self.doc_id_vectors_list[val]))
                    freq += 1
                    if self.use_vbytes:
                        bytes_val = VariableByte.encoding_number(int(val))
                    else:
                        bytes_val = int(val).to_bytes(4,
                                                      byteorder='big',
                                                      signed=False)
                    pl_size += file.write(bytes_val)
                    if_doc_id = not if_doc_id
        self.vectors_of_term[min_term] = term_rdm_index
        # freq counted both doc ids and frequencies, so freq / 2 is the
        # document frequency of the term
        idf = log10(self.nb_docs / (1 + (freq / 2)))
        self.vocabulary_of_term[min_term] = (offset, pl_size, idf)
        offset += pl_size
    SC.last_indexing().add_pl_size(offset)
    for file in tmp_files:
        file.close()
    for file_path in self.tmp_files_path:
        os.remove(file_path)
    self.__postinglist_gen = FileToPostingLists(
        self.__postinglist_file_path, self.use_vbytes)
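
# Worked example of the scoring arithmetic used above, assuming freq / 2 is
# the document frequency (see the comment in gen_pl_file) and that
# gen_posting_list applies score = idf * (1 + log10(tf)).
from math import log10

nb_docs = 1000
document_frequency = 9                            # term appears in 9 documents
idf = log10(nb_docs / (1 + document_frequency))   # log10(100) = 2.0
tf = 10                                           # 10 occurrences in one doc
score = idf * (1 + log10(tf))                     # 2.0 * (1 + 1.0) = 4.0
assert (idf, score) == (2.0, 4.0)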