def add_new_doc(self, document, num_of_tweets):
    """Index one document into the in-memory posting buffer.

    Postings are accumulated in ``self.temp_posting_dict`` until the buffer
    holds 500000 distinct terms, at which point it is flushed to disk as a
    term-sorted chunk file (``posting<N>.txt``).  When the last tweet is
    reached, the remaining buffer and the corpus-wide ``term_dict`` are
    flushed, all chunk files are pairwise-merged into one, and the inverted
    index is built.  Per-tweet term lists are also buffered (and spilled to
    ``LDA.txt``) for later LDA ranking.

    :param document: document object to index; its ``term_doc_dictionary``
        maps term -> payload where payload[0] and payload[1] are written into
        the posting (exact payload shape comes from the parser —
        TODO confirm).
    :param num_of_tweets: total number of tweets in the corpus; used to
        detect when this call processes the last document.
    :return: None
    """

    def _flush_posting_chunk():
        # Dump the current posting buffer to a sorted chunk file named
        # posting<file_counter>.txt and reset the buffer.
        self.copy_posting_dict = copy.deepcopy(self.temp_posting_dict)
        self.temp_posting_dict.clear()
        # Chunks must be term-sorted so they can be merged later.
        self.sorted_posting_dict = collections.OrderedDict(
            sorted(self.copy_posting_dict.items()))
        self.copy_posting_dict.clear()
        file_name = 'posting' + str(self.file_counter) + '.txt'
        with open(self.path + file_name, 'w', encoding='utf-8') as fp:
            for term, postings in self.sorted_posting_dict.items():
                for posting in postings:
                    self.writen_terms += 1
                    # str(...)[1:-1] strips the surrounding brackets of the
                    # payload's repr — presumably a list; TODO confirm.
                    fp.write(term + ":" + str(posting[0]) + "-"
                             + str(posting[1]) + "-"
                             + str(posting[2])[1:-1] + "\n")
        self.sorted_posting_dict.clear()
        self.file_name_list.append(file_name)
        self.file_counter += 1

    def _append_lda_corpus():
        # Append the buffered per-tweet term lists to LDA.txt, one line per
        # tweet, then clear the buffer.
        with open('LDA.txt', 'a', encoding='utf-8') as fp:
            for term_list in self.LDA_list:
                # Keeps the original trailing space before the newline so the
                # split(" ") reader below sees identical tokens.
                fp.write("".join(t + " " for t in term_list) + "\n")
        self.LDA_list.clear()

    self.cur_num_of_tweets += 1
    document_dictionary = document.term_doc_dictionary

    # Per-tweet tf-idf statistics: (max_tf, distinct_words, tweet_length).
    tweet_id = document.tweet_id
    self.tf_idf_dict[tweet_id] = [
        (document.max_tf, document.distinct_words, document.doc_length)]

    term_list_to_LDA = []
    if len(self.temp_posting_dict) < 500000 and document.doc_length != -1:
        # Buffer has room: accumulate this document's postings in memory.
        for term in document_dictionary:
            try:
                # setdefault replaces the original duplicated if/else append.
                self.temp_posting_dict.setdefault(term, []).append([
                    document.tweet_id,
                    document_dictionary[term][0],
                    document_dictionary[term][1]
                ])
            except (KeyError, IndexError, TypeError):
                # Bug fix: the original printed term[0] (only the first
                # character); report the whole offending term.
                print('problem with the following key {}'.format(term))
            term_list_to_LDA.append(term)
        self.LDA_list.append(term_list_to_LDA)
        # Remember which LDA.txt line belongs to this tweet.
        self.tweet_line_dict[document.tweet_id] = self.line_number
        self.line_number += 1
    else:
        # Buffer is full (500000 distinct terms): spill it to disk.
        if document.doc_length != -1:
            _flush_posting_chunk()
            _append_lda_corpus()

    if self.cur_num_of_tweets == num_of_tweets and len(
            self.temp_posting_dict) > 0:
        # Last tweet: flush whatever is still buffered.
        _flush_posting_chunk()
        _append_lda_corpus()

    time_to_merge = False
    if self.cur_num_of_tweets == num_of_tweets:
        # Write the corpus-wide term_dict as one final sorted chunk file.
        self.sorted_term_dict = collections.OrderedDict(
            sorted(document.term_dict.items()))
        file_name = 'posting' + str(self.file_counter) + '.txt'
        with open(self.path + file_name, 'w', encoding='utf-8') as fp:
            for term, postings in self.sorted_term_dict.items():
                if len(postings) > 1:  # keep terms seen in more than one tweet
                    for posting in postings:
                        self.writen_terms += 1
                        fp.write(term + ":" + str(posting[0]) + "-"
                                 + str(posting[1]) + "-100\n")
        self.file_name_list.append(file_name)
        self.sorted_term_dict.clear()
        self.file_counter += 1
        time_to_merge = True

    if time_to_merge:
        # Pairwise-merge all chunk files down to a single posting file, then
        # build the inverted index from it.
        while len(self.file_name_list) > 1:
            first, second = self.file_name_list[0], self.file_name_list[1]
            self.merge_sorted_files(first, second)
            os.remove(self.path + second)
            os.remove(self.path + first)
            self.file_name_list.remove(second)
            self.file_name_list.remove(first)
        self.create_inverted_index(self.file_name_list[0])

    if self.finished_inverted:
        # Restore casing for terms that were capitalized everywhere in the
        # corpus: move the lower-cased index entry under the cased key.
        to_stem = self.config.get__toStem()
        for term in document.capital_letter_dict:
            if document.capital_letter_dict[term]:
                if not to_stem:
                    if term.lower() in self.inverted_idx:
                        self.inverted_idx[term] = self.inverted_idx[
                            term.lower()]
                        del self.inverted_idx[term.lower()]
                else:
                    stem_term = Stemmer().stem_term(term)
                    # NOTE(review): the nesting below was reconstructed from
                    # whitespace-mangled source — confirm against history.
                    if term.lower() != stem_term:
                        if stem_term.lower() in self.inverted_idx:
                            self.inverted_idx[stem_term.upper()] = \
                                self.inverted_idx[stem_term.lower()]
                    else:
                        if stem_term.lower() in self.inverted_idx:
                            self.inverted_idx[stem_term.upper()] = \
                                self.inverted_idx[stem_term.lower()]
                            del self.inverted_idx[term.lower()]

    if self.finished_inverted:
        # Reload the LDA corpus spilled to disk, one term list per line.
        with open('LDA.txt', buffering=2000000, encoding='utf-8') as f:
            for line in f:
                self.LDA_list.append(line.split(" "))
        os.remove('LDA.txt')
        # Add terms occurring more than once in a tweet to that tweet's list.
        for term in document.term_dict:
            for posting in document.term_dict[term]:
                if posting[1] > 1:
                    t_id = posting[0]
                    if t_id in self.tweet_line_dict:
                        self.LDA_list[self.tweet_line_dict[t_id]].append(term)
        document.term_dict.clear()
        # Hand the corpus to the LDA ranker.
        self.lda = LDA_ranker(self.LDA_list)
        self.lda.create_corpus()