import glob
import os

import save_to_disk_util                  # project-local helper (import path assumed)
from message_parser import MessageParser  # project-local module (import path assumed)


class UserParser:
    def __init__(self, parser_output_dir):
        self.message_parser = MessageParser()
        self.parser_output_dir = parser_output_dir

    def parse_user(self, user_directory):
        """Parse every folder of a user's export and save the result as one archive."""
        user_name = os.path.basename(user_directory)
        user_folders = UserParser.__list_folders(user_directory)
        user_contents = {}
        for folder in user_folders:
            current_directory_name = os.path.basename(folder)
            folder_contents = self.__parse_user_folder(folder)
            user_contents[current_directory_name] = folder_contents
        user_dict = {user_name: user_contents}
        save_path = os.path.join(os.getcwd(), self.parser_output_dir, user_name + ".gz")
        save_to_disk_util.save_to_disk(save_path, user_dict)

    def __parse_user_folder(self, user_folder):
        """Parse every file under user_folder (recursively), keyed by subdir/filename."""
        messages = {}
        message_paths = [
            message_path
            for message_path in glob.glob(os.path.join(user_folder, "**"), recursive=True)
            if not os.path.isdir(message_path)
        ]
        for message_path in message_paths:
            message_filename = os.path.basename(message_path)
            try:
                subdirectory_path = UserParser.__get_subdirectory_path(message_path, user_folder)
                message_filename_key = subdirectory_path + "/" + message_filename
                message_contents = self.message_parser.parse_message(message_path, user_folder)
                messages[message_filename_key] = message_contents
            except UnicodeDecodeError:
                print("Could not parse '" + message_path + "'\n")
        return messages

    @staticmethod
    def __get_subdirectory_path(full_path, base_dir):
        # path of the file's directory relative to base_dir ('' for the top level)
        return os.path.dirname(full_path).replace(base_dir, '')

    @staticmethod
    def __list_folders(directory):
        return [
            os.path.join(directory, d)
            for d in os.listdir(directory)
            if os.path.isdir(os.path.join(directory, d))
        ]
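# Usage sketch, not part of the module above: the directory layout and output
# folder name here are assumptions for illustration only.
if __name__ == "__main__":
    parser = UserParser(parser_output_dir="parsed")  # "parsed" is a hypothetical output dir
    # Walks exports/user_a/<folder>/** recursively, parses each file, and writes
    # parsed/user_a.gz containing {"user_a": {folder: {subdir/filename: contents}}}.
    parser.parse_user("exports/user_a")              # "exports/user_a" is a hypothetical path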
import logging
import os

from pymongo import MongoClient, errors

from message_parser import MessageParser  # project-local module (import path assumed)


class TelegramIndexer:
    def __init__(self):
        self.parser = MessageParser()
        self.visited_links = set()   # links that have already been indexed
        self.links_to_visit = set()  # links queued for the next crawling iteration
        self.index = {}              # word -> [total_frequency, (msg_url, freq), ...]
        self.doc_lengths = {}        # msg_url -> number of words
        self.database_empty = True

        # create logger
        if not os.path.exists('../logs'):
            os.makedirs('../logs')
        self.logger = logging.getLogger("urls_extractor")
        self.logger.setLevel(logging.INFO)
        fh = logging.FileHandler("../logs/indexer.log")
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        self.logger.addHandler(fh)

        try:
            client = MongoClient("mongodb://127.0.0.1:27017/")
            client.admin.command('ping')  # force a round-trip so connection failures surface here
            self.logger.info('Connected to MongoDB successfully!')
        except errors.PyMongoError:
            self.logger.error('Could not connect to MongoDB')
            raise  # without a client there is nothing useful left to do
        self.database = client.TelegramIndexerDB  # client.TelegramMusicIndexerDB

    def index_one_url(self, url, messages):
        if url in self.visited_links:
            self.links_to_visit.discard(url)
            self.logger.info(f'Url {url} was already indexed. Moving on to another url')
            return
        if not messages:
            self.links_to_visit.discard(url)
            self.logger.info(f'Message list is empty for url {url}')
            return

        self.logger.info(f'Indexing messages from url {url}')
        for msg_url, msg_text in messages.items():
            words, links = self.parser.parse_message(msg_text)
            self.links_to_visit.update(links)
            self.links_to_visit -= self.visited_links
            self.doc_lengths[msg_url] = len(words)  # not precise, but close enough

            # add words to the in-memory index
            for w, msg_freq in words.items():
                # msg_freq: how many times the word occurs in this particular message
                if w not in self.index:
                    self.index[w] = [msg_freq, (msg_url, msg_freq)]
                else:
                    self.index[w][0] += msg_freq
                    self.index[w].append((msg_url, msg_freq))
        self.visited_links.add(url)
        self.links_to_visit.discard(url)

    def dump_index(self):
        self.logger.info('Index is being dumped to DB')
        if self.database_empty:
            # re-check: the collection may have been filled by a previous dump
            if self.database.Index.find_one() is not None:
                self.database_empty = False

        if self.database_empty:
            self.logger.info(f'Database was empty, writing {len(self.index)} new items')
            for word, postings in self.index.items():
                try:
                    self.database.Index.insert_one(
                        {'key': word, 'frequency': postings[0], 'postings': postings[1:]}
                    )
                except errors.PyMongoError:
                    self.logger.error('Unable to add new items to Index in database')
            for msg_url, doc_len in self.doc_lengths.items():
                try:
                    self.database.DocLengths.insert_one(
                        {'doc_url': msg_url, 'length': doc_len}
                    )
                except errors.PyMongoError:
                    self.logger.error('Unable to add new items to DocLengths in database')
            # the local index has been persisted, so empty it
            self.index = {}
            self.doc_lengths = {}
            return

        # Otherwise, merge the new entries into the existing index
        self.logger.info(f'Updating Index in database with {len(self.index)} new items')
        for word, postings in self.index.items():
            # 1. get the existing index record from the db
            db_index = self.database.Index.find_one({'key': word})
            if db_index is None:
                self.database.Index.insert_one(
                    {'key': word, 'frequency': postings[0], 'postings': postings[1:]}
                )
            else:
                self.logger.info('Changing existing postings')
                # merge the new postings into the stored ones, overwriting duplicates
                db_postings = {u: f for u, f in db_index['postings']}
                for doc_url, doc_freq in postings[1:]:
                    db_postings[doc_url] = doc_freq
                db_postings = [[u, f] for u, f in db_postings.items()]
                frequency = sum(freq for _, freq in db_postings)
                myquery = {'key': word}
                newvalues = {"$set": {'frequency': frequency, 'postings': db_postings}}
                try:
                    self.database.Index.update_one(myquery, newvalues)
                    self.logger.info('Postings changed successfully')
                except errors.PyMongoError:
                    self.logger.error('Postings were not changed, error while writing to database')

        self.logger.info('Updating DocLengths in the database')
        # note: iterate doc_lengths (not index) and query DocLengths (not Index)
        for doc_url, doc_len in self.doc_lengths.items():
            # 1. get the existing length record from the db
            db_record = self.database.DocLengths.find_one({'doc_url': doc_url})
            if db_record is None:
                self.database.DocLengths.insert_one(
                    {'doc_url': doc_url, 'length': doc_len}
                )
            else:
                self.logger.info('Changing doc lengths')
                myquery = {'doc_url': doc_url}
                newvalues = {"$set": {'length': doc_len}}
                try:
                    self.database.DocLengths.update_one(myquery, newvalues)
                    self.logger.info('DocLengths changed successfully')
                except errors.PyMongoError:
                    self.logger.error('DocLengths were not changed, error while writing to database')
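# Usage sketch, not part of the class above: a minimal crawl-and-index loop.
# fetch_messages is a hypothetical scraper returning {msg_url: msg_text} for a
# channel url, and the seed url is also just an example.
if __name__ == "__main__":
    indexer = TelegramIndexer()
    indexer.links_to_visit.add("https://t.me/s/example_channel")  # hypothetical seed
    while indexer.links_to_visit:
        url = next(iter(indexer.links_to_visit))
        messages = fetch_messages(url)  # assumed helper, e.g. built on requests
        indexer.index_one_url(url, messages)
    indexer.dump_index()  # persist the in-memory index and doc lengths to MongoDB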
import logging
import math
import os
from collections import Counter

from pymongo import MongoClient, errors

from message_parser import MessageParser  # project-local module (import path assumed)


class Search:
    def __init__(self):
        self.parser = MessageParser()
        try:
            client = MongoClient("mongodb://127.0.0.1:27017/")
            client.admin.command('ping')  # force a round-trip so connection failures surface here
            print('Connected to MongoDB successfully!')
        except errors.PyMongoError:
            print('Could not connect to MongoDB')
            raise  # without a client the searcher cannot work
        self.database = client.TelegramIndexerDB  # client.TelegramMusicIndexerDB

        # define a separate logger for the searcher, apart from the bot logger
        if not os.path.exists('../../logs'):
            os.makedirs('../../logs')
        self.logger = logging.getLogger("searcher")
        self.logger.setLevel(logging.INFO)
        fh = logging.FileHandler("../../logs/user_search.log")
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        self.logger.addHandler(fh)

    def search(self, query):
        query_terms, links = self.parser.parse_message(query)
        # rank documents against the index; boolean retrieval is kept as an alternative
        # relevant_documents = self.boolean_retrieval(query_terms)
        relevant_documents = self.okapi_scoring(query_terms)
        return relevant_documents

    def boolean_retrieval(self, query):
        # 1. fetch the postings list of every query term from the index in the database
        postings = []
        for term in query.keys():
            posting = []
            cursor = self.database.Index.find({'key': term})
            for record in cursor:
                posting += record['postings']
            # keep the document ids only
            posting = [i[0] for i in posting]
            postings.append(posting)
        if not postings:
            return []
        # a document is relevant only if it contains every query term
        docs = list(set.intersection(*map(set, postings)))
        return docs

    def okapi_scoring(self, query, k1=1.2, b=0.75):
        scores = Counter()
        # N approximates the corpus size by the vocabulary size of the Index collection
        N = self.database.Index.count_documents({})
        avgdl = 100  # treated as a constant for all documents instead of computing the real average
        for term in query.keys():
            # extract the postings list for the term from the Index
            postings = []
            cursor = self.database.Index.find({'key': term})
            for record in cursor:
                postings += record['postings']
            if not postings:
                continue  # ignore terms absent from the index

            # the term is present in the database, so compute the Okapi BM25
            # score of every document that contains it
            n_docs = len(postings)  # each stored posting is a (doc_url, freq) pair
            idf = math.log10((N - n_docs + 0.5) / (n_docs + 0.5))
            for doc_id, doc_tf in postings:
                doc_len = 0
                record = self.database.DocLengths.find_one({'doc_url': doc_id})
                if record is not None:
                    doc_len = record['length']
                score = idf * doc_tf * (k1 + 1) / (doc_tf + k1 * (1 - b + b * (doc_len / avgdl)))
                scores[doc_id] += score

        # sort documents by descending score and return their urls
        documents = [doc_url for doc_url, _ in scores.most_common()]
        return documents
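# Usage sketch, not part of the class above: assumes the indexer has already
# populated the Index and DocLengths collections; the query string is an example.
if __name__ == "__main__":
    searcher = Search()
    results = searcher.search("machine learning course")
    for doc_url in results[:10]:
        print(doc_url)  # message urls, best BM25 score first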