import glob
import logging
import math
import os
from collections import Counter
from pymongo import MongoClient
# MessageParser and save_to_disk_util are project-local helpers; their import
# lines are assumed here and omitted in the original listing.


class UserParser:
    def __init__(self, parser_output_dir):
        self.message_parser = MessageParser()
        self.parser_output_dir = parser_output_dir

    def parse_user(self, user_directory):
        user_name = os.path.basename(user_directory)
        user_folders = UserParser.__list_folders(user_directory)

        user_contents = {}
        for folder in user_folders:
            current_directory_name = os.path.basename(folder)
            folder_contents = self.__parse_user_folder(folder)
            user_contents[current_directory_name] = folder_contents

        user_dict = {user_name: user_contents}
        save_path = os.path.join(os.getcwd(), self.parser_output_dir,
                                 user_name + ".gz")

        save_to_disk_util.save_to_disk(save_path, user_dict)

    def __parse_user_folder(self, user_folder):
        messages = {}

        message_paths = [
            message_path
            for message_path in glob.glob(os.path.join(user_folder, "**"),
                                          recursive=True)
            if not os.path.isdir(message_path)
        ]

        for message_path in message_paths:
            message_filename = os.path.basename(message_path)

            try:
                subdirectory_path = UserParser.__get_subdirectory_path(
                    message_path, user_folder)
                message_filename_key = subdirectory_path + "/" + message_filename
                message_contents = self.message_parser.parse_message(
                    message_path, user_folder)

                messages[message_filename_key] = message_contents

            except UnicodeDecodeError:
                print("Could not parse '" + message_path + "'\n")

        return messages

    @staticmethod
    def __get_subdirectory_path(full_path, base_dir):
        return os.path.dirname(full_path).replace(base_dir, '')

    @staticmethod
    def __list_folders(directory):
        return [
            os.path.join(directory, d) for d in os.listdir(directory)
            if os.path.isdir(os.path.join(directory, d))
        ]
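
# Usage sketch (not part of the original source): one way UserParser might be
# driven, assuming a <users_root>/<user>/<folder>/... layout on disk; the
# directory names "users_root" and "parsed_users" are hypothetical.
#
#   parser = UserParser(parser_output_dir="parsed_users")
#   for user_dir in sorted(glob.glob(os.path.join("users_root", "*"))):
#       if os.path.isdir(user_dir):
#           parser.parse_user(user_dir)
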
class TelegramIndexer:
    def __init__(self):
        self.parser = MessageParser()
        self.visited_links = set()  # links that have already been indexed
        self.links_to_visit = set()  # links queued to be crawled on the next iteration
        self.index = {}
        self.doc_lengths = {}
        self.database_empty = True

        # create logger
        if not os.path.exists('../logs'):
            os.makedirs('../logs')
        self.logger = logging.getLogger("urls_extractor")
        self.logger.setLevel(logging.INFO)
        fh = logging.FileHandler("../logs/indexer.log")
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        self.logger.addHandler(fh)

        try:
            client = MongoClient("mongodb://127.0.0.1:27017/")
            self.logger.info('Connected to MongoDB successfully!')
        except Exception:
            self.logger.error('Could not connect to MongoDB')
            raise

        self.database = client.TelegramIndexerDB  # client.TelegramMusicIndexerDB

    def index_one_url(self, url, messages):
        if url in self.visited_links:
            self.links_to_visit.discard(url)
            self.logger.info(f'Url {url} was already indexed, moving on to another url')
            return

        if not messages:
            self.links_to_visit.discard(url)
            self.logger.info(f'Message list is empty for url {url}')
            return

        self.logger.info(f'Indexing messages from url {url}')
        for msg_url, msg_text in messages.items():
            words, links = self.parser.parse_message(msg_text)
            self.links_to_visit.update(links)
            self.links_to_visit -= self.visited_links
            self.doc_lengths[msg_url] = len(words)  # unique-word count; not exact, but good enough
            # add words to the index
            for w, msg_freq in words.items():  # msg_freq: occurrences of w in this message
                if w not in self.index:
                    # index entry layout: [total word frequency, (msg_url, msg_freq), ...]
                    self.index[w] = [msg_freq, (msg_url, msg_freq)]
                else:
                    self.index[w][0] += msg_freq
                    self.index[w].append((msg_url, msg_freq))
        self.visited_links.add(url)
        self.links_to_visit.discard(url)
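
    # Illustrative shape of the in-memory structures after index_one_url
    # (urls and counts below are hypothetical, shown only for reference):
    #   self.index["hello"] == [5, ("https://t.me/chan/1", 2), ("https://t.me/chan/7", 3)]
    #   self.doc_lengths["https://t.me/chan/1"] == 42
    # dump_index() persists these as the 'frequency'/'postings' fields of the
    # Index collection and as records of the DocLengths collection.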

    def dump_index(self):
        self.logger.info('Index is being dumped to DB')
        if self.database_empty:
            # the collection counts as non-empty as soon as it holds at least one record
            if self.database.Index.find_one() is not None:
                self.database_empty = False

        if self.database_empty:
            self.logger.info(f'Database was empty, writing {len(self.index)} new items')
            for word, postings in self.index.items():
                try:
                    self.database.Index.insert_one(
                        {'key': word, 'frequency': postings[0], 'postings': postings[1:]}
                    )
                except Exception:
                    self.logger.error('Unable to add new items to Index in database')

            for msg_url, doc_len in self.doc_lengths.items():
                try:
                    self.database.DocLengths.insert_one(
                        {'doc_url': msg_url, 'length': doc_len}
                    )
                except Exception:
                    self.logger.error('Unable to add new items to DocLengths in database')
            self.index = {}  # reset the local index once it has been persisted
            self.doc_lengths = {}
            return

        # Else, we have to merge new changes to existing index
        self.logger.info(f'Updating Index in database with {len(self.index)} new items')
        for word, postings in self.index.items():
            # 1. get the existing entry for this word from the database (if any)
            db_index = self.database.Index.find_one({'key': word}) or {}
            if not db_index:
                self.database.Index.insert_one(
                    {'key': word, 'frequency': postings[0], 'postings': postings[1:]}
                )
            else:
                self.logger.info('Changing existing postings')
                db_postings = {u: f for u, f in db_index['postings']}
                for doc_url, doc_freq in postings[1:]:
                    db_postings[doc_url] = doc_freq
                db_postings = [[u, f] for u, f in db_postings.items()]
                frequency = sum(freq for _, freq in db_postings)
                myquery = {'key': word}
                newvalues = {"$set": {'frequency': frequency, 'postings': db_postings}}
                try:
                    self.database.Index.update_one(myquery, newvalues)
                    self.logger.info('Postings changed successfully')
                except Exception:
                    self.logger.error('Postings were not changed, error while writing to database')

        self.logger.info('Updating DocLengths in the database')
        for doc_url, doc_len in self.doc_lengths.items():
            # 1. get the existing length record from the database (if any)
            db_record = self.database.DocLengths.find_one({'doc_url': doc_url}) or {}
            if not db_record:
                self.database.DocLengths.insert_one(
                    {'doc_url': doc_url, 'length': doc_len}
                )
            else:
                self.logger.info('Changing doc lengths')
                myquery = {'doc_url': doc_url}
                newvalues = {"$set": {'length': doc_len}}
                try:
                    self.database.DocLengths.update_one(myquery, newvalues)
                    self.logger.info('DocLengths changed successfully')
                except Exception:
                    self.logger.error('DocLengths were not changed, error while writing to database')

        self.index = {}  # flush local state, mirroring the empty-database branch above
        self.doc_lengths = {}
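
    # Hypothetical driver (not in the original source) showing the intended
    # protocol: index the messages fetched for a url, then persist the index.
    # "fetch_messages" and the url below are illustrative placeholders.
    #   indexer = TelegramIndexer()
    #   indexer.index_one_url("https://t.me/s/example_channel",
    #                         fetch_messages("https://t.me/s/example_channel"))
    #   indexer.dump_index()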


class Search:
    def __init__(self):
        self.parser = MessageParser()
        try:
            client = MongoClient("mongodb://127.0.0.1:27017/")
            print('Connected to MongoDB successfully!')
        except Exception:
            print('Could not connect to MongoDB')
            raise

        self.database = client.TelegramIndexerDB  # client.TelegramMusicIndexerDB

        # define a separate logger for the searcher (independent of the bot logger)
        if not os.path.exists('../../logs'):
            os.makedirs('../../logs')

        self.logger = logging.getLogger("searcher")
        self.logger.setLevel(logging.INFO)
        fh = logging.FileHandler("../../logs/user_search.log")
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        self.logger.addHandler(fh)


    def search(self, query):
        query_terms, links = self.parser.parse_message(query)
        # now organize the searcher in the index
        # relevant_documents = self.boolean_retrieval(query_terms)
        relevant_documents = self.okapi_scoring(query_terms)
        return relevant_documents
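
    # Usage sketch with a hypothetical query; the result is a list of message
    # urls ordered by descending BM25 score:
    #   searcher = Search()
    #   results = searcher.search("live jazz recordings")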

    def boolean_retrieval(self, query):
        # 1. first get data from index in database
        postings = []
        for term in query.keys():
            posting = []
            cursor = self.database.Index.find({'key': term})
            for record in cursor:
                posting += record['postings']

            # extract document info only
            posting = [i[0] for i in posting]
            postings.append(posting)
        if not postings:
            return []
        docs = list(set.intersection(*map(set, postings)))
        return docs
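
    # Example of the intersection above with hypothetical postings lists:
    #   postings = [["doc1", "doc2"], ["doc2", "doc3"]]
    #   set.intersection(*map(set, postings)) -> {"doc2"}
    # i.e. only documents containing every query term are returned.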

    def okapi_scoring(self, query, k1=1.2, b=0.75):
        scores = Counter()
        N = self.database.Index.count_documents({})  # term count, used as a rough stand-in for collection size
        avgdl = 100  # assumed constant average document length; not computed from the data
        for term in query.keys():
            # extract postings lists from Index
            postings = []
            cursor = self.database.Index.find({'key': term})
            for record in cursor:
                postings += record['postings']
            if not postings:
                continue  # ignore terms absent from the index

            # if term is present in the database, then we calculate okapi
            # score for each document
            n_docs = len(postings)  # documents containing the term: one (url, tf) pair per document
            idf = math.log10((N - n_docs + 0.5) / (n_docs + 0.5))
            for doc_id, doc_tf in postings:
                record = self.database.DocLengths.find_one({'doc_url': doc_id})
                doc_len = record['length'] if record else 0
                score = idf * doc_tf * (k1 + 1) / (doc_tf + k1 * (
                            1 - b + b * (doc_len / avgdl)))
                scores[doc_id] += score

        # sort according to the score value
        scores = scores.most_common()
        documents = [doc_url for doc_url, _ in scores]

        return documents
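
# For reference, the Okapi BM25 score computed above (standard formula; avgdl is
# the fixed assumption of 100 used in okapi_scoring):
#
#   idf(t)      = log10((N - n_t + 0.5) / (n_t + 0.5))
#   score(t, d) = idf(t) * tf * (k1 + 1) / (tf + k1 * (1 - b + b * |d| / avgdl))
#
# Worked example with hypothetical numbers N = 1000, n_t = 10, tf = 3, |d| = 100,
# k1 = 1.2, b = 0.75:
#   idf   = log10(990.5 / 10.5) ≈ 1.97
#   score = 1.97 * 3 * 2.2 / (3 + 1.2 * (1 - 0.75 + 0.75 * 1.0)) ≈ 3.10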