Example #1
0
class Indexer:
    """Inverted-index builder that shards term postings by first letter.

    Postings accumulate in per-shard in-memory buffers (a-g, h-q, r-z,
    "others", plus a per-document buffer) and are flushed through the
    matching asynchronous MapReduce writer once a RAM-derived entry
    budget (``avg_length``) is exceeded.
    """

    def __init__(self, config, all_terms_dict):
        # term -> document frequency; the dict is shared with the caller.
        self.inverted_idx = all_terms_dict
        self.fileName = 'InvertedIndex'
        self.config = config
        self.thread_pool_size = 1
        # Budget ~10% of currently available RAM, split across the worker
        # threads (+1 for the main thread).
        avg_ram = (psutil.virtual_memory().available //
                   10) // (self.thread_pool_size + 1)
        path = 'MapReduceData/'
        # Approximate number of buffered (tweet_id, freq) entries that fit
        # in the budget.
        # NOTE(review): ``// (8 / 10)`` divides by 0.8, i.e. it *inflates*
        # the estimate by 25% — confirm that was the intent.
        self.avg_length = int(
            (avg_ram // sys.getsizeof((int(), str()))) // (8 / 10))
        # One asynchronous MapReduce writer per shard.
        self.map_reduce_ag = MapReduce(self.avg_length, self.thread_pool_size,
                                       path + 'AG/')
        self.map_reduce_hq = MapReduce(self.avg_length, self.thread_pool_size,
                                       path + 'HQ/')
        self.map_reduce_rz = MapReduce(self.avg_length, self.thread_pool_size,
                                       path + 'Rz/')
        self.map_reduce_other = MapReduce(self.avg_length,
                                          self.thread_pool_size,
                                          path + 'Others/')
        self.map_reduce_doc = MapReduce(self.avg_length, self.thread_pool_size,
                                        path + 'Document/')
        # Counters are single-element lists so get_right_tmp_pos_and_num()
        # can hand them out and callers can mutate them in place.
        self.num_in_pos_ag_tmp = [0]
        self.num_in_pos_hq_tmp = [0]
        self.num_in_pos_rz_tmp = [0]
        self.num_in_pos_other_tmp = [0]
        self.num_in_pos_doc = [0]
        # Entity candidates seen exactly once: UPPER term -> (tweet_id, freq).
        self.Entitys = {}
        # Per-shard posting buffers: term -> [(tweet_id, freq), ...].
        self.tmp_pos_ag = {}
        self.tmp_pos_hq = {}
        self.tmp_pos_rz = {}
        self.tmp_pos_other = {}
        self.tmp_pos_doc = {}
        self.NUMBER_OF_PROCESSES = 5
        # Shard keys whose write is currently in flight.
        self.set_is_writting = {}

    def get_right_tmp_pos_and_num(self, first_letter):
        """Return ``[buffer, counter, writer, shard_key]`` for a letter.

        Non-alphabetic characters fall through to the "others" shard.
        """
        lower_letter = str(first_letter).lower()
        if 'a' <= lower_letter <= 'g':
            return [self.tmp_pos_ag, self.num_in_pos_ag_tmp,
                    self.map_reduce_ag, 'ag']
        elif 'h' <= lower_letter <= 'q':
            return [self.tmp_pos_hq, self.num_in_pos_hq_tmp,
                    self.map_reduce_hq, 'hq']
        elif 'r' <= lower_letter <= 'z':
            return [self.tmp_pos_rz, self.num_in_pos_rz_tmp,
                    self.map_reduce_rz, 'rz']
        return [self.tmp_pos_other, self.num_in_pos_other_tmp,
                self.map_reduce_other, 'others']

    def save_left_over(self, posting_dict, map_reduce):
        """Write *posting_dict* through *map_reduce* and block until done.

        BUGFIX: the parameter was previously named ``dict``, shadowing the
        builtin.
        """
        map_reduce.write_dict(posting_dict)
        map_reduce.wait_untill_finish()

    def save_all_map_reduce(self):
        """Persist every shard's MapReduce state.

        BUGFIX: the old code did ``executor.map(self.x.save_map_reduce())``,
        which ran each call serially in this process and then handed ``map``
        the call's return value instead of a callable.  The calls are made
        directly now (subprocesses could not have mutated this object's
        state anyway).
        """
        for writer in (self.map_reduce_ag, self.map_reduce_hq,
                       self.map_reduce_rz, self.map_reduce_other,
                       self.map_reduce_doc):
            writer.save_map_reduce()

    def _check_save_left_over(self, tmp_pos, counter, map_reduce):
        # Flush a non-empty buffer and reset its counter.  save_left_over()
        # already waits for completion, so the former duplicate
        # wait_untill_finish() call was dropped.
        if counter[0] > 0:
            self.save_left_over(tmp_pos, map_reduce)
            counter[0] = 0

    def check_save_left_over_ag(self):
        """Flush the a-g buffer if it holds any postings."""
        self._check_save_left_over(self.tmp_pos_ag, self.num_in_pos_ag_tmp,
                                   self.map_reduce_ag)

    def check_save_left_over_hq(self):
        """Flush the h-q buffer if it holds any postings."""
        self._check_save_left_over(self.tmp_pos_hq, self.num_in_pos_hq_tmp,
                                   self.map_reduce_hq)

    def check_save_left_over_rz(self):
        """Flush the r-z buffer if it holds any postings."""
        self._check_save_left_over(self.tmp_pos_rz, self.num_in_pos_rz_tmp,
                                   self.map_reduce_rz)

    def check_save_left_over_others(self):
        """Flush the "others" buffer if it holds any postings."""
        self._check_save_left_over(self.tmp_pos_other,
                                   self.num_in_pos_other_tmp,
                                   self.map_reduce_other)

    def check_save_left_over_doc(self):
        """Flush the per-document buffer if it holds any entries."""
        self._check_save_left_over(self.tmp_pos_doc, self.num_in_pos_doc,
                                   self.map_reduce_doc)

    def save_all_left_overs(self):
        """Flush every non-empty buffer.

        BUGFIX: same broken ``ProcessPoolExecutor.map(self.f())`` pattern
        as save_all_map_reduce() — replaced with direct sequential calls.
        """
        self.check_save_left_over_ag()
        self.check_save_left_over_hq()
        self.check_save_left_over_rz()
        self.check_save_left_over_others()
        self.check_save_left_over_doc()

    def print_meta_data_len(self):
        """Print each shard's metadata size (debugging aid)."""
        print('________________________________________________')
        print('Ag:' + str(len(self.map_reduce_ag.meta_data)))
        # BUGFIX: label previously said 'HG' for the h-q shard.
        print('HQ:' + str(len(self.map_reduce_hq.meta_data)))
        print('RZ:' + str(len(self.map_reduce_rz.meta_data)))
        print('OTHER:' + str(len(self.map_reduce_other.meta_data)))
        print('Doc:' + str(len(self.map_reduce_doc.meta_data)))
        print('________________________________________________')

    def addEntitysToPosting(self, term, tweet_id, quantity):
        """Record one occurrence of an entity candidate.

        An entity is only indexed once it has been seen in at least two
        tweets: the first sighting is parked in ``self.Entitys``; the
        second releases both postings into the shard buffer.

        :param term: the (capitalized, multi-word) entity term.
        :param tweet_id: id of the tweet it appeared in.
        :param quantity: its frequency within that tweet.
        """
        str_term = str(term)
        upper = str_term.upper()
        tmp_pos, number_arr, map_reduce, _ = self.get_right_tmp_pos_and_num(
            str_term[0])
        if (upper not in self.Entitys and upper not in tmp_pos
                and str_term.lower() not in map_reduce.meta_data.keys()):
            # First sighting anywhere: park until a second tweet confirms it.
            self.Entitys[upper] = (tweet_id, quantity)
        else:
            postings = tmp_pos.setdefault(upper, [])
            if upper in self.Entitys:
                # Second sighting: release the parked posting as well.
                # BUGFIX: pop the entry from Entitys so a third sighting
                # does not re-append the first posting, and drop the
                # duplicated ``inverted_idx`` assignment.
                self.inverted_idx[upper] = 2
                postings.append(self.Entitys.pop(upper))
                postings.append((tweet_id, quantity))
                number_arr[0] += 2
            else:
                self.inverted_idx[upper] += 1
                postings.append((tweet_id, quantity))
                number_arr[0] += 1

    def add_new_doc(self, document):
        """Perform the indexing process for one document object.

        Terms are routed to their letter shard; capitalized multi-word
        terms are treated as entity candidates.  When a shard buffer
        reaches ``avg_length`` it is written asynchronously and the next
        insertion into that shard first waits for the write to finish.

        :param document: object exposing ``tweet_id`` and
            ``term_doc_dictionary`` ({term: frequency}).
        :return: -
        """
        document_dictionary = document.term_doc_dictionary
        for term in sorted(document_dictionary, key=str.lower):
            tmp_pos, number_arr, map_reduce, key = \
                self.get_right_tmp_pos_and_num(term[0])
            try:
                if term[0].isupper() and " " in term:
                    self.addEntitysToPosting(term, document.tweet_id,
                                             document_dictionary[term])
                    continue
                if number_arr[0] >= self.avg_length:
                    map_reduce.write_dict(tmp_pos)
                    self.set_is_writting[key] = 1
                    number_arr[0] = 0
                if key in self.set_is_writting:
                    print('Waiting to write to end ')
                    map_reduce.wait_untill_finish()
                    print('Done waiting')
                    del self.set_is_writting[key]
                tmp_pos.setdefault(term.lower(), []).append(
                    (document.tweet_id, document_dictionary[term]))
                number_arr[0] += 1
            except Exception as err:
                # BUGFIX: was a bare ``except`` that hid every error and
                # only printed the term's first letter.
                print('INVERTED: problem with the following key {}: {}'.format(
                    term, err))
        # BUGFIX: ``max([d.values()])`` returned the dict_values view, not
        # the largest frequency.  (The value is currently unused.)
        max_freq = max(document_dictionary.values(), default=0)
        self.tmp_pos_doc[document.tweet_id] = document_dictionary
        self.num_in_pos_doc[0] += 1
        if self.num_in_pos_doc[0] >= self.avg_length:
            if 'doc' not in self.set_is_writting:
                self.map_reduce_doc.write_dict(self.tmp_pos_doc)
                self.set_is_writting['doc'] = 1
                # BUGFIX: the counter was never reset after scheduling a
                # write, so the very next document re-entered this branch.
                self.num_in_pos_doc[0] = 0
            else:
                self.map_reduce_doc.wait_untill_finish()
                del self.set_is_writting['doc']
                self.num_in_pos_doc[0] = 0
Example #2
0
class Indexer:
    """Letter-sharded inverted-index builder (variant #2).

    Postings accumulate in per-shard in-memory buffers (a-g, h-q, r-z,
    "others", plus a per-document buffer) and are flushed through the
    matching asynchronous MapReduce writer once a RAM-derived entry
    budget (``avg_length``) is exceeded.
    """

    def __init__(self, config, all_terms_dict):
        # term -> document frequency; the dict is shared with the caller.
        self.inverted_idx = all_terms_dict
        self.fileName = 'InvertedIndex'
        self.config = config
        self.thread_pool_size = 2
        # Budget: 10% of available RAM divided across the worker threads.
        avg_ram = (psutil.virtual_memory().available //
                   self.thread_pool_size) // 10
        path = 'MapReduceData/'
        # Approximate number of buffered (tweet_id, freq) entries that fit
        # in the budget.
        # NOTE(review): ``// (8 / 10)`` divides by 0.8, i.e. it *inflates*
        # the estimate by 25% — confirm that was the intent.
        self.avg_length = int(
            (avg_ram // sys.getsizeof((int(), str()))) // (8 / 10))
        # One asynchronous MapReduce writer per shard.
        self.map_reduce_ag = MapReduce(self.avg_length, self.thread_pool_size,
                                       path + 'AG/')
        self.map_reduce_hq = MapReduce(self.avg_length, self.thread_pool_size,
                                       path + 'HQ/')
        self.map_reduce_rz = MapReduce(self.avg_length, self.thread_pool_size,
                                       path + 'Rz/')
        self.map_reduce_other = MapReduce(self.avg_length,
                                          self.thread_pool_size,
                                          path + 'Others/')
        self.map_reduce_doc = MapReduce(self.avg_length, self.thread_pool_size,
                                        path + 'Document/')
        # NOTE(review): unused after the add_new_doc() fix below; kept so
        # any external reader of the attribute keeps working.
        self.tmp_pos = {}
        # Counters are single-element lists so get_right_tmp_pos_and_num()
        # can hand them out and callers can mutate them in place.
        self.num_in_pos_ag_tmp = [0]
        self.num_in_pos_hq_tmp = [0]
        self.num_in_pos_rz_tmp = [0]
        self.num_in_pos_other_tmp = [0]
        self.num_in_pos_doc_other = [0]
        # Entity candidates seen exactly once: UPPER term -> (tweet_id, freq).
        self.Entitys = {}
        # Per-shard posting buffers: term -> [(tweet_id, freq), ...].
        self.tmp_pos_ag = {}
        self.tmp_pos_hq = {}
        self.tmp_pos_rz = {}
        self.tmp_pos_other = {}
        self.tmp_pos_doc = {}
        self.NUMBER_OF_PROCESSES = 5

    def get_right_tmp_pos_and_num(self, first_letter):
        """Return ``[buffer, counter, writer]`` for *first_letter*.

        Non-alphabetic characters fall through to the "others" shard.
        """
        lower_letter = str(first_letter).lower()
        if 'a' <= lower_letter <= 'g':
            return [self.tmp_pos_ag, self.num_in_pos_ag_tmp,
                    self.map_reduce_ag]
        elif 'h' <= lower_letter <= 'q':
            return [self.tmp_pos_hq, self.num_in_pos_hq_tmp,
                    self.map_reduce_hq]
        elif 'r' <= lower_letter <= 'z':
            return [self.tmp_pos_rz, self.num_in_pos_rz_tmp,
                    self.map_reduce_rz]
        return [self.tmp_pos_other, self.num_in_pos_other_tmp,
                self.map_reduce_other]

    def wait_untill_all_finish(self):
        """Block until every shard's in-flight write has completed."""
        self.map_reduce_ag.wait_untill_finish()
        self.map_reduce_hq.wait_untill_finish()
        self.map_reduce_rz.wait_untill_finish()
        self.map_reduce_other.wait_untill_finish()
        self.map_reduce_doc.wait_untill_finish()

    def save_left_over(self, posting_dict, map_reduce):
        """Write *posting_dict* through *map_reduce* (non-blocking).

        BUGFIX: the parameter was previously named ``dict``, shadowing the
        builtin.
        """
        map_reduce.write_dict_func(posting_dict)

    def _check_save_left_over(self, tmp_pos, counter, map_reduce):
        # Flush a non-empty buffer and reset its counter.
        if counter[0] > 0:
            self.save_left_over(tmp_pos, map_reduce)
            counter[0] = 0

    def check_save_left_over_ag(self):
        """Flush the a-g buffer if it holds any postings."""
        self._check_save_left_over(self.tmp_pos_ag, self.num_in_pos_ag_tmp,
                                   self.map_reduce_ag)

    def check_save_left_over_hq(self):
        """Flush the h-q buffer if it holds any postings."""
        self._check_save_left_over(self.tmp_pos_hq, self.num_in_pos_hq_tmp,
                                   self.map_reduce_hq)

    def check_save_left_over_rz(self):
        """Flush the r-z buffer if it holds any postings."""
        self._check_save_left_over(self.tmp_pos_rz, self.num_in_pos_rz_tmp,
                                   self.map_reduce_rz)

    def check_save_left_over_others(self):
        """Flush the "others" buffer if it holds any postings."""
        self._check_save_left_over(self.tmp_pos_other,
                                   self.num_in_pos_other_tmp,
                                   self.map_reduce_other)

    def check_save_left_over_doc(self):
        """Flush the per-document buffer if it holds any entries."""
        self._check_save_left_over(self.tmp_pos_doc,
                                   self.num_in_pos_doc_other,
                                   self.map_reduce_doc)

    def save_all_left_overs(self):
        """Flush every non-empty buffer.

        BUGFIX: the old code did ``executor.submit(self.f())`` — each
        function ran serially in this process and ``None`` was submitted
        as the "task", whose failure was never observed.  The calls are
        made directly now (subprocesses could not have mutated this
        object's state anyway).
        """
        self.check_save_left_over_ag()
        self.check_save_left_over_hq()
        self.check_save_left_over_rz()
        self.check_save_left_over_others()
        self.check_save_left_over_doc()

    def add_entitys_to_posting(self, term, tweet_id, quantity):
        """Record one occurrence of an entity candidate.

        An entity is only indexed once it has been seen in at least two
        tweets: the first sighting is parked in ``self.Entitys``; the
        second releases both postings into the shard buffer.

        :param term: the (capitalized, multi-word) entity term.
        :param tweet_id: id of the tweet it appeared in.
        :param quantity: its frequency within that tweet.
        """
        upper = term.upper()
        tmp_pos, number_arr, _ = self.get_right_tmp_pos_and_num(term[0])
        if upper not in self.Entitys and upper not in tmp_pos:
            # First sighting: park until a second tweet confirms it.
            self.Entitys[upper] = (tweet_id, quantity)
        else:
            if upper not in self.inverted_idx:
                self.inverted_idx[upper] = 2
            else:
                self.inverted_idx[upper] += 1
            if upper not in tmp_pos:
                # Second sighting: release the parked posting as well.
                tmp_pos[upper] = [self.Entitys.pop(upper)]
            tmp_pos[upper].append((tweet_id, quantity))

    def add_new_doc(self, document):
        """Perform the indexing process for one document object.

        Terms are routed to their letter shard; capitalized multi-word
        terms are treated as entity candidates.  When a shard buffer
        reaches ``avg_length`` it is handed to its MapReduce writer.

        :param document: object exposing ``tweet_id`` and
            ``term_doc_dictionary`` ({term: frequency}).
        :return: -
        """
        document_dictionary = document.term_doc_dictionary
        for term in sorted(document_dictionary, key=str.lower):
            tmp_pos, number_arr, map_reduce = self.get_right_tmp_pos_and_num(
                term[0])
            try:
                if term[0].isupper() and " " in term:
                    self.add_entitys_to_posting(term, document.tweet_id,
                                                document_dictionary[term])
                    continue
                if number_arr[0] >= self.avg_length:
                    map_reduce.write_dict_func(tmp_pos)
                    number_arr[0] = 0
                # BUGFIX: the membership test used the always-empty
                # ``self.tmp_pos``, so the posting list was recreated on
                # every occurrence and earlier postings were lost.
                tmp_pos.setdefault(term.lower(), []).append(
                    (document.tweet_id, document_dictionary[term]))
                number_arr[0] += 1
            except Exception as err:
                # BUGFIX: was a bare ``except`` that hid every error and
                # only printed the term's first letter.
                print('problem with the following key {}: {}'.format(
                    term, err))
        # BUGFIX: ``max([d.values()])`` returned the dict_values view, not
        # the largest frequency.  (The value is currently unused.)
        max_freq = max(document_dictionary.values(), default=0)
        self.tmp_pos_doc[document.tweet_id] = document_dictionary
        self.num_in_pos_doc_other[0] += 1
        if self.num_in_pos_doc_other[0] >= self.avg_length:
            self.map_reduce_doc.write_dict_func(self.tmp_pos_doc)
            self.num_in_pos_doc_other[0] = 0