Example #1
from operator import itemgetter

# HashMap, LinkedList and Node are the project's own hash-table helpers,
# defined outside this snippet (a minimal sketch of them follows this example).
class Reducer:
    def __init__(self, hash_maps: list):
        self.hash_maps = hash_maps          # per-chunk hash maps produced by the mapper
        self.reduced_hash_map = HashMap()   # combined word counts
        self.top_ten_words = []

    def reduce(self):
        # Merge every bucket of every mapper hash map into the combined table.
        for hash_map in self.hash_maps:
            for bucket in hash_map.table:
                self.add_to_new_hash_table(bucket)

    def add_to_new_hash_table(self, bucket):
        # A bucket is either a collision chain (LinkedList) or a single (word, 1)
        # tuple; empty slots fall through both checks and are ignored.
        if isinstance(bucket, LinkedList):
            self.add_linked_list_to_new_hash_table(bucket)
        elif isinstance(bucket, tuple):
            hashed_key = self.reduced_hash_map.hash_string(bucket[0])
            self.reduced_hash_map.add_to_hash_table(hashed_key, (bucket[0], 1))

    def add_linked_list_to_new_hash_table(self, bucket: LinkedList):
        # Count each distinct word in the chain exactly once.
        bucket_keys = []
        for node in bucket.traverse():
            if not isinstance(node.data, tuple):
                continue
            if node.data[0] in bucket_keys:
                continue
            bucket_keys = self.add_nodes_to_new_hash_table(
                bucket_keys, bucket, node)

    def add_nodes_to_new_hash_table(self, bucket_keys: list,
                                    current_bucket: LinkedList,
                                    current_node: Node) -> list:
        # Count how often this word occurs in the chain, then store the
        # (word, count) pair in the combined table.
        current_count = 0
        bucket_keys.append(current_node.data[0])
        for next_node in current_bucket.traverse():
            if next_node.data[0] == current_node.data[0]:
                current_count += 1
        hashed_key = self.reduced_hash_map.hash_string(current_node.data[0])
        self.reduced_hash_map.add_to_hash_table(
            hashed_key, (current_node.data[0], current_count))
        return bucket_keys

    def get_top_ten_words(self):
        # Keep up to three most frequent words per bucket, then return the
        # ten most frequent overall.
        top_words_per_bucket = []
        for bucket in self.reduced_hash_map.table:
            if isinstance(bucket, tuple):
                top_words_per_bucket.append(bucket)
            elif isinstance(bucket, LinkedList):
                top_three = [(" ", 0), (" ", 0), (" ", 0)]
                for node in bucket.traverse():
                    top_three.sort(key=itemgetter(1))
                    for index, value in enumerate(top_three):
                        if node.data[1] > value[1]:
                            top_three[index] = node.data
                            break
                # Drop placeholders left over from chains with fewer than three words.
                top_words_per_bucket.extend(
                    entry for entry in top_three if entry[1] > 0)
        top_words_per_bucket.sort(key=itemgetter(1))
        return top_words_per_bucket[-10:]
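
For reference, a minimal sketch of the helper classes these snippets rely on. The class names and method signatures are taken from how the snippets use them; the internals (table size, hash function, chaining strategy) are assumptions, not the author's code:

class Node:
    def __init__(self, data: tuple):
        self.data = data   # a (word, count) pair
        self.next = None


class LinkedList:
    def __init__(self, head: Node = None):
        self.head = head

    def append(self, node: Node):
        if self.head is None:
            self.head = node
            return
        current = self.head
        while current.next is not None:
            current = current.next
        current.next = node

    def traverse(self):
        current = self.head
        while current is not None:
            yield current
            current = current.next


class HashMap:
    def __init__(self, size: int = 1024):
        self.table = [None] * size

    def hash_string(self, key: str) -> int:
        # Toy hash function; any deterministic string hash works here.
        return sum(ord(char) for char in key) % len(self.table)

    def add_to_hash_table(self, index: int, entry: tuple):
        bucket = self.table[index]
        if bucket is None:
            self.table[index] = entry          # empty slot: store the tuple directly
        elif isinstance(bucket, tuple):
            chain = LinkedList(Node(bucket))   # first collision: start a chain
            chain.append(Node(entry))
            self.table[index] = chain
        else:
            bucket.append(Node(entry))         # existing chain: append

With these stubs in place, Reducer(hash_maps).reduce() followed by get_top_ten_words() runs end to end.
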
Example #2
def add_words_to_hash_map(self, current_hash_map: HashMap,
                          current_line: str):
    # Method of the mapper class; self.line_count, self.hash_maps,
    # self.punctuation and strip_punctuation() live on that class.
    # Every 50 lines the current hash map is stored and a fresh one started.
    if self.line_count >= 50:
        self.line_count = 0
        self.hash_maps.append(current_hash_map)
        current_hash_map = HashMap()
    # Drop empty tokens and stray newlines left over from splitting on spaces.
    split_line = [token for token in current_line.split(' ')
                  if token not in ('', '\n')]
    for word in self.strip_punctuation(split_line):
        if word in self.punctuation:
            continue
        map_index = current_hash_map.hash_string(word)
        current_hash_map.add_to_hash_table(map_index, (word, 1))
    return current_hash_map
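
A sketch of how this method might be driven. The surrounding Mapper class, its attribute names, the strip_punctuation helper and the file-reading loop are illustrative assumptions; only add_words_to_hash_map itself comes from the example above:

import string


class Mapper:
    def __init__(self):
        self.line_count = 0
        self.hash_maps = []
        self.punctuation = set(string.punctuation)

    def strip_punctuation(self, words: list) -> list:
        # Hypothetical helper: trim leading/trailing punctuation from each token.
        return [word.strip(string.punctuation) for word in words]

    # add_words_to_hash_map from Example #2 is assumed to be defined here.

    def map_file(self, path: str) -> list:
        current_hash_map = HashMap()
        with open(path, encoding="utf-8") as handle:
            for line in handle:
                self.line_count += 1
                current_hash_map = self.add_words_to_hash_map(
                    current_hash_map, line)
        self.hash_maps.append(current_hash_map)   # keep the last partial chunk
        return self.hash_maps

The list of per-chunk hash maps collected here is what the Reducer in Example #1 takes as input.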