示例#1
0
class DictionaryBuilder(BaseBuilder):
    def __init__(self):
        super().__init__()
        self.dictionary = set()
        self.count = 0
        self.tokenizer = Tokenizer()

    def run(self, input_dir_path, output_path):
        self.load_files(input_dir_path)
        self.build()
        self.save(output_path)
        self.print_counts()

    def build(self):
        for file in self.files:
            with open(file, 'r') as f:
                lines = f.readlines()
                words = self.tokenizer.format_data(lines)
                self.count += len(words)
                self.dictionary.update(words)

    def save(self, output_path):
        with open(output_path, 'w') as f:
            f.write('\n'.join(sorted(self.dictionary)))

    def print_counts(self):
        print('Total count: ', self.count)
        print('Dictionary count: ', len(self.dictionary))
示例#2
0
class IdentityMatrixBuilder(BaseBuilder):
    def __init__(self, dict_path):
        super().__init__()
        self.dict_path = dict_path
        self.dictionary = set()
        self.matrix = {}
        self.tokenizer = Tokenizer()

    def run(self, input_dir_path, output_path):
        self.load_dictionary()
        self.load_files(input_dir_path)
        self.init_matrix()
        self.build()
        self.save(output_path)

    def load_dictionary(self):
        with open(self.dict_path, 'r') as f:
            lines = self.tokenizer.filter_new_lines(f.readlines())
            self.dictionary = sorted(lines)

    def init_matrix(self):
        self.matrix = {x: [0] * len(self.files) for x in self.dictionary}

    def build(self):
        for i, file in enumerate(self.files):
            with open(file, 'r') as f:
                lines = f.readlines()
                words = self.tokenizer.format_data(lines)
                for word in words:
                    self.matrix[word][i] += 1

    def save(self, output_path):
        field_names = ['Token'] + list(
            map(lambda x: x.split('/')[-1], self.files))
        with open(output_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(field_names)

            for k, v in self.matrix.items():
                writer.writerow([k] + v)