Example #1
    def _check_target_folder(self):
        if not os.path.isdir(self._target_file_folder):
            os.makedirs(self._target_file_folder)
            _print(4,
                   'INFO: target folder created =' + self._target_file_folder)
        elif len(os.listdir(self._target_file_folder)) > 0:
            raise folder_exception(
                "target file folder is not empty. folder name=" +
                self._target_file_folder)
Example #2
    def _check_global_mapping_folder(self):
        if not os.path.isdir(self._global_mapping_folder):
            os.makedirs(self._global_mapping_folder)
            _print(
                4,
                'INFO: global mapping folder created for first-time execution ='
                + self._global_mapping_folder)
        else:
            _print(
                4, 'INFO: global mapping folder exists, folder name =' +
                self._global_mapping_folder)
Example #3
    def _global_unique(self):
        _local_duplicated_mapping = self._job_controller._global_mapping.merge_dict(
            self._local_mapping)
        # v1.3: for POC purposes we do not need to flush the global mapping
        # object to disk on every merge
        # self._job_controller._global_mapping._write_to_disk()

        if len(_local_duplicated_mapping) > 0:
            _print(
                4, 'WARNING: duplication found in global, number=' +
                str(len(_local_duplicated_mapping)))
        else:
            shutil.move(self._working_data_file_fullname,
                        self._target_data_file_fullname)
            _print(
                4,
                'INFO: no hash conflict, simply move the hashed data file, from='
                + self._working_data_file_fullname + ',\t to=' +
                self._target_data_file_fullname)
Example #4
    def _write_to_disk(self):

        _print(
            4, 'INFO: start writing global mapping file =' +
            self._global_mapping_file_fullname)
        _global_mapping_file_handler = open(self._global_mapping_file_fullname,
                                            'w',
                                            encoding='utf-8',
                                            newline='')
        _CSVWriter = csv.writer(_global_mapping_file_handler,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
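        # the third field (extend_hash_value) is written empty; per the POC
        # notes no hash conflicts were detected, so only two fields carry data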
        for SPI_original in self._mapping:
            _CSVWriter.writerow(
                (SPI_original, self._mapping[SPI_original], ''))
        _global_mapping_file_handler.close()
        _print(
            4, 'INFO: global mapping file written, row count =' +
            str(len(self._mapping)))
Example #5
    def merge_dict(self, local_mapping_dict):
        _print(
            4, 'INFO: before merge, global mapping row count =' +
            str(len(self._mapping)) + ',\t local mapping row count =' +
            str(len(local_mapping_dict)))
        _local_duplicated_mapping = {}
        added_count = 0
        for SPI_original in local_mapping_dict:
            result = self._add_hash_entry(SPI_original,
                                          local_mapping_dict[SPI_original])
            if result[0] > 0:
                added_count += 1
            if result[0] == 2:
                _local_duplicated_mapping[SPI_original] = result[1]

        _print(
            4, 'INFO: after merge, global mapping row count =' +
            str(len(self._mapping)) + ',\t new key added to global =' +
            str(added_count) + ',\t duplicated row count in local =' +
            str(len(_local_duplicated_mapping)))
        return _local_duplicated_mapping
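
For context, _add_hash_entry is not shown among these examples. Below is a minimal sketch of the contract merge_dict appears to rely on; the return codes and the collision handling are inferred assumptions, not the repository's actual implementation.

    # Hypothetical sketch of _add_hash_entry, inferred from how merge_dict
    # consumes result[0] and result[1]. Assumed return codes:
    #   (0, None)         -> key already mapped, nothing added
    #   (1, None)         -> new key added normally
    #   (2, extend_value) -> new key added, but its hash collided with an
    #                        existing value, so an extended hash was stored
    def _add_hash_entry(self, SPI_original, hashed_value):
        if SPI_original in self._mapping:
            return (0, None)
        if hashed_value in self._revert_mapping:
            extend_value = hashed_value + '-1'  # placeholder suffix (assumption)
            self._mapping[SPI_original] = extend_value
            self._revert_mapping[extend_value] = SPI_original
            return (2, extend_value)
        self._mapping[SPI_original] = hashed_value
        self._revert_mapping[hashed_value] = SPI_original
        return (1, None)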
Example #6
    def _read_from_disk(self):
        """
        file has 3 fields:
        key, original_hash_value, extend_hash_value(optional)
        only when value met conflict, we use extend_hash_value. in POC we not detect any conflict
        """
        _print(
            4, 'INFO: start loading global mapping file =' +
            self._global_mapping_file_fullname)
        if os.path.isfile(self._global_mapping_file_fullname):
            _global_mapping_file_handler = open(
                self._global_mapping_file_fullname,
                'r',
                encoding='utf-8',
                newline='')
            _CSVReader = csv.reader(_global_mapping_file_handler,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)

            total_row_count = 0
            extend_row_count = 0
            for data_row in _CSVReader:
                total_row_count += 1
                entry_key = data_row[0]
                entry_value = data_row[1]
                extend_value = data_row[2]
                hash_object = hashlib.sha256(
                    entry_key.encode('utf-8', 'ignore'))
                hash_value = hash_object.hexdigest()
                if entry_value != hash_value:
                    raise value_exception("hash value abnormal, line=" +
                                          str(total_row_count) + ',\t SPI=' +
                                          entry_key)
                if extend_value is not None and extend_value != '':
                    extend_row_count += 1
                    self._mapping[entry_key] = extend_value
                    self._revert_mapping[extend_value] = entry_key
                    # initialize the conflict record the first time this hash
                    # value is seen, then increment on each further collision
                    if entry_value not in self._duplicated_hash_value:
                        self._duplicated_hash_value[entry_value] = {
                            'conflict_count': 1
                        }
                    else:
                        self._duplicated_hash_value[entry_value][
                            'conflict_count'] += 1
                    self._duplicated_hash_value[entry_value][
                        entry_key] = extend_value
                else:
                    self._mapping[entry_key] = entry_value
                    self._revert_mapping[entry_value] = entry_key

            _global_mapping_file_handler.close()
            _print(
                4, 'INFO: global mapping file loaded, total row count =' +
                str(total_row_count) + ',\t duplicated value found =' +
                str(extend_row_count))
        else:
            _print(
                4,
                'WARNING: global mapping file does not exist; leaving the mapping dict empty.'
            )
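
For reference, one row of the 3-field file described in the docstring can be produced as below; the key is a made-up placeholder, and the third field stays empty unless a hash conflict occurred.

    import csv
    import hashlib
    import io

    entry_key = 'example-key'  # made-up placeholder, not a real SPI value
    entry_value = hashlib.sha256(
        entry_key.encode('utf-8', 'ignore')).hexdigest()
    _buffer = io.StringIO()
    csv.writer(_buffer, delimiter=',', quotechar='"',
               quoting=csv.QUOTE_MINIMAL).writerow((entry_key, entry_value, ''))
    print(_buffer.getvalue(), end='')  # key,original_hash_value,<empty>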
Example #7
    def _check_working_folder(self):
        if not os.path.isdir(self._working_folder_data):
            os.makedirs(self._working_folder_data)
            _print(
                4, 'INFO: working folder for data created =' +
                self._working_folder_data)
        else:
            _print(
                4,
                'WARNING: working folder for data already exists, files will be overwritten. folder name ='
                + self._working_folder_data)

        if not os.path.isdir(self._working_folder_mapping):
            os.makedirs(self._working_folder_mapping)
            _print(
                4, 'INFO: working folder for mapping created =' +
                self._working_folder_mapping)
        else:
            _print(
                4,
                'WARNING: working folder for mapping already exists, files will be overwritten. folder name ='
                + self._working_folder_mapping)
Example #8
    def _scan_source_files(self):
        _print(4, 'INFO: scanning folder =' + self._source_file_folder)
        if not os.path.isdir(self._source_file_folder):
            raise folder_exception("source folder does not exist. folder name=" +
                                   self._source_file_folder)
        self._source_file_list = []
        for source_file_path in Path(self._source_file_folder).iterdir():
            if source_file_path.is_file():  # only collect regular files
                self._source_file_list.append(source_file_path)
            else:
                _print(
                    9,
                    'DEBUG: ignore non-file object when scanning source folder.')
        _print(4,
               'INFO: total files found =' + str(len(self._source_file_list)))
Example #9
    def _local_hash(self):
        _source_file_handler = open(self._source_file_path,
                                    'r',
                                    encoding='utf-8',
                                    newline='')
        _print(
            4, 'INFO: start local hash for source file =' +
            self._source_file_name)
        _working_data_file_handler = open(self._working_data_file_fullname,
                                          'w',
                                          encoding='utf-8',
                                          newline='')
        _CSVReader = csv.reader(_source_file_handler,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        _CSVWriter = csv.writer(_working_data_file_handler,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        total_row_count = 0
        for data_row in _CSVReader:
            total_row_count += 1
            SPI_original = data_row[1]  # first field selected for hashing
            SPI_original_1 = data_row[2]  # second field selected for hashing
            hash_object = hashlib.sha256(SPI_original.encode(
                'utf-8', 'ignore'))
            hash_object_1 = hashlib.sha256(
                SPI_original_1.encode('utf-8', 'ignore'))
            SPI_hashed = hash_object.hexdigest()
            SPI_hashed_1 = hash_object_1.hexdigest()
            self._local_mapping[SPI_original] = SPI_hashed
            self._local_mapping[SPI_original_1] = SPI_hashed_1
            _CSVWriter.writerow(
                (data_row[0], ) + (SPI_hashed, ) + (SPI_hashed_1, ) +
                tuple(data_row[3:])
            )  # rebuild the row via tuple concatenation; only the two hashed fields are replaced
        _source_file_handler.close()
        _working_data_file_handler.close()
        _print(
            4, 'INFO: local hash data file written to =' +
            self._working_data_file_fullname + ',\t total rows =' +
            str(total_row_count))

        _working_mapping_file_handler = open(
            self._working_mapping_file_fullname,
            'w',
            encoding='utf-8',
            newline='')
        _CSVWriter = csv.writer(_working_mapping_file_handler,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        for SPI_original in self._local_mapping:
            _CSVWriter.writerow(
                (SPI_original, self._local_mapping[SPI_original]))
        _working_mapping_file_handler.close()
        _print(
            4, 'INFO: local hash mapping file written to =' +
            self._working_mapping_file_fullname + ',\t total rows =' +
            str(len(self._local_mapping)))
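
For reference, the per-field transformation above is a plain SHA-256 hex digest of the UTF-8 bytes; a single value can be reproduced like this (the input string is a made-up placeholder).

    import hashlib

    # reproduce the per-field hashing used in _local_hash
    print(hashlib.sha256(
        'example-SPI-value'.encode('utf-8', 'ignore')).hexdigest())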
Example #10
    def _check_working_folder(self):
        if not os.path.isdir(self._working_folder_data):
            os.makedirs(self._working_folder_data)
            _print(
                4, 'INFO: working folder for data created =' +
                self._working_folder_data)
        else:
            _print(
                4,
                'WARNING: working folder for data already exists, files will be overwritten. folder name ='
                + self._working_folder_data)

        if not os.path.isdir(self._working_folder_mapping):
            os.makedirs(self._working_folder_mapping)
            _print(
                4, 'INFO: working folder for mapping created =' +
                self._working_folder_mapping)
        else:
            _print(
                4,
                'WARNING: working folder for mapping already exists, files will be overwritten. folder name ='
                + self._working_folder_mapping)


####unit test####
if __name__ == '__main__':  # unit testing case

    # initialize the program
    _print(2, 'INFO: data generator unit testing starting...')
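    # (source_file_folder and the other folder arguments below are assumed
    # to be defined elsewhere as module-level configuration)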
    my_control = controller(source_file_folder, working_folder_data,
                            working_folder_mapping, global_mapping_folder,
                            target_file_folder)

    _print(2, 'INFO: data generator unit testing finished')