def generate_dataset():
    """Build the pairwise distance datasets from the works dataset CSV.

    Reads ``WORKS_DATASET_FILE`` as CSV. The first row is treated as the
    header (the document field names) and is copied unchanged to both
    output files. The remaining rows are consumed in consecutive pairs
    (1st with 2nd, 3rd with 4th, ...). For each pair the result of
    ``WDDistance.get_scaled_distance_array`` is written:

    * to ``WORKS_DISTANCE_DATASET_FILE``: the distance row only;
    * to ``WORKS_DISTANCE_DATASET_CMP_FILE``: the two source rows followed
      by the distance row.

    A trailing row without a partner is ignored. Progress is printed every
    500 data rows.

    ``max_rows`` and ``match_strategy`` are not parameters; they are read
    from the surrounding scope. ``max_rows == -1`` disables the row limit,
    otherwise processing stops once that many data rows have been consumed.

    An input file that is empty, or that contains only the header row, is
    handled without raising.
    """
    # NOTE(review): the csv docs recommend opening csv files with
    # newline=''; left untouched here so the bytes written and the way
    # embedded newlines are read stay exactly as before - confirm whether
    # that is wanted.
    with open(WORKS_DISTANCE_DATASET_CMP_FILE, 'w', encoding='utf8') as augmented_file, \
            open(WORKS_DISTANCE_DATASET_FILE, 'w', encoding='utf8') as distance_file:
        works_distance_dataset = csv.writer(distance_file, lineterminator='\n')
        works_distance_dataset_augm = csv.writer(augmented_file, lineterminator='\n')

        with open(WORKS_DATASET_FILE, 'r', encoding='utf-8') as works_dataset_file:
            works_dataset = csv.reader(works_dataset_file)

            # Read the header up front with a default, so that an empty or
            # header-only file ends cleanly instead of leaking StopIteration
            # from a bare next() call inside the loop.
            document_fields = next(works_dataset, None)
            if document_fields is None:
                return
            works_distance_dataset.writerow(document_fields)
            works_distance_dataset_augm.writerow(document_fields)

            row1 = None
            # current_row counts the data rows consumed before this one
            # (the header is not counted).
            for current_row, row in enumerate(works_dataset):
                if current_row % 500 == 0:
                    print('Processed %s rows' % current_row)

                if current_row % 2 == 0:
                    # First member of a pair: hold it until its partner arrives.
                    row1 = row
                else:
                    # The first row is truncated to the length of the second
                    # one before both are handed to the distance routine.
                    distance_array = WDDistance.get_scaled_distance_array(
                        row1[:len(row)], row[:len(row)],
                        document_fields, match_strategy)
                    works_distance_dataset_augm.writerow(row1)
                    works_distance_dataset_augm.writerow(row)
                    works_distance_dataset_augm.writerow(distance_array)
                    works_distance_dataset.writerow(distance_array)

                # current_row + 1 data rows have been consumed at this point.
                if max_rows != -1 and current_row + 1 >= max_rows:
                    break
def test_get_fuzzy_distance(self):
    """Print the fuzzy distance for a few hand-picked string pairs.

    This is a manual smoke check: it makes no assertions, the values
    returned by ``WDDistance.get_fuzzy_distance`` are only printed for
    inspection.
    """
    sample_pairs = (
        # Further pairs kept around for manual experiments (disabled):
        # ('test', 'tset'),
        # ('test and nottest', 'test & nottest'),
        ('Freund', 'FREUND PUBLISHING HOUSE, LTD.'),
        ('nature publishing group', 'elsevier science'),
    )
    for left, right in sample_pairs:
        print(WDDistance.get_fuzzy_distance(left, right))