def test_export(self, _get_all_domains: MagicMock, open_mock: MagicMock):
    """Exporting two domains writes a single 'from' filter property
    that ORs both domains together."""
    _get_all_domains.return_value = ["domain.com", "email.domain.com"]
    # The mocked file handle returned by `with open(...) as f:`.
    handle = open_mock.return_value.__enter__.return_value
    handle.read.return_value = TEMPLATE

    main.export(datetime(2020, 1, 1))

    assert handle.write.call_count == 1
    written = handle.write.call_args_list[0][0][0]
    expected = "<apps:property name='from' value='@domain.com OR @email.domain.com'/>"
    assert expected in written
def test_export_uses_unique_ids_for_filters(self, _get_all_domains: MagicMock, open_mock: MagicMock):
    """Every <id> element generated across all filter chunks must be unique."""
    _get_all_domains.return_value = [str(x) for x in range(70 * 3)]
    handle = open_mock.return_value.__enter__.return_value
    handle.read.return_value = TEMPLATE

    main.export(datetime(2020, 1, 1))

    output: str = handle.write.call_args_list[0][0][0]
    # re.findall already returns a list of strings — the original's extra
    # [str(x) for x in ...] pass was redundant. `/` needs no escaping in regex.
    ids: List[str] = re.findall(r"<id>(.+)</id>", output)
    assert len(ids) == len(set(ids))
def test_export_split_in_chuncks_of_70(self, _get_all_domains: MagicMock, open_mock: MagicMock):
    """Domains are split into 'from' filter properties of at most 70 each:
    210 domains must yield exactly 3 chunks."""
    expected_chunks = 3
    _get_all_domains.return_value = [
        str(x) for x in range(70 * expected_chunks)
    ]
    handle = open_mock.return_value.__enter__.return_value
    handle.read.return_value = TEMPLATE

    main.export(datetime(2020, 1, 1))

    # The original had a bare annotated expression
    # (`...call_args_list[0][0][0]: str`) which is a no-op statement;
    # bind the written payload to a real variable instead.
    output: str = handle.write.call_args_list[0][0][0]
    assert output.count("<apps:property name='from'") == expected_chunks
def compare_words(path: str, out_path: str, percent_threshold: int):
    """Fuzzy-compare every unordered pair of words from the JSON at *path*
    and export the pairs whose similarity ratio is >= *percent_threshold*,
    sorted by ratio descending.

    The result is written via export() under OUTPUT_FOLDER/out_path with
    columns ('word1', 'word2', 'percent').
    """
    data = pd.read_json(path)
    words = list(data.word)
    matches = []
    print(f'started comparing: {datetime.now()}')
    # Visit each unordered pair exactly once. This replaces the original
    # `while words: word = words.pop(0)` loop — pop(0) is O(n) per call and
    # destroys the list; enumerate + slice yields the same pairs in the
    # same order without either cost.
    for i, word in enumerate(words):
        for other in words[i + 1:]:
            ratio = fuzz.ratio(word, other)
            if ratio >= percent_threshold:
                matches.append((word, other, ratio))
    res = pd.DataFrame(matches, columns=['word1', 'word2', 'percent'])
    res = res.sort_values(by='percent', ascending=False)
    export(res, os.path.join(OUTPUT_FOLDER, out_path))
def create_file_with_category(path: str, compression: str, query: str, out_file_name: str, chunk_size: int = 10**6):
    """Process a (possibly compressed) line-delimited JSON file in chunks on a
    process pool, then export the two concatenated result frames.

    Each process_chunk call is assumed to return a pair of DataFrames; the
    first elements are concatenated into OUTPUT_FOLDER/out_file_name and the
    second into OUTPUT_FOLDER/id_title_<out_file_name>.
    """
    funcs = []
    opts = {'lines': True, 'chunksize': chunk_size}
    if compression:
        opts['compression'] = compression
    with mp.Pool(mp.cpu_count()) as pool, pd.read_json(path, **opts) as reader:
        for idx, chunk in enumerate(reader, 1):
            print(f'processing chunk #{idx}: {datetime.datetime.now()}')
            funcs.append(pool.apply_async(process_chunk, [chunk, query]))
        print('concatenating chunks')
        # BUG FIX: the original used zip(gen) with a single iterable, which
        # yields 1-tuples of each chunk's result pair, so res[0]/res[1] were
        # not the two result streams. zip(*...) transposes the pairs:
        # res[0] = all first frames, res[1] = all second frames.
        # .get() must happen inside the `with` block — Pool.__exit__
        # terminates the workers.
        res = tuple(zip(*(func.get() for func in funcs)))
    export(pd.concat(res[0]), os.path.join(OUTPUT_FOLDER, out_file_name))
    export(pd.concat(res[1]),
           os.path.join(OUTPUT_FOLDER, f'id_title_{out_file_name}'))
def create_chunk_files_with_category(path: str, compression: str, query: str, out_file_name: str, chunk_size: int = 10**6):
    """Process a (possibly compressed) line-delimited JSON file in chunks on a
    process pool, exporting each chunk's result pair to its own uniquely
    named output file under OUTPUT_FOLDER.
    """
    opts = {'lines': True, 'chunksize': chunk_size}
    if compression:
        opts['compression'] = compression
    with mp.Pool(mp.cpu_count()) as pool, pd.read_json(path, **opts) as reader:
        funcs = []
        for idx, chunk in enumerate(reader, 1):
            print(f'processing chunk #{idx}: {datetime.datetime.now()}')
            # FIX: the original called .get() immediately after apply_async,
            # which blocks on each task and serialises the whole run,
            # defeating the pool. Submit everything first, collect after.
            funcs.append(pool.apply_async(process_chunk, [chunk, query]))
        for func in funcs:
            res = func.get()
            _id = uuid.uuid4()
            export(res[0], os.path.join(OUTPUT_FOLDER,
                                        f'{_id}_{out_file_name}'))
            export(res[1], os.path.join(OUTPUT_FOLDER,
                                        f'{_id}_id_title_{out_file_name}'))
def export(self):
    """Append the footer row to the accumulated rows and hand them off
    to main.export under the "Project 1" label."""
    # NOTE(review): "exporttt" looks like a leftover debug typo — confirm
    # before changing, since it is runtime output.
    print("exporttt")
    self.array += ["footer"]
    main.export(self.array, "Project 1")