예제 #1
0
 def test_export(self, _get_all_domains: MagicMock, open_mock: MagicMock):
     """Check that export writes once and joins all domains with ``OR``.

     The two configured domains must show up in a single ``from`` property
     inside the first string handed to ``write`` on the mocked file handle.
     """
     # Object yielded by ``with <open_mock>(...) as f`` in the code under test.
     file_handle = open_mock.return_value.__enter__.return_value
     file_handle.read.return_value = TEMPLATE
     _get_all_domains.return_value = ["domain.com", "email.domain.com"]

     main.export(datetime(2020, 1, 1))

     expected_property = "<apps:property name='from' value='@domain.com OR @email.domain.com'/>"
     assert file_handle.write.call_count == 1
     written = file_handle.write.call_args_list[0][0][0]
     assert expected_property in written
예제 #2
0
 def test_export_uses_unique_ids_for_filters(self,
                                             _get_all_domains: MagicMock,
                                             open_mock: MagicMock):
     """Check that every ``<id>`` element in the exported output is distinct.

     Feeds 210 (70 * 3) domains through ``main.export`` and inspects the
     first string passed to ``write`` on the mocked file handle.
     """
     _get_all_domains.return_value = [str(x) for x in range(70 * 3)]
     file_handle = open_mock.return_value.__enter__.return_value
     file_handle.read.return_value = TEMPLATE
     main.export(datetime(2020, 1, 1))
     output: str = file_handle.write.call_args_list[0][0][0]
     # Non-greedy on purpose: a greedy ``.+`` would swallow several
     # ``<id>...</id>`` pairs sharing one line into a single match, which
     # would hide duplicates and let the check pass vacuously.
     ids: List[str] = re.findall(r"<id>(.+?)</id>", output)
     assert len(ids) == len(set(ids))
예제 #3
0
 def test_export_split_in_chuncks_of_70(self, _get_all_domains: MagicMock,
                                        open_mock: MagicMock):
     """Check that 70 * N domains produce exactly N ``from`` properties.

     The count is taken on the first string passed to ``write`` on the
     mocked file handle.
     """
     expected_chunks = 3
     _get_all_domains.return_value = [
         str(x) for x in range(70 * expected_chunks)
     ]
     file_handle = open_mock.return_value.__enter__.return_value
     file_handle.read.return_value = TEMPLATE
     main.export(datetime(2020, 1, 1))
     # Bind the written text once; the original had a bare annotated
     # expression here (no assignment), which was a dead statement.
     output: str = file_handle.write.call_args_list[0][0][0]
     assert output.count("<apps:property name='from'") == expected_chunks
def compare_words(path: str, out_path: str, percent_threshold: int):
    """Score every unordered pair of words and export the close matches.

    Reads a JSON file with ``pd.read_json`` and takes its ``word`` column.
    Each word is compared against every word that follows it using
    ``fuzz.ratio``; pairs scoring at least ``percent_threshold`` are kept.
    The kept pairs are sorted by score, highest first, and passed to
    ``export`` with a target path under ``OUTPUT_FOLDER``.

    Args:
        path: Location of the JSON input.
        out_path: File name (relative to ``OUTPUT_FOLDER``) for the result.
        percent_threshold: Minimum ``fuzz.ratio`` score for a pair to be kept.
    """
    frame = pd.read_json(path)
    candidates = list(frame.word)
    print(f'started comparing: {datetime.now()}')

    matches = []
    for position, left in enumerate(candidates):
        # Only look at later words so each unordered pair is scored once.
        for right in candidates[position + 1:]:
            score = fuzz.ratio(left, right)
            if score < percent_threshold:
                continue
            matches.append((left, right, score))

    table = pd.DataFrame(matches, columns=['word1', 'word2', 'percent'])
    ranked = table.sort_values(by='percent', ascending=False)
    export(ranked, os.path.join(OUTPUT_FOLDER, out_path))
def create_file_with_category(path: str,
                              compression: str,
                              query: str,
                              out_file_name: str,
                              chunk_size: int = 10**6):
    """Process a line-delimited JSON file in parallel and export two files.

    The input is read in chunks of ``chunk_size`` rows; every chunk is sent
    to ``process_chunk(chunk, query)`` on a worker pool. Each result is
    indexed as a pair: the first items of all chunks are concatenated and
    exported as ``out_file_name``, the second items as
    ``id_title_<out_file_name>``, both under ``OUTPUT_FOLDER``.

    Args:
        path: Location of the JSON-lines input.
        compression: Compression name forwarded to ``pd.read_json``; a falsy
            value leaves pandas' default in place.
        query: Forwarded unchanged to ``process_chunk``.
        out_file_name: Base name of the exported files.
        chunk_size: Number of rows per chunk.

    Raises:
        ValueError: If the input yields no chunks at all.
    """
    opts = {'lines': True, 'chunksize': chunk_size}
    if compression:
        opts['compression'] = compression
    with mp.Pool(mp.cpu_count()) as pool, pd.read_json(path, **opts) as reader:
        pending = []
        for idx, chunk in enumerate(reader, 1):
            print(f'processing chunk #{idx}: {datetime.datetime.now()}')
            pending.append(pool.apply_async(process_chunk, [chunk, query]))
        # Collect while the pool is still alive: leaving the ``with`` block
        # calls ``Pool.terminate()``, after which unfinished tasks never
        # complete and ``get()`` would block forever.
        results = [task.get() for task in pending]

    if not results:
        raise ValueError(f'no chunks were read from {path!r}')

    print('concatenating chunks')
    # Transpose [(a0, b0), (a1, b1), ...] into ((a0, a1, ...), (b0, b1, ...)).
    columns = tuple(zip(*results))
    export(pd.concat(columns[0]), os.path.join(OUTPUT_FOLDER, out_file_name))
    export(pd.concat(columns[1]),
           os.path.join(OUTPUT_FOLDER, f'id_title_{out_file_name}'))
def create_chunk_files_with_category(path: str,
                                     compression: str,
                                     query: str,
                                     out_file_name: str,
                                     chunk_size: int = 10**6):
    """Process a line-delimited JSON file chunk by chunk, exporting each one.

    Every chunk of ``chunk_size`` rows is handed to
    ``process_chunk(chunk, query)`` through a worker pool. The two items of
    its result are exported under ``OUTPUT_FOLDER`` as
    ``<uuid>_<out_file_name>`` and ``<uuid>_id_title_<out_file_name>``, with
    a fresh UUID per chunk.

    Args:
        path: Location of the JSON-lines input.
        compression: Compression name forwarded to ``pd.read_json``; a falsy
            value leaves pandas' default in place.
        query: Forwarded unchanged to ``process_chunk``.
        out_file_name: Suffix shared by all exported file names.
        chunk_size: Number of rows per chunk.
    """
    reader_opts = dict(lines=True, chunksize=chunk_size)
    if compression:
        reader_opts['compression'] = compression

    with mp.Pool(mp.cpu_count()) as pool, \
            pd.read_json(path, **reader_opts) as reader:
        for idx, chunk in enumerate(reader, 1):
            print(f'processing chunk #{idx}: {datetime.datetime.now()}')
            # NOTE: get() blocks until this chunk is done, so chunks are
            # processed one at a time even though a pool is used.
            outcome = pool.apply_async(process_chunk, [chunk, query]).get()
            prefix = uuid.uuid4()

            categorized_target = os.path.join(OUTPUT_FOLDER,
                                              f'{prefix}_{out_file_name}')
            export(outcome[0], categorized_target)

            id_title_target = os.path.join(
                OUTPUT_FOLDER, f'{prefix}_id_title_{out_file_name}')
            export(outcome[1], id_title_target)
예제 #7
0
 def export(self):
     """Append a ``"footer"`` entry to ``self.array`` and pass it to ``main.export``.

     The list is mutated in place, so each call adds another footer entry.
     """
     print("exporttt")
     rows = self.array
     rows.append("footer")
     main.export(rows, "Project 1")