def get_data():
    executor = ThreadPoolExecutor(max_workers=WORKER)
    # get the res dir ready
    mkdir_res()
    # get url (pass the futures as a list; as_completed takes an iterable of futures)
    list(
        as_completed([
            executor.submit(get_chengjiao_house_url),
            executor.submit(get_ershoufang_house_url),
        ]))
    # get ershoufang info
    list(
        as_completed(
            executor.submit(get_ershoufang_house_info, hs)
            for hs, name in HOUSE_DISTRICT_DICT.items()
            if not (DATA_DIR / "house_info" / "ershoufang" / f"{hs}.json").is_file()))
    # get chengjiao info
    list(
        as_completed(
            executor.submit(get_chengjiao_house_info, hs)
            for hs, name in HOUSE_DISTRICT_DICT.items()
            if not (DATA_DIR / "house_info" / "chengjiao" / f"{hs}.json").is_file()))
    # save to csv
    to_csv()

def run(start_page, end_page):
    with ThreadPoolExecutor(max_workers=8) as t:
        begin = time.time()
        obj_lst = []
        for i in range(start_page, end_page + 1):
            if i == 1:
                page_url = 'http://pic.netbian.com/index.html'
            else:
                page_url = f'http://pic.netbian.com/index_{i}.html'
            obj = t.submit(craw_detail, page_url)
            obj_lst.append(obj)
        for future in as_completed(obj_lst):
            data = future.result()
            # print(data)
            imgurl_lst.extend(data)
        print(imgurl_lst)
        print(len(imgurl_lst))
        print(f'elapsed: {time.time() - begin}')

        begin = time.time()
        obj_lst = []  # start a fresh list so we only wait on the download futures
        for imgurl in imgurl_lst:
            obj = t.submit(download, imgurl)
            obj_lst.append(obj)
        for future in as_completed(obj_lst):
            data = future.result()
        print(f'elapsed: {time.time() - begin}')

def RunCOESim(generation, organisms):
    global currentGen
    currentGen = generation
    resultPath = os.path.join(
        baseResultPath,
        f"{scenario}-{generation}" if not scenario == "" else str(generation))

    # Get only the distinct organisms to simulate in each generation,
    # otherwise we will run into problems eventually.
    # Has to be done sequentially, annoyingly.
    orgsToSimulate = []
    for organism in organisms:
        if organism not in orgsToSimulate:
            orgsToSimulate.append(organism)

    # Do some DP stuff so we can potentially cut down on the amount of actual simulation
    if not generation == 0:
        with open(os.path.join(resultPath, "../GAResults.json"), "r") as f:
            resJson = json.load(f)
        with ThreadPoolExecutor(max_workers=threads) as executor:
            sims = {
                executor.submit(FindPreviousOrganisms, organism, generation,
                                resJson, resultPath, scenario,
                                baseResultPath): organism
                for organism in orgsToSimulate
            }
            for result in as_completed(sims):
                if debugOutput:
                    print(
                        f"Found match in previous generation: {result.result()}"
                    )

    outputQ = []
    with ThreadPoolExecutor(max_workers=threads) as executor:
        sims = {
            executor.submit(defineRunAndEvaluateSimulation, parsedMMJson,
                            scenario, organism, dseConfig, resultPath,
                            basePath, threads > 1, coeConfig,
                            debugOutput): organism
            for organism in orgsToSimulate
        }
        for result in as_completed(sims):
            outputQ.append(result.result())

    addSimulationDirToRankingFileThreaded(outputQ, resultPath)

def launch(self):
    self._tonglian_init()
    self._yuqing_init()
    max_id, min_id = self.select_max_title_id()
    print("news_id range: ", min_id, max_id)
    for i in range(min_id // self.batch_num, max_id // self.batch_num + 1):
        news_id_start = self.batch_num * i
        news_id_end = self.batch_num * (i + 1)
        print("current range: ", news_id_start, news_id_end)
        sql = '''select T.NEWS_ID, T.NEWS_PUBLISH_TIME, T.NEWS_TITLE, T.NEWS_PUBLISH_SITE, B.NEWS_BODY \
        from vnews_content_v1 T, vnews_body_v1 B \
        where T.NEWS_ID >= {} and T.NEWS_ID <= {} \
        and B.NEWS_ID >= {} and B.NEWS_ID <= {} \
        and T.NEWS_ID = B.NEWS_ID \
        and T.NEWS_PUBLISH_TIME between '{}' and '{}'; '''.format(
            news_id_start, news_id_end, news_id_start, news_id_end,
            self.start_time, self.end_time)
        print("sql: ", sql)
        datas = self.tonglian_client.select_all(sql)
        print("rows fetched: ", len(datas))
        items = []
        with ThreadPoolExecutor(max_workers=10) as t:
            res = [t.submit(self.post_api, data) for data in datas]
            for future in as_completed(res):
                item = future.result()
                if item:
                    # print(">>> ", item)
                    items.append(item)
        print(len(items))
        self._batch_save(self.yuqing_client, items, self.target_table_name,
                         self.target_fields)
        self.yuqing_client.end()

def launch2(self):
    self._tonglian_init()
    self._yuqing_init()
    dt = self.start_time
    while dt <= self.end_time:
        end_dt = dt + datetime.timedelta(days=1)
        sql = '''select T.NEWS_ID, T.NEWS_URL, T.NEWS_ORIGIN_SOURCE, T.NEWS_PUBLISH_TIME, T.NEWS_TITLE, T.NEWS_PUBLISH_SITE, B.NEWS_BODY \
        from vnews_content_v1 T, vnews_body_v1 B \
        where T.NEWS_PUBLISH_TIME between '{}' and '{}' \
        and T.NEWS_ID = B.NEWS_ID; '''.format(dt, end_dt)
        print(sql)
        datas = self.tonglian_client.select_all(sql)
        print("rows fetched: ", len(datas))
        items = []
        with ThreadPoolExecutor(max_workers=10) as t:
            res = [t.submit(self.post_api, data) for data in datas]
            for future in as_completed(res):
                item = future.result()
                if item:
                    items.append(item)
        print(len(items))
        self._batch_save(self.yuqing_client, items, self.target_table_name,
                         self.target_fields)
        self.yuqing_client.end()
        dt = end_dt

def extract_all_fuz(cleaned_output: Path, work_dir: Path):
    """
    Extracts FUZ files into XWM files.
    """
    fuz_dir = work_dir / "extracted"
    xwa_dir = work_dir / "wav"

    # This is entirely I/O based so it's a good candidate to run in parallel.
    candidates = []
    with cleaned_output.open(mode="r", newline="") as f:
        reader = csv.DictReader(f)
        for line in reader:
            infile = fuz_dir / line["original_path"].lower()
            infile = infile.with_suffix(".fuz")
            outfile = xwa_dir / line["original_path"].lower()
            outfile = outfile.with_suffix(".xwm")
            outfile.parent.mkdir(parents=True, exist_ok=True)
            candidates.append((infile, outfile))

    count = 0
    with ThreadPoolExecutor() as exe:
        futures = [exe.submit(lambda tp: extract_fuz(*tp), i) for i in candidates]
        for i in as_completed(futures):
            count += 1
            if count % 100 == 0:
                print(f"Extracted {count} files...")

def main():
    Q = QiniuProvider()
    countries = parseTable(
        Link(f'{domain}/wiki/List_of_IOC_country_codes').getText())
    beginTime = datetime.now()
    with ThreadPoolExecutor(max_workers=10) as pool:
        allTasks = []
        sqlFile = open('countryList.sql', 'w', encoding="utf-8")
        sqlWriter = SQLExporter(sqlFile, 'nationality', ['name', 'code', 'flag'])
        for country in countries:
            print(f'grabbing {country.name}...')
            allTasks.append(
                pool.submit(
                    executor,
                    ExecutorParams(q=Q, country=country, writer=sqlWriter)))
        for task in as_completed(allTasks):
            print(f'{task.result()} downloaded.')
        sqlFile.close()
    endTime = datetime.now()
    print(f'run time: {endTime - beginTime}')

def generate_files_multithreaded(self, query, query_right=None):
    query_to_run_left = query
    # If only one query is supplied, run the same query on both connections.
    query_to_run_right = query
    if query_right:
        # If a second query is supplied for the right side, use it instead.
        query_to_run_right = query_right

    left_stx = SqlToXl(self.left_connection_string)
    right_stx = SqlToXl(self.right_connection_string)
    futures = []
    with ThreadPoolExecutor(max_workers=2) as executor:
        futures.append(
            executor.submit(
                left_stx.save_sql,
                *[query_to_run_left, self.left_file_path, self.left_sheet]))
        futures.append(
            executor.submit(
                right_stx.save_sql, *[
                    query_to_run_right, self.right_file_path, self.right_sheet
                ]))
        for f in as_completed(futures):
            if f.exception():
                logging.error("received exception from thread {}".format(
                    f.exception()))
                raise f.exception()
            else:
                logging.info("received result from thread {}".format(
                    f.result()))
    return self.left_file_path, self.right_file_path

def run_desired_simulataneous_get_object_calls(
    self,
    s3_olap_arn,
    file_name,
    expected_error,
    connection_counts=DEFAULT_CONNECTIONS,
    period_in_minutes=DEFAULT_PERIOD,
):
    logging.info(
        f"Running Load Test for file: {file_name} for period {period_in_minutes} with connection counts: {connection_counts}"
    )
    s = ThreadPoolExecutor(max_workers=connection_counts)
    futures = [
        s.submit(self.fail_safe_fetch_s3_object,
                 s3_olap_arn,
                 file_name,
                 period=period_in_minutes * 60,
                 expected_error=expected_error)
        for i in range(0, connection_counts)
    ]
    total_counts = 0
    successful_counts = 0
    average_latency = 0
    for f in as_completed(futures):
        successful_counts += f.result()[1]
        total_counts += f.result()[0] + f.result()[1]
        average_latency = f.result()[2] / total_counts
    logging.info(
        f"Total calls made: {total_counts}, out of which {successful_counts} calls were successful"
        f" ({successful_counts / total_counts * 100}%), Average Latency {average_latency}"
    )
    return total_counts, total_counts - successful_counts, average_latency

def contains_pii_entities(
        self,
        documents: List[Document],
        language=DEFAULT_LANGUAGE_CODE) -> List[Document]:
    """Call comprehend to get pii classification of given documents."""
    documents_copy = deepcopy(documents)
    result = []
    with self.classification_executor_service:
        futures = []
        for doc in documents_copy:
            futures.append(
                self.classification_executor_service.submit(
                    self._update_doc_with_pii_classification, doc, language))
        for future_result in as_completed(futures):
            try:
                result.append(future_result.result())
            except Exception as error:
                LOG.error(
                    "Error occurred while calling comprehend for classifying text as pii",
                    exc_info=True)
                self.classify_metrics.add_fault_count()
                raise error
    return result

def reap(
    path,
    known_bad_packages=(),
    number_to_reap=1000,
):
    if not os.path.exists(path):
        os.makedirs(path)
    sorted_files = list(diff(path))
    print(f"TOTAL OUTSTANDING ARTIFACTS: {len(sorted_files)}")
    sorted_files = sorted_files[:number_to_reap]

    with executor(max_workers=5, kind="dask") as pool:
        futures = {
            pool.submit(
                fetch_and_run,
                path,
                package,
                dst,
                src_url,
                # progress.update
            ): (package, dst, src_url)
            for package, dst, src_url in sorted_files
            if (src_url not in known_bad_packages)
        }
        for f in tqdm(as_completed(futures), total=len(sorted_files)):
            try:
                f.result()
            except ReapFailure as e:
                print(f"FAILURE {e.args}")
            except Exception:
                pass

def report_conda_forge_names_from_import_map(total_imports,
                                             builtin_modules=None,
                                             ignore=None):
    if ignore is None:
        ignore = []
    if builtin_modules is None:
        builtin_modules = _builtin_modules
    report_keys = [
        'required', 'questionable', 'builtin', 'questionable no match',
        'required no match'
    ]
    report = {k: set() for k in report_keys}
    import_to_pkg = {k: {} for k in report_keys}
    import_to_artifact = {k: {} for k in report_keys}
    futures = {}

    with ThreadPoolExecutor() as pool:
        for name, md in total_imports.items():
            if all([
                    any(
                        fnmatch(filename, ignore_element)
                        for ignore_element in ignore)
                    for filename, _ in md
            ]):
                continue
            elif recursively_search_for_name(name, builtin_modules):
                report['builtin'].add(name)
                continue
            future = pool.submit(extract_pkg_from_import, name)
            futures[future] = md
        for future in as_completed(futures):
            md = futures[future]
            most_likely_pkg, _import_to_artifact, _import_to_pkg = future.result()
            for (filename, lineno), import_metadata in md.items():
                # Make certain to throw out ignored imports, since an import can happen
                # multiple times under different situations: import matplotlib is required
                # by a test file but is questionable for a regular file.
                if any(
                        fnmatch(filename, ignore_element)
                        for ignore_element in ignore):
                    continue
                if any(
                        import_metadata.get(v, False)
                        for v in SKETCHY_TYPES_TABLE.values()):
                    # if we couldn't find any artifacts to represent this then it doesn't exist in our maps
                    if not _import_to_artifact:
                        report_key = 'questionable no match'
                    else:
                        report_key = 'questionable'
                else:
                    # if we couldn't find any artifacts to represent this then it doesn't exist in our maps
                    if not _import_to_artifact:
                        report_key = 'required no match'
                    else:
                        report_key = 'required'
                report[report_key].add(most_likely_pkg)
                import_to_pkg[report_key].update(_import_to_pkg)
                import_to_artifact[report_key].update(_import_to_artifact)
    return report, import_to_artifact, import_to_pkg

def test_emergency_shutdown(self, mock_scan_commands):
    # Given a lot of servers to scan
    total_server_scans_count = 100
    server_scans = [
        ServerScanRequest(
            server_info=ServerConnectivityInfoFactory.create(),
            scan_commands={ScanCommandForTests.MOCK_COMMAND_1,
                           ScanCommandForTests.MOCK_COMMAND_2},
        )
        for _ in range(total_server_scans_count)
    ]

    # And the scans get queued
    scanner = Scanner()
    for scan in server_scans:
        scanner.queue_scan(scan)

    # When trying to quickly shutdown the scanner, it succeeds
    scanner.emergency_shutdown()

    # And all the queued jobs were done or cancelled
    all_queued_futures = []
    for server_scan in scanner._queued_server_scans:
        all_queued_futures.extend(server_scan.all_queued_scan_jobs)
    for completed_future in as_completed(all_queued_futures):
        assert completed_future.done()

def post_api(self, datas):
    params = []
    for data in datas:
        title = data.get("Title2")
        if not title:
            title = data.get('SecuAbbr') + data.get("Title1")
        req_data = {
            'texttype': 'ann',
            'title': title,
            'content': title,
            'prolist': ['event_ann'],
        }
        params.append((req_data, data, title))

    items = []
    with ThreadPoolExecutor(max_workers=10) as t:
        res = [t.submit(self.post_task, *param) for param in params]
        for future in as_completed(res):
            item = future.result()
            if item:
                items.append(item)

    # for param in params:
    #     try:
    #         item = self.post_task(*param)
    #     except:
    #         item = None
    #     if item:
    #         items.append(item)
    return items

def fetchDataAsList(self):
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for source in self.subSources:
            futures.append(executor.submit(source.fetchDataAsList))
        for future in as_completed(futures):
            print('completed load of source data in list')

def have_valid_relative_data(self, data, relative="child_relation"):
    with ProcessPoolExecutor() as executor:
        res = []
        for index, relation_meta in enumerate(
                self.metadata_c.metadata["sequential_info"][relative]):
            res.append(
                executor.submit(self._search_file,
                                relation_meta["path_of_child_table"], index,
                                data, relative))
        for f in as_completed(res):
            if not f.result():
                return False
    return True

def scan_ports(self, ports: typing.Iterable[int]) -> typing.List[ScanResult]:
    futures = []
    with Halo(text="***** Port scanning in progress...", color="blue"):
        with self.executor as executor:
            for port in ports:
                futures.append(executor.submit(self.scan, port))
        return [
            x for x in [f.result() for f in as_completed(futures)]
            if x.is_open
        ]

def fix_metadata(date, workers):
    uri = f"s3://dea-public-data/baseline/s2b_ard_granule/{date}/**/*.yaml"
    fetcher = S3Fetcher(aws_unsigned=True)
    s3_obj_stream = s3_find_glob(uri, skip_check=True, s3=fetcher)
    s3_url_stream = (o.url for o in s3_obj_stream)
    data_stream = list(fetcher(s3_url_stream))

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(process_dataset, s3_obj) for s3_obj in data_stream]
        for future in as_completed(futures):
            if future.exception() is not None:
                raise future.exception()

def find(host_list: list):
    records = []
    with ThreadPoolExecutor(max_workers=cpu_count() + 1) as pool:
        task_list = []
        for host in host_list:
            obj = pool.submit(parse_url, host.strip().replace("\n", ""))
            task_list.append(obj)
        for task in as_completed(task_list):
            print(task.result())
            records.append(task.result())
    return records

def t1():
    def job(a):
        time.sleep(random.random())
        return a * 2

    with ThreadPoolExecutor(max_workers=50) as executor:
        future_to_param = {}
        for i in range(30):
            future_to_param[executor.submit(job, i)] = i
        for future in as_completed(future_to_param):
            result = future.result()
            print(f"{future_to_param[future]} -> {result}")

def main(granule_ids, sns_topic_arn, workers):
    """
    Script to sync Sentinel-2 data from NCI to AWS S3 bucket

    Pass in a file containing destination S3 urls that need to be uploaded.
    """
    setup_logging()

    granule_ids = [granule_id.strip() for granule_id in granule_ids.readlines()]
    _LOG.info(f"{len(granule_ids)} granules to upload.")

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(upload_granule, granule_id, sns_topic_arn)
                   for granule_id in granule_ids]
        for future in tqdm(as_completed(futures), total=len(granule_ids),
                           unit='granules', disable=None):
            _LOG.info(f"Completed upload: {future.result()}")

def main(args, device, num_available_devices):
    model_path = Path(args.model)
    root_dir = Path(args.root_dir)
    image_paths = [
        file_name for file_name in root_dir.glob('**/*') if is_image(file_name)
    ]

    analyzed_images = []
    ctx = multiprocessing.get_context('forkserver')
    executor = ProcessPoolExecutor(max_workers=num_available_devices,
                                   mp_context=ctx,
                                   initializer=init_process,
                                   initargs=(model_path, not args.no_split, device))
    try:
        with executor:
            # Map each future to its image path so failures report the correct file.
            current_jobs = {}
            for i, image_path in enumerate(image_paths):
                submitted_job = executor.submit(
                    consumer, image_path, str(image_path.relative_to(root_dir)))
                current_jobs[submitted_job] = image_path

            for job in tqdm(as_completed(current_jobs), total=len(current_jobs)):
                try:
                    result = job.result()
                    analyzed_images.append(result)
                except Exception as e:
                    print(f"Could not process {str(current_jobs[job])}, reason: {e}")
                    traceback.print_exc(file=sys.stdout)
    except KeyboardInterrupt:
        pass

    with (root_dir / 'handwriting_analysis.json').open('w') as f:
        json.dump(analyzed_images, f, indent='\t')

    num_has_handwriting = len(
        [im for im in analyzed_images if im['has_handwriting']])
    print(
        f"Handwriting to no handwriting ratio: {num_has_handwriting / len(analyzed_images)}"
    )

def main(s3_urls, workers):
    """
    Script to sync Sentinel-2 data from NCI to AWS S3 bucket

    Pass in a file containing destination S3 urls that need to be uploaded.
    """
    setup_logging()

    global S3
    S3 = s3_client()

    urls_to_upload = [url.strip() for url in s3_urls.readlines()]
    _LOG.info(f"{len(urls_to_upload)} datasets to upload.")

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(upload_dataset, s3_url) for s3_url in urls_to_upload]
        for future in tqdm(as_completed(futures), total=len(urls_to_upload),
                           unit='datasets', disable=None):
            _LOG.info(f"Completed upload: {future.result()}")

def compare_grayskull_audits(gx):
    grayskull_files = os.listdir("audits/grayskull")
    bad_inspections = {}

    if "_net_audit.json" in grayskull_files:
        grayskull_files.pop(grayskull_files.index("_net_audit.json"))
        with open("audits/grayskull/_net_audit.json") as f:
            bad_inspections = load(f)

    futures = {}
    with executor("dask", max_workers=20) as pool:
        for node, attrs in gx.nodes("payload"):
            if not attrs.get("version"):
                continue
            node_version = f"{node}_{attrs['version']}"
            if node_version in bad_inspections:
                continue

            # construct the expected filename
            expected_filename = f"{node_version}.yml"
            if expected_filename in grayskull_files:
                with open(
                    os.path.join("audits/grayskull", expected_filename),
                ) as f:
                    meta_yaml = f.read()

                futures[
                    pool.submit(
                        inner_grayskull_comparison,
                        meta_yaml=meta_yaml,
                        attrs=attrs,
                        node=node,
                    )
                ] = node_version

        for future in as_completed(futures):
            try:
                bad_inspections[futures[future]] = future.result()
            except Exception as e:
                bad_inspections[futures[future]] = str(e)

    with open("audits/grayskull/_net_audit.json", "w") as f:
        dump(bad_inspections, f)
    return bad_inspections

def launch(self):
    self._yuqing_init()
    # 2020-09-01 - 2020-10-01
    end_time = datetime.datetime(2020, 11, 6)
    start_time = datetime.datetime(2020, 10, 20)
    dt = start_time
    while dt <= end_time:
        dt_next = dt + datetime.timedelta(days=1)
        limit_start = 0
        while True:
            # TODO source codes: 1002 = Eastmoney, 1007 = Tonghuashun
            sql = '''select * from {} where OrgTableCode = '1002' and PubDatetime >= '{}' and PubDatetime <= '{}' order by id limit {}, {};'''.format(
                self.source_table,
                dt,
                dt_next,
                limit_start * self.batch_num,
                self.batch_num,
            )
            print(sql)
            datas = self.yuqing_client.select_all(sql)
            print("select datas: ", len(datas))
            if len(datas) == 0:
                break

            items = []
            with ThreadPoolExecutor(max_workers=10) as t:
                res = [t.submit(self.post_api, data) for data in datas]
                for future in as_completed(res):
                    item = future.result()
                    if item:
                        items.append(item)
            print(limit_start, len(items))
            if items:
                self._batch_save(self.yuqing_client, items, self.target_table,
                                 self.target_fields)
                self.yuqing_client.end()
            limit_start += 1
        dt = dt_next

def get_links(self):
    if not self.urls:
        print("done, image links parsed: %d" % len(self.images))
        print(*self.images, sep='\n')
        return self.images

    with ThreadPoolExecutor(cpu_count()) as executor:
        future_to_page = {
            executor.submit(Scraper.one_page_crawl, page_url): page_url
            for page_url in self.urls
        }
        for future in as_completed(future_to_page):
            url_done = future_to_page[future]
            links, images = future.result()
            self.images |= images
            self.done.add(url_done)
            self.urls.remove(url_done)
            self.urls |= links - self.done
    print("\nurls to crawl: %s\nurls done: %s\n" % (self.urls, self.done))
    return self.get_links()

def parse_parallel(parser: AppStoreParser, max_workers=20) -> List[Review]:
    """
    Parse app reviews in parallel

    :AppStoreParser parser: parser object
    :int max_workers: the maximum number of threads that can be used to parse reviews
    """
    rating_count = parser.get_app_rating_count()
    LOGGER.info(f'App "{parser.app_name}" has {rating_count} reviews')
    if rating_count > MAX_REVIEWS:
        rating_count = MAX_REVIEWS
        LOGGER.warning(f'App "{parser.app_name}" has more than {MAX_REVIEWS} reviews')

    last_page = rating_count // REVIEWS_PER_PAGE
    LOGGER.info(f'Reviews to scan: {rating_count}')

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pages_range = range(1, last_page + 1)
        future_to_page = {
            executor.submit(parser.get_reviews_page, page): page
            for page in pages_range
        }
        for future in as_completed(future_to_page):
            page = future_to_page[future]
            try:
                reviews = future.result()
            except Exception as exc:
                LOGGER.error(f'Exception on page #{page:03d}: {exc!r}')
            else:
                LOGGER.info(f'Page #{page:03d} successfully scanned')
                results.append((page, reviews))

    results.sort(key=itemgetter(0))
    reviews = [r for reviews_from_page in results for r in reviews_from_page[1]]
    LOGGER.info(f'Scanned reviews: {len(reviews)}')
    return reviews

def find_supplying_version_set(
        volume, get_symbol_table_func=web_interface.get_symbol_table):
    supplying_versions = {}
    effective_volume = sorted(volume - builtin_symbols)
    symbol_by_top_level = groupby(effective_volume,
                                  key=lambda x: x.partition(".")[0])
    bad_symbols = set()
    with ThreadPoolExecutor() as pool:
        futures = {
            pool.submit(get_supply, top_level_import, list(v_symbols),
                        get_symbol_table_func): top_level_import
            for top_level_import, v_symbols in symbol_by_top_level
        }
        for future in as_completed(futures):
            top_level_import = futures[future]
            supplies, bad = future.result()
            supplying_versions[top_level_import] = supplies
            bad_symbols.update(bad)
    # TODO: handle the case where multiple pkgs export the same symbols?
    #  In that case we may want to merge those together somehow
    # TODO: handle case where no pkg supports the symbol?
    return supplying_versions, bad_symbols

def main(n_to_pull=1000):
    path = "audit"
    if os.path.exists(os.path.join(path, "_inspection_version.txt")):
        with open(os.path.join(path, "_inspection_version.txt")) as f:
            db_version = f.read()
    else:
        db_version = ""
    if db_version != complete_version and os.path.exists(path):
        shutil.rmtree(path)
    if not os.path.exists(path):
        os.makedirs(path)
        with open(os.path.join(path, "_inspection_version.txt"), "w") as f:
            f.write(complete_version)

    all_extracted_artifacts = web_interface.get_current_extracted_pkgs()
    existing_artifacts = glob.glob(f"{path}/**/*.json", recursive=True)
    existing_artifact_names = {k.partition("/")[2] for k in existing_artifacts}
    artifacts = sorted(
        list(set(all_extracted_artifacts) - set(existing_artifact_names)))
    # Don't have the artifacts in alphabetical order
    shuffle(artifacts)

    with ThreadPoolExecutor() as pool:
        futures = [
            pool.submit(inner_loop_and_write, artifact)
            for artifact in artifacts[:n_to_pull]
        ]
        for future in tqdm(as_completed(futures), total=n_to_pull):
            try:
                future.result()
            except requests.exceptions.ConnectionError:
                pass

def handle(self, *args, **options):
    url = options['url']
    tmp_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tmp')
    start = time.time()

    # Open a streaming connection to download the large file
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # Counter i limits how much is downloaded
        i = 0
        for chunk in r.iter_lines(chunk_size=100000, decode_unicode=True):
            if i >= MAX_LINES_COUNT:  # about 100 MB
                break
            if not chunk:
                continue
            # Distribute lines randomly across files based on the number of CPU cores
            # (line order does not matter)
            with open(f'{tmp_path}/access_log{random.randint(0, min(32, (os.cpu_count() or 1) + 4) - 1)}', 'at') as f:
                f.write(f'{chunk}\n')
            i += 1

    download_time = time.time()
    print(f'Log download took: {download_time - start}')

    # Create a thread pool, submit a file-reading function for each file,
    # and wait for each one to finish processing
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(self.bulk_create_logs, f'{tmp_path}/{filename}')
                   for filename in os.listdir(tmp_path)]
        for idx, future in enumerate(as_completed(futures)):
            print(f'File #{idx} processed')

    print(f'Import finished. Processing time: {time.time() - download_time}')

    if DELETE_AFTER_IMPORT:
        for filename in os.listdir(tmp_path):
            os.unlink(f'{tmp_path}/{filename}')

    return all_symbol_tables


if __name__ == "__main__":
    web_interface = WebDB()
    extracted_artifacts = web_interface.get_current_symbol_table_artifacts()
    all_artifacts = web_interface.get_current_extracted_pkgs().values()
    artifacts_to_index = list(set(all_artifacts) - set(extracted_artifacts))
    print(f"Number of artifacts to index: {len(artifacts_to_index)}")
    # The shuffle here is to try to not have two threads running on the same symbol table json at once if possible
    shuffle(artifacts_to_index)

    pool = ThreadPoolExecutor()
    # Note that there is a race condition here: two threads could try to write to the same symbol table.
    # However one of those will win, so next round there will be one added safely, and this continues
    # until none are left to be added.
    print("issuing futures")
    futures = {
        pool.submit(inner_loop, artifact_name): artifact_name
        for artifact_name in tqdm(artifacts_to_index[:10000])
    }
    print("awaiting futures")
    for future in tqdm(as_completed(futures), total=len(futures)):
        print(futures[future])
        try:
            future.result()
        except Exception as e:
            print(e)
    pool.shutdown()