def scrape(input_path, output_path="~/Downloads", max_workers=20, sample_size=None, silly=None):
    # Read products from input csv
    prod_dicts = read_product_link_csv(input_path)
    products = list(map(Product.from_dict, prod_dicts))

    # Trim list of products if sample_size was provided
    if sample_size is not None and sample_size < len(products):
        products = products[:sample_size]

    # Uses concurrent.futures.ThreadPoolExecutor.map() to fetch results
    print(f"Scraping metadata for {len(products)} products.")
    thread_map(lambda p: p.update_metadata(), products, max_workers=max_workers)

    if silly:
        products = sillify(products)

    # Write to file
    write_product_link_csv(output_path, [p.to_dict() for p in products], listing_type="scrape")

def process_downloads(self, sites, collection):
    """
    Method to download and process files

    :param sites: List of files to download and process
    :type sites: list
    :param collection: MongoDB collection name
    :type collection: str
    :return:
    :rtype:
    """
    worker_size = min(32, os.cpu_count() + 4)
    start_time = time.time()

    thread_map(self.download_site, sites, desc="Downloading files")

    if self.do_process:
        thread_map(
            self.file_to_queue,
            self.file_queue.get_full_list(),
            desc="Processing downloaded files",
        )
        self._process_queue_to_db(worker_size, collection=collection)

    # checking if last-modified was in the response headers and not set to default
    if "01-01-1970" != self.last_modified.strftime("%d-%m-%Y"):
        setColUpdate(self.feed_type.lower(), self.last_modified)

    self.logger.info("Duration: {}".format(timedelta(seconds=time.time() - start_time)))

def preprocess_idrid_grade(root_dir: str, output_dir: str, n_workers: int, train: bool):
    root_path = Path(root_dir)
    output_path = Path(output_dir) / "idrid_grade"
    img_output_path = output_path / "img"
    img_output_path.mkdir(parents=True, exist_ok=True)
    img_transformed_output_path = output_path / "transformed"
    img_transformed_output_path.mkdir(parents=True, exist_ok=True)

    train_test_path = "a. Training Set" if train else "b. Testing Set"
    retina_path = root_path / "1. Original Images" / train_test_path
    print(retina_path)
    image_names = [f.name for f in retina_path.glob("**/*")]

    # Worker function that wraps the image processing function.
    def worker(image_name: str):
        process_image(
            image_name=image_name,
            retina_path=retina_path,
            img_output_path=img_output_path,
            img_transformed_output_path=img_transformed_output_path,
        )

    print(f"Preprocessing IDRiD Grade ({train_test_path}) with {n_workers} workers...")
    thread_map(worker, image_names, max_workers=n_workers)

def test_thread_map():
    """Test contrib.concurrent.thread_map"""
    with closing(StringIO()) as our_file:
        a = range(9)
        b = [i + 1 for i in a]
        try:
            assert thread_map(lambda x: x + 1, a, file=our_file) == b
        except ImportError:
            raise SkipTest
        assert thread_map(incr, a, file=our_file) == b

def _search_columns(col, *filters):
    outputs = synth_db.query(NHMOutput).filter(
        NHMOutput.Output_ID.notin_(self._added), *filters)
    thread_workers = context.config.resource_opt('dois.threads', 20)
    with self, ThreadPoolExecutor(thread_workers) as thread_executor:
        thread_map(lambda x: _extract_doi(self, x, col), outputs.all(),
                   desc=col, unit=' records', leave=False, position=1)

def process_df(df, f, group_name):
    # We save as [C, H, W]
    train_shape = (len(df), 3, 512, 512)
    train_labels = df["level"].to_numpy()

    grp = f.create_group(group_name)
    grp.create_dataset(
        "images",
        shape=train_shape,
        dtype=np.uint8,
        # compression="lzf",
    )
    grp.create_dataset("labels", shape=train_labels.shape, dtype=np.uint8)
    grp["labels"][...] = train_labels
    grp_images = grp["images"]

    def process_image(i):
        path = df["path"].iloc[i]
        img = open_colour_image(path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        contour = find_eye(img)
        x, y, w, h = cv2.boundingRect(contour)
        img = img[y : y + h, x : x + w]
        img = pad_to_square(img, w, h, [BLACK, BLACK, BLACK])
        img = cv2.resize(img, (512, 512))
        scale = 512
        img = cv2.addWeighted(
            img, 4, cv2.GaussianBlur(img, (0, 0), scale / 30), -4, 128
        )
        # Remove outer 10% boundary effects
        b = np.zeros(img.shape)
        b = cv2.circle(
            b,
            (img.shape[1] // 2, img.shape[0] // 2),
            int(scale * 0.9) // 2,
            (1, 1, 1),
            -1,
            8,
            0,
        )
        img = img * b + 128 * (1 - b)
        img = np.transpose(img, (2, 0, 1))
        grp_images[i, ...] = img[None]

    thread_map(process_image, range(len(df)), max_workers=8)

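# Hypothetical driver for process_df above (not part of the original snippet):
# a minimal sketch showing that `f` is expected to be an open h5py.File and that
# `df` needs "path" and "level" columns; the file path and column values here
# are made up for illustration.
import h5py
import pandas as pd

train_df = pd.DataFrame({"path": ["images/0001.jpeg"], "level": [2]})
with h5py.File("retina.h5", "w") as f:
    process_df(train_df, f, group_name="train")
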
def run_multithread(self, max_workers: int | None = None, quiet: bool = False) -> None:
    """
    Run the process for all genes in the Population.

    Parameters
    ----------
    :max_workers: Number of concurrent workers. If `None`, the algorithm will get
        the maximum possible (default: `None`).
    :quiet: If `False`, a bar will show the simulation progress (default: `False`).

    Updates
    -------
    self.datasets.results DataFrame
    """
    def sim(x):
        index, row = x
        return pd.DataFrame(self.process(**row), index=pd.Series([index], name='id'))

    iterable = self.datasets.population.iterrows()
    if quiet:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            creature_result_list = executor.map(sim, iterable)
    else:
        population_size = self.population.size
        creature_result_list = thread_map(sim, iterable, total=population_size,
                                          position=1, desc='Simulating Population',
                                          max_workers=max_workers)
    self.datasets.results = pd.concat(creature_result_list)

def process(files, output):
    print('Total files: {}'.format(len(files)))
    links = []
    print('Extracting links from files...')
    for filename in files:
        name_file = path.basename(filename)
        realname = name_file.replace('.txt', '').replace('_', ' ')
        alias = make_alias(realname)
        # Use a context manager so each input file is closed after reading
        with open(filename, 'r') as ifile:
            reader = csv.reader(ifile, delimiter=' ')
            for row in reader:
                entry = Link(
                    row[0],
                    row[1],
                    row[2],
                    row[3],
                    row[4],
                    row[5],
                    row[6],
                    row[7],
                    row[8],
                    realname,
                    alias,
                )
                links.append(entry)
    shuffle(links)
    return thread_map(retrieve, links, [output] * len(links), max_workers=128)

def num_exposures_needed(runs, metrics=default_metrics):
    """
    Generate the approximate number of exposures needed for a low-enough
    reconstruction error when choosing with a given strategy.
    """
    data_gen = DataGenerator(
        raw_path=DATA_DIR / "dngs",
        out_path=DATA_DIR / "correct_exposures",
        compute_scores=False,
    )
    exp = data_gen.exposures
    exposures = list(flatten((exp[exp < 0], exp[exp > 0])))
    all_image_names = data_gen.image_names
    stats_fun = partial(compute_stats, data_gen=data_gen, exposures=exposures, metrics=metrics)
    err_list = flatten(thread_map(
        stats_fun,
        sample(all_image_names, k=runs),
    ))
    df = pd.DataFrame.from_records(err_list, index="image_name")
    return df

def check_images(data_dir: str) -> Tuple[List[str], List[str]]:
    '''Iterate through a directory of images and find corrupt images

    Args:
        data_dir: Path to the directory containing the images

    Returns:
        (healthy_images, corrupt_images)
    '''
    dataset = LightlyDataset(input_dir=data_dir)
    filenames = dataset.get_filenames()

    def _is_corrupt(filename):
        try:
            image = Image.open(os.path.join(data_dir, filename))
            image.load()
        except (IOError, UnidentifiedImageError):
            return True
        else:
            return False

    mapped = concurrent.thread_map(_is_corrupt, filenames, chunksize=min(32, len(filenames)))
    healthy_images = [f for f, is_corrupt in zip(filenames, mapped) if not is_corrupt]
    corrupt_images = [f for f, is_corrupt in zip(filenames, mapped) if is_corrupt]
    return healthy_images, corrupt_images

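# Minimal usage sketch for check_images above (hypothetical, not from the original
# source): it assumes a local directory of images and simply reports how many files
# PIL failed to load.
healthy, corrupt = check_images("data/images")
print(f"{len(healthy)} healthy images, {len(corrupt)} corrupt images")
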
def enhance_recipe_1m_data():
    output_file = Path('comment_scraping/data/recipe1m_with_reviews.json')
    if output_file.exists():
        with output_file.open('r') as f:
            data_with_reviews = json.load(f)
    else:
        data_with_reviews = []

    with open('../data/recipe1m.json') as file:
        data = json.load(file)
    data = filter_recipe1m_data_after_url(data)
    processed_ids = {elem['id'] for elem in data_with_reviews}
    data = [elem for elem in data if elem['id'] not in processed_ids]
    print(len(data))
    print("loaded recipe1m data")

    chunksize = 2000
    chunked_data = [data[x:x + chunksize] for x in range(0, len(data), chunksize)]
    for chunk in tqdm(chunked_data, desc='Chunks'):
        chunk_reviews = thread_map(enhance_recipe, chunk, max_workers=1000,
                                   total=len(chunk), desc='Processing Chunk')
        chunk_reviews = [review for review in chunk_reviews if review]
        data_with_reviews.extend(chunk_reviews)
        with output_file.open('w') as f:
            json.dump(data_with_reviews, f)
    print("Finished scraping comments.")

def fetch(self):
    submodules_paths = self.repo.listall_submodules()
    diff_to_tree = self.revision_commit.tree.diff_to_tree()
    files_to_blame = [p.delta.new_file.path for p in diff_to_tree
                      if not p.delta.is_binary
                      and p.delta.new_file.path not in submodules_paths]
    results = thread_map(self.blame_file, files_to_blame)
    return [rec for val in results for rec in val]

def search_aio(keywords, keywords_file, api_key, max_workers=3, ret=0,
               keep_cache=False, case_sensitive=False):
    # new entrance and save memory
    # fetch paper details
    logging.info("searching from NCBI PMC.......")
    key_encoded = quote(keywords)
    paper_links = search_links(key_encoded, ret)
    try:
        if ret == 0:
            result = thread_map(search_aio_sub,
                                [(x, keywords_file, api_key, keep_cache, case_sensitive)
                                 for x in paper_links],
                                max_workers=max_workers)
        else:
            result = thread_map(search_aio_sub,
                                [(x, keywords_file, api_key, keep_cache, case_sensitive)
                                 for x in paper_links[:ret]],
                                max_workers=max_workers)
        return list(filter(lambda x: x is not None, result))
    except Exception as e:
        logging.exception("search exception (big one)")

def fetch_products_threaded(category, page_size, max_workers=20, dry_run=False):
    n_prods = get_number_of_products(category)
    n_pages = get_number_of_pages(n_prods, page_size)
    fetch_func = fetch_func_factory(category, page_size)
    page_range = [1] if dry_run else range(1, n_pages + 1)
    res = thread_map(fetch_func, page_range, max_workers=max_workers)
    return [p for r in res for p in r]

def get_daily_prices(
        api_key: str,
        base_url: str,
        historical: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    if historical is not None:
        price_func = partial(_get_raw_prices, api_key=api_key)
        df = pd.concat(
            thread_map(price_func, historical.symbol.unique(),
                       desc="get symbol prices."))
        return df

def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts):
    props: Dict[str, str] = parse_opts["properties"]
    data = aux["knowledge_graph"].merge(aux["metadata"])[["key", "wikidata"]].set_index("key")

    # Load wikidata using parallel processing
    map_iter = data.wikidata.iteritems()
    map_func = partial(WikidataPipeline._process_item, props)
    records = concurrent.thread_map(map_func, map_iter, total=len(data))

    # Return all records in DataFrame form
    return DataFrame.from_records(records)

def get_images(self):
    with self.session as s:
        print("Scraping Image Urls...")
        self.get_image_urls(s, self.url, page_count=True, page_title=True)
        print(f"Found Total {self.total_pages} Pages")
        print("Scraping On Page: ", 1)
        if self.total_pages > 1:
            for i in range(2, self.total_pages + 1):
                print("Scraping On Page: ", i)
                self.get_image_urls(s, f"{self.url}&page={i}")
                time.sleep(1.5)
        print(f"Found {len(self.image_urls)} Images")
        if not Path.exists(self.file_path):
            Path.mkdir(self.file_path)
        thread_map(self._download_images, range(0, len(self.image_urls)), max_workers=2)

def update(self, context, target, *synth_sources):
    """
    Retrieve and store metadata for each of the DOIs stored in the OutputDOIs resource.
    """
    with self:
        super(DOIMetadata, self).update(context, target, *synth_sources)
        self._handled = set()
        self._added = set()
        self._errors = {}
    doi_cache = OutputDOIs(context)
    with doi_cache:
        found_dois = list(set(doi_cache.data.values()))
    workers = context.config.resource_opt('doimetadata.threads', 20)
    with self, ThreadPoolExecutor(workers) as executor:
        thread_map(lambda x: self._get_metadata(self, x), found_dois,
                   desc='Crossref', unit=' dois', leave=False, position=1)

def _get_origin_metadata_locally_or_by_urls(
    data_files: List[Union[Path, Url]], max_workers=64, use_auth_token: Optional[Union[bool, str]] = None
) -> Tuple[str]:
    return thread_map(
        partial(_get_single_origin_metadata_locally_or_by_urls, use_auth_token=use_auth_token),
        data_files,
        max_workers=max_workers,
        tqdm_class=logging.tqdm,
        desc="Resolving data files",
        disable=len(data_files) <= 16 or not logging.is_progress_bar_enabled(),
    )

def __init__(self, data_path: str, shuffle=False, train=True, max_len=500):
    super().__init__()
    logger.info("Loading data from {}".format(data_path))
    gc.disable()
    files = glob.glob(f"{data_path}*")
    logger.info("File list {}".format(", ".join(files)))
    dfs = thread_map(feather.read_feather, files, max_workers=16)
    self.data_df = pd.concat(dfs)
    gc.enable()
    logger.info(
        f"Loaded dataset from {data_path} with {len(self.data_df)} samples"
    )
    self.max_len = max_len

def parse_dataframes(self, dataframes: List[DataFrame], aux: Dict[str, DataFrame], **parse_opts):
    # Get all the weather stations with data up until 2020
    stations_url = "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-inventory.txt"
    stations = read_csv(
        stations_url,
        sep=r"\s+",
        names=("id", "lat", "lon", "measurement", "year_start", "year_end"),
    )
    stations = stations[stations.year_end == 2020][["id", "lat", "lon", "measurement"]]

    # Filter stations that at least provide max and min temps
    measurements = ["TMIN", "TMAX"]
    stations = stations.groupby(["id", "lat", "lon"]).agg(lambda x: "|".join(x))
    stations = stations[stations.measurement.apply(lambda x: all(m in x for m in measurements))]
    stations = stations.reset_index()

    # Get all the POI from metadata and go through each key
    metadata = dataframes[0][["key", "latitude", "longitude"]].dropna()

    # Convert all coordinates to radians
    stations["lat"] = stations.lat.apply(math.radians)
    stations["lon"] = stations.lon.apply(math.radians)
    metadata["lat"] = metadata.latitude.apply(math.radians)
    metadata["lon"] = metadata.longitude.apply(math.radians)

    # Use a cache to avoid having to query the same station multiple times
    station_cache: Dict[str, DataFrame] = {}

    # Make sure the stations and the cache are sent to each function call
    map_func = partial(WeatherPipeline.station_records, station_cache, stations)

    # We don't care about the index while iterating over each metadata item
    map_iter = [record for _, record in metadata.iterrows()]

    # Shuffle the iterables to try to make better use of the caching
    shuffle(map_iter)

    # Bottleneck is network so we can use lots of threads in parallel
    records = concurrent.thread_map(map_func, map_iter, total=len(metadata))

    return concat(records)

def _get_origin_metadata_locally_or_by_urls(
    data_files: List[Union[Path, Url]], max_workers=64, use_auth_token: Optional[Union[bool, str]] = None
) -> Tuple[str]:
    return thread_map(
        partial(_get_single_origin_metadata_locally_or_by_urls, use_auth_token=use_auth_token),
        data_files,
        max_workers=max_workers,
        tqdm_class=tqdm,
        desc="Resolving data files",
        disable=len(data_files) <= 16 or logging.get_verbosity() == logging.NOTSET,
    )

def main(args):
    # Phase 0 - copy all urmp wavs to corresponding folders
    CWD = Path(hydra.utils.get_original_cwd())
    os.chdir(CWD)
    if args.urmp is not None:
        urmp_path = CWD / args.urmp.source_folder
        urmp_audio_files = list(
            urmp_path.glob(f'./*/{args.urmp.mono_regex}*.wav'))
        target_dir = CWD / 'data_tmp'
        target_dir.mkdir(exist_ok=True)
        create_mono_urmp_partial = partial(
            create_mono_urmp,
            audio_files=urmp_audio_files,
            target_dir=target_dir,
            instruments_dict=args.urmp.instruments)
        thread_map(create_mono_urmp_partial, list(args.urmp.instruments.keys()))

    # create HDF5 datasets for each sample rate to be used later during training
    data_processor = hydra.utils.instantiate(args.data_processor)
    loudness_metrics = LoudnessMetrics(args.srs)
    data_processor.run_on_dirs(CWD / args.input_dir, CWD / args.output_dir, loudness_metrics)

def execute_in_threads(func: Callable, sequence: Generator, max_sequence_length: int, **kwargs) -> List:
    """Execute 'func' in a ThreadPoolExecutor for each set of arguments received from the 'sequence' generator.

    :param func: Callable to be executed in ThreadPoolExecutor.
    :param sequence: A generator yielding sets of arguments that are passed to 'func' on each call.
    :param max_sequence_length: Limits the maximum number of 'func' calls in order to avoid a possibly
        infinite generator blocking script execution. It is also passed as the 'total' parameter to the
        'tqdm' progress bar.
    """
    results = thread_map(func, islice(sequence, max_sequence_length),
                         total=max_sequence_length, **kwargs)
    return list(results)

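# Minimal usage sketch for execute_in_threads above (hypothetical, not from the
# original project): an endless generator is capped by max_sequence_length, and
# extra tqdm/thread_map keyword arguments are forwarded via **kwargs.
import itertools

def square(x: int) -> int:
    return x * x

numbers = itertools.count(1)  # a potentially infinite generator
results = execute_in_threads(square, numbers, max_sequence_length=100,
                             max_workers=8, desc="Squaring")
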
def main():
    wiki = MediaWiki(url=URL)
    gen_entry_lam = lambda x: gen_entry(wiki, x)

    print("Retrieving all titles")
    titles = getAllTitles(wiki)
    titles.sort()

    print("Retrieving summaries for titles")
    entries = thread_map(gen_entry_lam, titles, max_workers=MAX_THREADS)
    entries = list(filter(None, entries))

    print("Creating file lines")
    file_lines = dic_creator.create_file_text_lines(entries)

    print("Writing to file")
    with open("content.html", "w") as file:
        file.writelines(file_lines)

def request_etags(
    urls: List[str],
    use_auth_token: Optional[Union[str, bool]] = None,
    max_workers=64,
    tqdm_kwargs: Optional[dict] = None,
) -> List[Optional[str]]:
    tqdm_kwargs = tqdm_kwargs if tqdm_kwargs is not None else {}
    tqdm_kwargs["desc"] = tqdm_kwargs.get("desc", "Get ETags")
    tqdm_kwargs["disable"] = tqdm_kwargs.get(
        "disable", len(urls) <= 16 or logging.get_verbosity() == logging.NOTSET)
    return thread_map(
        partial(request_etag, use_auth_token=use_auth_token),
        urls,
        max_workers=max_workers,
        tqdm_class=tqdm,
        **tqdm_kwargs,
    )

def download(
    base_url: str = hdrplus_bucket_base,
    out: Union[str, Path] = default_download_dir,
    image_names: Optional[List[str]] = None,
    max_threads=10,
) -> List[Optional[Path]]:
    if image_names is None:
        image_names = list(get_image_names(base_url))
    out = coerce_path(out)
    if not out.exists():
        out.mkdir(parents=True)
    downloaded_files = thread_map(
        partial(download_file, out_path=out),
        image_names,
        max_workers=max_threads,
        chunksize=min(50, len(image_names)),
    )
    return downloaded_files

def convert(args):
    def ffmpeg_convert(fname):
        outfile = fname.replace('.m4a', '.wav').replace('voxceleb2', 'voxceleb2_wav')
        if outfile in out_files:
            return 0
        outdir = os.path.dirname(outfile)
        os.makedirs(outdir, exist_ok=True)
        out = subprocess.call(
            'ffmpeg -v quiet -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s'
            % (fname, outfile),
            shell=True)  # shell=True is needed when the command is a single string
        # if out != 0:
        #     raise ValueError('Conversion failed %s' % fname)
        return out

    files = glob.glob('%s/voxceleb2/*/*/*.m4a' % args.save_path)
    files.sort()
    os.makedirs(os.path.join(args.save_path, 'voxceleb2_wav'), exist_ok=True)
    out_files = glob.glob('%s/voxceleb2_wav/*/*/*.wav' % args.save_path)
    print('Converting files from AAC to WAV')
    out = thread_map(ffmpeg_convert, files)

def process_downloads(self, sites):
    """
    Method to download and process files

    :param sites: List of files to download and process
    :type sites: list
    :return:
    :rtype:
    """
    start_time = time.time()

    thread_map(self.download_site, sites, desc="Downloading files")

    if self.do_process:
        thread_map(
            self.file_to_queue,
            self.file_queue.getall(),
            desc="Processing downloaded files",
        )

        chunks = []
        for batch in iter(lambda: list(islice(self.queue, 10000)), []):
            chunks.append(batch)

        thread_map(self._db_bulk_writer, chunks, desc="Transferring queue to database")

    # checking if last-modified was in the response headers and not set to default
    if "01-01-1970" != self.last_modified.strftime("%d-%m-%Y"):
        self.setColUpdate(self.feed_type.lower(), self.last_modified)

    self.logger.info("Duration: {}".format(timedelta(seconds=time.time() - start_time)))

        sleep(interval)
    # NB: may not clear instances with higher `position` upon completion
    # since this worker may not know about other bars #796
    if write_safe:
        # we think we know about other bars (currently only py3 threading)
        if n == 6:
            tqdm.write("n == 6 completed")
    return n + 1


if __name__ == '__main__':
    freeze_support()  # for Windows support
    L = list(range(NUM_SUBITERS))[::-1]

    print("Simple thread mapping")
    thread_map(partial(progresser, write_safe=not PY2), L, max_workers=4)

    print("Simple process mapping")
    process_map(partial(progresser), L, max_workers=4)

    print("Manual nesting")
    for i in trange(16, desc="1"):
        for _ in trange(16, desc="2 @ %d" % i, leave=i % 2):
            sleep(0.01)

    print("Multi-processing")
    tqdm.set_lock(RLock())
    p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),))
    p.map(partial(progresser, progress=True), L)

    print("Multi-threading")