Example No. 1
def scrape(input_path,
           output_path="~/Downloads",
           max_workers=20,
           sample_size=None,
           silly=None):
    # Read products from input csv
    prod_dicts = read_product_link_csv(input_path)
    products = list(map(Product.from_dict, prod_dicts))

    # Trim list of products if sample_size was provided
    if sample_size is not None and sample_size < len(products):
        products = products[:sample_size]

    # Uses tqdm's thread_map (a concurrent.futures.ThreadPoolExecutor.map wrapper) to fetch results
    print(f"Scraping metadata for {len(products)} products.")
    thread_map(lambda p: p.update_metadata(),
               products,
               max_workers=max_workers)

    if silly:
        products = sillify(products)

    # Write to file
    write_product_link_csv(output_path, [p.to_dict() for p in products],
                           listing_type="scrape")
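The comment in the snippet above refers to `tqdm.contrib.concurrent.thread_map`, which wraps `concurrent.futures.ThreadPoolExecutor.map()` in a progress bar. Below is a minimal sketch of that behaviour, assuming only `tqdm` is installed; the names are illustrative, not the library's internals.

from concurrent.futures import ThreadPoolExecutor

from tqdm.auto import tqdm


def thread_map_sketch(fn, iterable, max_workers=20):
    # Roughly what thread_map does: run fn over the iterable in a thread
    # pool and wrap the resulting iterator in a tqdm progress bar.
    items = list(iterable)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(tqdm(executor.map(fn, items), total=len(items)))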
Example No. 2
    def process_downloads(self, sites, collection):
        """
        Method to download and process files

        :param sites: List of file to download and process
        :type sites: list
        :param collection: Mongodb Collection name
        :type collection: str
        :return:
        :rtype:
        """

        worker_size = min(32, os.cpu_count() + 4)

        start_time = time.time()

        thread_map(self.download_site, sites, desc="Downloading files")

        if self.do_process:
            thread_map(
                self.file_to_queue,
                self.file_queue.get_full_list(),
                desc="Processing downloaded files",
            )

            self._process_queue_to_db(worker_size, collection=collection)

            # checking if last-modified was in the response headers and not set to default
            if "01-01-1970" != self.last_modified.strftime("%d-%m-%Y"):
                setColUpdate(self.feed_type.lower(), self.last_modified)

        self.logger.info("Duration: {}".format(
            timedelta(seconds=time.time() - start_time)))
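The `worker_size` formula above mirrors the default `max_workers` used by `ThreadPoolExecutor` since Python 3.8, i.e. `min(32, os.cpu_count() + 4)`. A hedged sketch of passing such a value to `thread_map` explicitly; the `download_site` worker and `sites` list below are stand-ins.

import os

from tqdm.contrib.concurrent import thread_map


def download_site(url):
    # Stand-in worker; a real implementation would fetch and store `url`.
    return url


sites = ["https://example.com/a.json", "https://example.com/b.json"]
worker_size = min(32, (os.cpu_count() or 1) + 4)
thread_map(download_site, sites, max_workers=worker_size, desc="Downloading files")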
Example No. 3
def preprocess_idrid_grade(root_dir: str, output_dir: str, n_workers: int, train: bool):
    root_path = Path(root_dir)

    output_path = Path(output_dir) / "idrid_grade"

    img_output_path = output_path / "img"
    img_output_path.mkdir(parents=True, exist_ok=True)

    img_transformed_output_path = output_path / "transformed"
    img_transformed_output_path.mkdir(parents=True, exist_ok=True)

    train_test_path = "a. Training Set" if train else "b. Testing Set"

    retina_path = root_path / "1. Original Images" / train_test_path
    print(retina_path)

    image_names = [f.name for f in retina_path.glob("**/*")]

    # Worker function that wraps the image processing function.
    def worker(image_name: str):
        process_image(
            image_name=image_name,
            retina_path=retina_path,
            img_output_path=img_output_path,
            img_transformed_output_path=img_transformed_output_path,
        )

    print(f"Preprocessing IDRiD Grade ({train_test_path}) with {n_workers} workers...")
    thread_map(worker, image_names, max_workers=n_workers)
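The nested `worker` function above only pins the path arguments; `functools.partial` achieves the same thing. A self-contained sketch of that pattern, with `do_work` and the paths standing in for `process_image` and the real directories:

from functools import partial
from pathlib import Path

from tqdm.contrib.concurrent import thread_map


def do_work(image_name: str, retina_path: Path, img_output_path: Path) -> str:
    # Stand-in for process_image(); just reports what it would touch.
    return f"{retina_path / image_name} -> {img_output_path / image_name}"


worker = partial(do_work, retina_path=Path("in"), img_output_path=Path("out"))
results = thread_map(worker, ["a.jpg", "b.jpg"], max_workers=2)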
Example No. 4
def test_thread_map():
    """Test contrib.concurrent.thread_map"""
    with closing(StringIO()) as our_file:
        a = range(9)
        b = [i + 1 for i in a]
        try:
            assert thread_map(lambda x: x + 1, a, file=our_file) == b
        except ImportError:
            raise SkipTest
        assert thread_map(incr, a, file=our_file) == b
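For reference, `incr` in the final assertion is an increment helper (its exact definition in the test module is assumed here). A standalone sketch of the same check, requiring only `tqdm`:

from io import StringIO

from tqdm.contrib.concurrent import thread_map


def incr(x):
    # Assumed increment helper, mirroring the lambda in the first assertion.
    return x + 1


with StringIO() as our_file:
    assert thread_map(incr, range(9), file=our_file) == list(range(1, 10))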
Example No. 5
    def _search_columns(col, *filters):
        outputs = synth_db.query(NHMOutput).filter(
            NHMOutput.Output_ID.notin_(self._added), *filters)
        thread_workers = context.config.resource_opt(
            'dois.threads', 20)
        with self, ThreadPoolExecutor(
                thread_workers) as thread_executor:
            thread_map(lambda x: _extract_doi(self, x, col),
                       outputs.all(),
                       desc=col,
                       unit=' records',
                       leave=False,
                       position=1)
Example No. 6
def process_df(df, f, group_name):
    # We save as [C, H, W]
    train_shape = (len(df), 3, 512, 512)
    train_labels = df["level"].to_numpy()

    grp = f.create_group(group_name)
    grp.create_dataset(
        "images",
        shape=train_shape,
        dtype=np.uint8,
        # compression="lzf",
    )
    grp.create_dataset("labels", shape=train_labels.shape, dtype=np.uint8)
    grp["labels"][...] = train_labels

    grp_images = grp["images"]

    def process_image(i):
        path = df["path"].iloc[i]
        img = open_colour_image(path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        contour = find_eye(img)
        x, y, w, h = cv2.boundingRect(contour)
        img = img[y : y + h, x : x + w]
        img = pad_to_square(img, w, h, [BLACK, BLACK, BLACK])

        img = cv2.resize(img, (512, 512))

        scale = 512
        img = cv2.addWeighted(
            img, 4, cv2.GaussianBlur(img, (0, 0), scale / 30), -4, 128
        )

        # Remove outer 10% boundary effects
        b = np.zeros(img.shape)
        b = cv2.circle(
            b,
            (img.shape[1] // 2, img.shape[0] // 2),
            int(scale * 0.9) // 2,
            (1, 1, 1),
            -1,
            8,
            0,
        )
        img = img * b + 128 * (1 - b)

        img = np.transpose(img, (2, 0, 1))
        grp_images[i, ...] = img[None]

    thread_map(process_image, range(len(df)), max_workers=8)
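A hedged sketch of reading one record back from the HDF5 layout written above; the file name and the "train" group name are assumptions, and `h5py` plus `numpy` are required.

import h5py
import numpy as np

with h5py.File("retinopathy.h5", "r") as f:   # file name is illustrative
    images = f["train/images"]                # stored as [N, C, H, W] uint8
    labels = f["train/labels"]
    img = np.transpose(images[0], (1, 2, 0))  # back to [H, W, C] for viewing
    label = int(labels[0])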
Example No. 7
    def run_multithread(self,
                        max_workers: int | None = None,
                        quiet: bool = False) -> None:
        """
        Run the process for all genes in the Population.

        Parameters
        ----------
        :max_workers:   Number of concurrent workers. If `None`, the executor uses as many
                        workers as it can (default: `None`).
        :quiet:         If `False`, a progress bar shows the simulation progress (default: `False`).

        Updates
        -------
        self.datasets.results DataFrame
        """
        def sim(x):
            index, row = x
            return pd.DataFrame(self.process(**row),
                                index=pd.Series([index], name='id'))

        iterable = self.datasets.population.iterrows()
        if quiet:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                creature_result_list = executor.map(sim, iterable)
        else:
            population_size = self.population.size
            creature_result_list = thread_map(sim,
                                              iterable,
                                              total=population_size,
                                              position=1,
                                              desc='Simulating Population',
                                              max_workers=max_workers)
        self.datasets.results = pd.concat(creature_result_list)
Example No. 8
def process(files, output):
    print('Total files: {}'.format(len(files)))
    links = []
    print('Extracting links from files...')
    for filename in files:
        name_file = path.basename(filename)
        realname = name_file.replace('.txt', '').replace('_', ' ')
        alias = make_alias(realname)
        with open(filename, 'r') as ifile:
            reader = csv.reader(ifile, delimiter=' ')
            for row in reader:
                entry = Link(
                    row[0],
                    row[1],
                    row[2],
                    row[3],
                    row[4],
                    row[5],
                    row[6],
                    row[7],
                    row[8],
                    realname,
                    alias,
                )
                links.append(entry)
    shuffle(links)
    return thread_map(retrieve, links, [output] * len(links), max_workers=128)
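`thread_map` accepts multiple iterables and zips them like the built-in `map`, which is why `[output] * len(links)` works above; binding the constant with `functools.partial` is an equivalent alternative. A small sketch under that assumption, with a stand-in `retrieve` worker:

from functools import partial

from tqdm.contrib.concurrent import thread_map


def retrieve(link, output):
    # Stand-in worker; a real version would download `link` into `output`.
    return (link, output)


links = ["a", "b", "c"]
output = "/tmp/out"

# Both calls produce the same results.
r1 = thread_map(retrieve, links, [output] * len(links), max_workers=4)
r2 = thread_map(partial(retrieve, output=output), links, max_workers=4)
assert r1 == r2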
Example No. 9
def num_exposures_needed(runs, metrics=default_metrics):
    """
    Generate the approximate number of exposures needed for a low-enough
    reconstruction error when choosing with a given strategy.
    """
    data_gen = DataGenerator(
        raw_path=DATA_DIR / "dngs",
        out_path=DATA_DIR / "correct_exposures",
        compute_scores=False,
    )
    exp = data_gen.exposures
    exposures = list(flatten((exp[exp < 0], exp[exp > 0])))
    all_image_names = data_gen.image_names

    stats_fun = partial(compute_stats,
                        data_gen=data_gen,
                        exposures=exposures,
                        metrics=metrics)
    err_list = flatten(thread_map(
        stats_fun,
        sample(all_image_names, k=runs),
    ))

    df = pd.DataFrame.from_records(err_list, index="image_name")
    return df
Example No. 10
def check_images(data_dir: str) -> Tuple[List[str], List[str]]:
    '''Iterate through a directory of images and find corrupt images

    Args:
        data_dir: Path to the directory containing the images

    Returns:
        (healthy_images, corrupt_images)
    '''
    dataset = LightlyDataset(input_dir=data_dir)
    filenames = dataset.get_filenames()

    def _is_corrupt(filename):
        try:
            image = Image.open(os.path.join(data_dir, filename))
            image.load()
        except (IOError, UnidentifiedImageError):
            return True
        else:
            return False

    mapped = concurrent.thread_map(_is_corrupt,
                                   filenames,
                                   chunksize=min(32, len(filenames)))
    healthy_images = [
        f for f, is_corrupt in zip(filenames, mapped) if not is_corrupt
    ]
    corrupt_images = [
        f for f, is_corrupt in zip(filenames, mapped) if is_corrupt
    ]
    return healthy_images, corrupt_images
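The same corrupt-image scan can be sketched without `LightlyDataset`, walking the directory with `os.walk` instead; PIL's `load()` does the real work of forcing a full decode. The directory name and helper below are illustrative.

import os

from PIL import Image, UnidentifiedImageError
from tqdm.contrib.concurrent import thread_map


def is_corrupt(path: str) -> bool:
    # load() forces a full decode; a bare open() would miss truncated files.
    try:
        with Image.open(path) as image:
            image.load()
    except (IOError, UnidentifiedImageError):
        return True
    return False


paths = [os.path.join(root, name)
         for root, _, files in os.walk("images")
         for name in files]
flags = thread_map(is_corrupt, paths, chunksize=32)
corrupt_images = [p for p, bad in zip(paths, flags) if bad]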
Example No. 11
def enhance_recipe_1m_data():
    output_file = Path('comment_scraping/data/recipe1m_with_reviews.json')
    if output_file.exists():
        with output_file.open('r') as f:
            data_with_reviews = json.load(f)
    else:
        data_with_reviews = []

    with open('../data/recipe1m.json') as file:
        data = json.load(file)
        data = filter_recipe1m_data_after_url(data)

    processed_ids = {elem['id'] for elem in data_with_reviews}
    data = [elem for elem in data if elem['id'] not in processed_ids]
    print(len(data))
    print("loaded recipe1m data")

    chunksize = 2000
    chunked_data = [data[x:x + chunksize] for x in range(0, len(data), chunksize)]
    for chunk in tqdm(chunked_data, desc='Chunks'):
        chunk_reviews = thread_map(enhance_recipe, chunk, max_workers=1000, total=len(chunk), desc='Processing Chunk')
        chunk_reviews = [review for review in chunk_reviews if review]
        data_with_reviews.extend(chunk_reviews)
        with output_file.open('w') as f:
            json.dump(data_with_reviews, f)

    print("Finished scraping comments.")
Example No. 12
    def fetch(self):
        submodules_paths = self.repo.listall_submodules()
        diff_to_tree = self.revision_commit.tree.diff_to_tree()
        files_to_blame = [p.delta.new_file.path for p in diff_to_tree
                          if not p.delta.is_binary and p.delta.new_file.path not in submodules_paths]

        results = thread_map(self.blame_file, files_to_blame)
        return [rec for val in results for rec in val]
Example No. 13
def search_aio(keywords, keywords_file, api_key, max_workers=3, ret=0, keep_cache=False, case_sensitive=False):
    # new entry point; keeps memory usage low
    # fetch paper details
    logging.info("searching from NCBI PMC.......")
    key_encoded = quote(keywords)
    paper_links = search_links(key_encoded, ret)
    try:
        if ret == 0:
            result = thread_map(search_aio_sub, [(x, keywords_file, api_key, keep_cache, case_sensitive)
                                                 for x in paper_links], max_workers=max_workers)
        else:
            result = thread_map(search_aio_sub, [(x, keywords_file, api_key, keep_cache, case_sensitive)
                                                 for x in paper_links[:ret]], max_workers=max_workers)

        return list(filter(lambda x: x is not None, result))

    except Exception as e:
        logging.exception("search exception (big one)")
Example No. 14
def fetch_products_threaded(category, page_size, max_workers=20, dry_run=False):
    n_prods = get_number_of_products(category)
    n_pages = get_number_of_pages(n_prods, page_size)
    fetch_func = fetch_func_factory(category, page_size)

    page_range = [1] if dry_run else range(1, n_pages + 1)

    res = thread_map(fetch_func, page_range, max_workers=max_workers)

    return [p for r in res for p in r]
Example No. 15
def get_daily_prices(
        api_key: str,
        base_url: str,
        historical: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    if historical is not None:
        price_func = partial(_get_raw_prices, api_key=api_key)
        df = pd.concat(
            thread_map(price_func,
                       historical.symbol.unique(),
                       desc="get symbol prices."))
    else:
        # Avoid returning an unbound `df` when no historical frame is given.
        df = pd.DataFrame()
    return df
Example No. 16
    def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts):
        props: Dict[str, str] = parse_opts["properties"]
        data = aux["knowledge_graph"].merge(aux["metadata"])[["key", "wikidata"]].set_index("key")

        # Load wikidata using parallel processing
        map_iter = data.wikidata.iteritems()
        map_func = partial(WikidataPipeline._process_item, props)
        records = concurrent.thread_map(map_func, map_iter, total=len(data))

        # Return all records in DataFrame form
        return DataFrame.from_records(records)
Example No. 17
    def get_images(self):
        with self.session as s:
            print("Scraping Image Urls...")
            self.get_image_urls(s, self.url, page_count=True, page_title=True)

            print(f"Found Total {self.total_pages} Pages")

            print("Scraping On Page: ", 1)
            if self.total_pages > 1:
                for i in range(2, self.total_pages + 1):
                    print("Scraping On Page: ", i)
                    self.get_image_urls(s, f"{self.url}&page={i}")
                    time.sleep(1.5)

            print(f"Found {len(self.image_urls)} Images")

            self.file_path.mkdir(exist_ok=True)

            thread_map(self._download_images, range(0, len(self.image_urls)), max_workers=2)
Example No. 18
    def update(self, context, target, *synth_sources):
        """
        Retrieve and store metadata for each of the DOIs stored in the OutputDOIs resource.
        """
        with self:
            super(DOIMetadata, self).update(context, target, *synth_sources)
        self._handled = set()
        self._added = set()
        self._errors = {}

        doi_cache = OutputDOIs(context)
        with doi_cache:
            found_dois = list(set(doi_cache.data.values()))

        workers = context.config.resource_opt('doimetadata.threads', 20)
        with self, ThreadPoolExecutor(workers) as executor:
            thread_map(lambda x: self._get_metadata(self, x),
                       found_dois,
                       desc='Crossref',
                       unit=' dois',
                       leave=False,
                       position=1)
Example No. 19
def _get_origin_metadata_locally_or_by_urls(
        data_files: List[Union[Path, Url]],
        max_workers=64,
        use_auth_token: Optional[Union[bool, str]] = None) -> Tuple[str]:
    return thread_map(
        partial(_get_single_origin_metadata_locally_or_by_urls,
                use_auth_token=use_auth_token),
        data_files,
        max_workers=max_workers,
        tqdm_class=logging.tqdm,
        desc="Resolving data files",
        disable=len(data_files) <= 16 or not logging.is_progress_bar_enabled(),
    )
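Keyword arguments that `thread_map` does not recognise (such as `tqdm_class`, `desc` and `disable` above) are forwarded to the progress bar rather than to the executor. A minimal hedged sketch of those options:

from tqdm.auto import tqdm
from tqdm.contrib.concurrent import thread_map

results = thread_map(
    str.upper,
    ["a", "b", "c"],
    max_workers=4,      # consumed by the ThreadPoolExecutor
    tqdm_class=tqdm,    # which progress bar implementation to use
    desc="Resolving data files",
    disable=True,       # silence the bar entirely
)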
Example No. 20
    def __init__(self, data_path: str, shuffle=False, train=True, max_len=500):
        super().__init__()
        logger.info("Loading data from {}".format(data_path))
        gc.disable()
        files = glob.glob(f"{data_path}*")
        logger.info("File list {}".format(", ".join(files)))
        dfs = thread_map(feather.read_feather, files, max_workers=16)
        self.data_df = pd.concat(dfs)
        gc.enable()
        logger.info(
            f"Loaded dataset from {data_path} with {len(self.data_df)} samples"
        )
        self.max_len = max_len
Example No. 21
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts):

        # Get all the weather stations with data up until 2020
        stations_url = "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-inventory.txt"
        stations = read_csv(
            stations_url,
            sep=r"\s+",
            names=("id", "lat", "lon", "measurement", "year_start",
                   "year_end"),
        )
        stations = stations[stations.year_end == 2020][[
            "id", "lat", "lon", "measurement"
        ]]

        # Filter stations that at least provide max and min temps
        measurements = ["TMIN", "TMAX"]
        stations = stations.groupby(["id", "lat",
                                     "lon"]).agg(lambda x: "|".join(x))
        stations = stations[stations.measurement.apply(
            lambda x: all(m in x for m in measurements))]
        stations = stations.reset_index()

        # Get all the POI from metadata and go through each key
        metadata = dataframes[0][["key", "latitude", "longitude"]].dropna()

        # Convert all coordinates to radians
        stations["lat"] = stations.lat.apply(math.radians)
        stations["lon"] = stations.lon.apply(math.radians)
        metadata["lat"] = metadata.latitude.apply(math.radians)
        metadata["lon"] = metadata.longitude.apply(math.radians)

        # Use a cache to avoid having to query the same station multiple times
        station_cache: Dict[str, DataFrame] = {}

        # Make sure the stations and the cache are sent to each function call
        map_func = partial(WeatherPipeline.station_records, station_cache,
                           stations)

        # We don't care about the index while iterating over each metadata item
        map_iter = [record for _, record in metadata.iterrows()]

        # Shuffle the iterables to try to make better use of the caching
        shuffle(map_iter)

        # Bottleneck is network so we can use lots of threads in parallel
        records = concurrent.thread_map(map_func,
                                        map_iter,
                                        total=len(metadata))

        return concat(records)
Example No. 22
def _get_origin_metadata_locally_or_by_urls(
        data_files: List[Union[Path, Url]],
        max_workers=64,
        use_auth_token: Optional[Union[bool, str]] = None) -> Tuple[str]:
    return thread_map(
        partial(_get_single_origin_metadata_locally_or_by_urls,
                use_auth_token=use_auth_token),
        data_files,
        max_workers=max_workers,
        tqdm_class=tqdm,
        desc="Resolving data files",
        disable=len(data_files) <= 16
        or logging.get_verbosity() == logging.NOTSET,
    )
Example No. 23
def main(args):
    # Phase 0 - copy all urmp wavs to corresponding folders
    CWD = Path(hydra.utils.get_original_cwd())
    os.chdir(CWD)

    if args.urmp is not None:
        urmp_path = CWD / args.urmp.source_folder
        urmp_audio_files = list(
            urmp_path.glob(f'./*/{args.urmp.mono_regex}*.wav'))
        target_dir = CWD / 'data_tmp'
        target_dir.mkdir(exist_ok=True)
        create_mono_urmp_partial = partial(
            create_mono_urmp,
            audio_files=urmp_audio_files,
            target_dir=target_dir,
            instruments_dict=args.urmp.instruments)
        thread_map(create_mono_urmp_partial,
                   list(args.urmp.instruments.keys()))

    # create HDF5 datasets for each sample rate, to be used later during training
    data_processor = hydra.utils.instantiate(args.data_processor)
    loudness_metrics = LoudnessMetrics(args.srs)
    data_processor.run_on_dirs(CWD / args.input_dir, CWD / args.output_dir,
                               loudness_metrics)
Example No. 24
def execute_in_threads(func: Callable, sequence: Generator,
                       max_sequence_length: int, **kwargs) -> List:
    """Executing 'func' in ThreadPoolExecutor for each
    set of arguments received from 'sequence' generator

    :param func: Callable to be executed in ThreadPoolExecutor.
    :param sequence: A generator yielding sets of arguments
    that are passed to 'func' on each call.
    :param max_sequence_length: Limits maximum number of 'func' calls in
    order to avoid possible infinite generator to block script execution.
    It is also passed as 'total' parameter to 'tqdm' progress bar.
    """
    results = thread_map(func,
                         islice(sequence, max_sequence_length),
                         total=max_sequence_length,
                         **kwargs)
    return list(results)
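A hedged usage sketch for `execute_in_threads` as defined above, using a deliberately infinite generator that `max_sequence_length` caps; the `fetch` worker is a stand-in, and the extra keyword arguments flow through to `thread_map`.

import itertools


def fetch(page: int) -> int:
    # Stand-in worker; a real one would request page `page` of some API.
    return page * page


pages = itertools.count(1)                    # infinite generator
results = execute_in_threads(fetch, pages,
                             max_sequence_length=100,
                             max_workers=8,
                             desc="Fetching pages")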
Example No. 25
def main():

    wiki = MediaWiki(url=URL)
    gen_entry_lam = lambda x: gen_entry(wiki, x)

    print("Retriving all titles")
    titles = getAllTitles(wiki)
    titles.sort()

    print("Retriving summaries for titles")
    entries = thread_map(gen_entry_lam, titles, max_workers=MAX_THREADS)
    entries = list(filter(None, entries))

    print("Creating file lines")
    file_lines = dic_creator.create_file_text_lines(entries)
    print("Writing to file")
    with open("content.html", "w") as file:
        file.writelines(file_lines)
Example No. 26
def request_etags(
    urls: List[str],
    use_auth_token: Optional[Union[str, bool]] = None,
    max_workers=64,
    tqdm_kwargs: Optional[dict] = None,
) -> List[Optional[str]]:
    tqdm_kwargs = tqdm_kwargs if tqdm_kwargs is not None else {}
    tqdm_kwargs["desc"] = tqdm_kwargs.get("desc", "Get ETags")
    tqdm_kwargs["disable"] = tqdm_kwargs.get(
        "disable",
        len(urls) <= 16 or logging.get_verbosity() == logging.NOTSET)
    return thread_map(
        partial(request_etag, use_auth_token=use_auth_token),
        urls,
        max_workers=max_workers,
        tqdm_class=tqdm,
        **tqdm_kwargs,
    )
Example No. 27
def download(
    base_url: str = hdrplus_bucket_base,
    out: Union[str, Path] = default_download_dir,
    image_names: Optional[List[str]] = None,
    max_threads=10,
) -> List[Optional[Path]]:

    if image_names is None:
        image_names = list(get_image_names(base_url))

    out = coerce_path(out)
    if not out.exists():
        out.mkdir(parents=True)

    downloaded_files = thread_map(
        partial(download_file, out_path=out),
        image_names,
        max_workers=max_threads,
        chunksize=min(50, len(image_names)),
    )

    return downloaded_files
Example No. 28
def convert(args):
    def ffmpeg_convert(fname):
        outfile = fname.replace('.m4a',
                                '.wav').replace('voxceleb2', 'voxceleb2_wav')
        if outfile in out_files:
            return 0
        outdir = os.path.dirname(outfile)
        os.makedirs(outdir, exist_ok=True)
        # Pass the command as an argument list; the original single-string
        # form only works with shell=True.
        out = subprocess.call([
            'ffmpeg', '-v', 'quiet', '-y', '-i', fname,
            '-ac', '1', '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', outfile
        ])
        # if out != 0:
        # 	raise ValueError('Conversion failed %s'%fname)
        return out

    files = glob.glob('%s/voxceleb2/*/*/*.m4a' % args.save_path)
    files.sort()
    os.makedirs(os.path.join(args.save_path, 'voxceleb2_wav'), exist_ok=True)
    out_files = glob.glob('%s/voxceleb2_wav/*/*/*.wav' % args.save_path)

    print('Converting files from AAC to WAV')
    out = thread_map(ffmpeg_convert, files)
Example No. 29
    def process_downloads(self, sites):
        """
        Method to download and process files

        :param sites: List of file to download and process
        :type sites: list
        :return:
        :rtype:
        """

        start_time = time.time()

        thread_map(self.download_site, sites, desc="Downloading files")

        if self.do_process:
            thread_map(
                self.file_to_queue,
                self.file_queue.getall(),
                desc="Processing downloaded files",
            )

            chunks = []

            for batch in iter(lambda: list(islice(self.queue, 10000)), []):
                chunks.append(batch)

            thread_map(self._db_bulk_writer,
                       chunks,
                       desc="Transferring queue to database")

            # checking if last-modified was in the response headers and not set to default
            if "01-01-1970" != self.last_modified.strftime("%d-%m-%Y"):
                self.setColUpdate(self.feed_type.lower(), self.last_modified)

        self.logger.info("Duration: {}".format(
            timedelta(seconds=time.time() - start_time)))
Example No. 30
        sleep(interval)
    # NB: may not clear instances with higher `position` upon completion
    # since this worker may not know about other bars #796
    if write_safe:
        # we think we know about other bars (currently only py3 threading)
        if n == 6:
            tqdm.write("n == 6 completed")
    return n + 1


if __name__ == '__main__':
    freeze_support()  # for Windows support
    L = list(range(NUM_SUBITERS))[::-1]

    print("Simple thread mapping")
    thread_map(partial(progresser, write_safe=not PY2), L, max_workers=4)

    print("Simple process mapping")
    process_map(partial(progresser), L, max_workers=4)

    print("Manual nesting")
    for i in trange(16, desc="1"):
        for _ in trange(16, desc="2 @ %d" % i, leave=i % 2):
            sleep(0.01)

    print("Multi-processing")
    tqdm.set_lock(RLock())
    p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(), ))
    p.map(partial(progresser, progress=True), L)

    print("Multi-threading")