def run_worker(data, versions, channels):
    t_pool = ThreadPool()
    t_pool.imap_unordered(partial(generate_statistics,
                                  versions=versions,
                                  channels=channels), data)
    t_pool.close()
    t_pool.join()
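For comparison, a minimal self-contained sketch of the same pattern (the worker body and the sample arguments below are illustrative assumptions, not taken from the snippet above). Iterating the imap_unordered result is what surfaces worker exceptions in the caller; the snippet above discards the iterator, so failures in generate_statistics pass silently.

from functools import partial
from multiprocessing.pool import ThreadPool

def generate_statistics(item, versions, channels):
    # Hypothetical stand-in for the real statistics worker.
    return item, len(versions), len(channels)

def run_worker(data, versions, channels):
    t_pool = ThreadPool()
    try:
        results = t_pool.imap_unordered(partial(generate_statistics,
                                                versions=versions,
                                                channels=channels), data)
        for _ in results:
            pass  # consuming the iterator re-raises worker exceptions here
    finally:
        t_pool.close()
        t_pool.join()

run_worker(range(10), versions=['1.0', '2.0'], channels=['beta', 'stable'])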
Example #2
def find_process_files(root_dir):
    lock = Lock()

    try:
        num_proc = int(os.environ.get('SCIPY_NUM_CYTHONIZE_JOBS', ''))
        pool = Pool(processes=num_proc)
    except ValueError:
        pool = Pool()

    hash_db = load_hashes(HASH_FILE)
    # Keep changed pxi/pxd hashes in a separate dict until the end
    # because if we update hash_db and multiple files include the same
    # .pxi file the changes won't be detected.
    dep_hashes = {}

    # Run any _generate_pyx.py scripts
    jobs = []
    for cur_dir, dirs, files in os.walk(root_dir):
        generate_pyx = os.path.join(cur_dir, '_generate_pyx.py')
        if os.path.exists(generate_pyx):
            jobs.append(generate_pyx)

    for result in pool.imap_unordered(lambda fn: process_generate_pyx(fn, lock), jobs):
        pass

    # Process pyx files
    jobs = []
    for cur_dir, dirs, files in os.walk(root_dir):
        for filename in files:
            in_file = os.path.join(cur_dir, filename + ".in")
            if filename.endswith('.pyx') and os.path.isfile(in_file):
                continue
            for fromext, function in rules.items():
                if filename.endswith(fromext):
                    toext = ".c"
                    with open(os.path.join(cur_dir, filename), 'rb') as f:
                        data = f.read()
                        m = re.search(br"^\s*#\s*distutils:\s*language\s*=\s*c\+\+\s*$", data, re.I|re.M)
                        if m:
                            toext = ".cxx"
                    fromfile = filename
                    tofile = filename[:-len(fromext)] + toext
                    jobs.append((cur_dir, fromfile, tofile, function,
                                 hash_db, dep_hashes, lock))

    for result in pool.imap_unordered(lambda args: process(*args), jobs):
        pass

    hash_db.update(dep_hashes)
    save_hashes(hash_db, HASH_FILE)
Example #3
def SerializeHtmlTraces(results):
    """Creates html trace files for each story run, if necessary.

  For each story run, takes all trace files from individual trace agents
  and runs trace2html on them. This is done only once, subsequent calls to this
  function will not do anything.

  TODO(crbug.com/981349): Remove this function entirely when trace
  serialization has been handed over to results processor.
  """
    assert not results.current_story_run, 'Cannot serialize traces while running.'

    def _GetCpuCount():
        try:
            return multiprocessing.cpu_count()
        except NotImplementedError:
            # Some platforms can raise a NotImplementedError from cpu_count()
            logging.warn('cpu_count() not implemented.')
            return 8

    available_runs = list(run for run in results.IterRunsWithTraces())
    if not available_runs:
        return

    # Note that this is speculatively halved as an attempt to fix
    # crbug.com/953365.
    threads_count = min(_GetCpuCount() / 2 or 1, len(available_runs))
    pool = ThreadPool(threads_count)
    try:
        for _ in pool.imap_unordered(_SerializeHtmlTraceInPool,
                                     available_runs):
            pass
    finally:
        pool.terminate()
        pool.join()
Example #4
    def load(cls, docs, ignore_errors=False):
        """Force load the provided docs to read from file system."""
        if not docs:
            return

        pod = docs[0].pod

        def load_func(doc):
            """Force the doc to read the source file."""
            try:
                # pylint: disable=pointless-statement
                doc.has_serving_path()  # Using doc fields forces file read.
            except document_front_matter.BadFormatError:
                if not ignore_errors:
                    raise

        with pod.profile.timer('DocsLoader.load'):
            if ThreadPool is None or len(docs) < cls.MIN_POOL_COUNT:
                for doc in docs:
                    load_func(doc)
                return
            pool_size = min(cls.MAX_POOL_SIZE, len(docs) * cls.POOL_RATIO)
            pool_size = int(round(pool_size))
            thread_pool = ThreadPool(pool_size)
            results = thread_pool.imap_unordered(load_func, docs)
            # Loop results to make sure that the threads are all processed.
            for _ in results:
                pass
            thread_pool.close()
            thread_pool.join()
Example #5
def _download_all(items):
    """Async download of the files.

       Example: [(url, quality, file_path)]

    """

    global WORKERS
    # Don't start more workers than 1:1
    if WORKERS < len(items):
        WORKERS = len(items)

    pool = ThreadPool(WORKERS)
    chunks = 1  # TODO
    # 1 ffmpeg is normally 10x- 20x * 2500kbits ish
    # so depending on how many items you download and
    # your bandwidth you might need to tweak chunk

    results = pool.imap_unordered(dl, items, chunks)
    try:
        for j in tqdm.tqdm(results, total=len(items)):
            pass
    finally:
        pool.close()
        pool.join()
Example #6
def _maybe_convert_set(extracted_dir, source_csv, target_csv):
    print()
    if path.exists(target_csv):
        print('Found CSV file "%s" - not importing "%s".' % (target_csv, source_csv))
        return
    print('No CSV file "%s" - importing "%s"...' % (target_csv, source_csv))
    samples = []
    with open(source_csv) as source_csv_file:
        reader = csv.DictReader(source_csv_file)
        for row in reader:
            samples.append((row['filename'], row['text']))

    # Mutable counters for the concurrent embedded routine
    counter = { 'all': 0, 'too_short': 0, 'too_long': 0 }
    lock = RLock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        mp3_filename = path.join(*(sample[0].split('/')))
        mp3_filename = path.join(extracted_dir, mp3_filename)
        # Storing wav files next to the mp3 ones - just with a different suffix
        wav_filename = path.splitext(mp3_filename)[0] + ".wav"
        _maybe_convert_wav(mp3_filename, wav_filename)
        frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
        file_size = path.getsize(wav_filename)
        with lock:
            if int(frames/SAMPLE_RATE*1000/10/2) < len(str(sample[1])):
                # Excluding samples that are too short to fit the transcript
                counter['too_short'] += 1
            elif frames/SAMPLE_RATE > MAX_SECS:
                # Excluding very long samples to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # This one is good - keep it for the target CSV
                rows.append((wav_filename, file_size, sample[1]))
            counter['all'] += 1

    print('Importing mp3 files...')
    pool = Pool(cpu_count())
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
        bar.update(i)
    bar.update(num_samples)
    pool.close()
    pool.join()

    print('Writing "%s"...' % target_csv)
    with open(target_csv, 'w') as target_csv_file:
        writer = csv.DictWriter(target_csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()
        bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
        for filename, file_size, transcript in bar(rows):
            writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript })

    print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
Example #7
    def upfront_scan_hosts(self, hosts, command_label):
        logger.verbose("scan_hosts() - command label : " + command_label )
        pool = ThreadPool(self.args.threadPool)
        self.phase_commands = []
        nmap_path = os.path.join(self.args.outputFolder, __nmap_folder__)
        # Check folder for existing service
        if not os.path.exists(nmap_path):
            os.makedirs(nmap_path)
        for host in hosts:
            command_keys = {
                'output': os.path.join(nmap_path, command_label.replace(" ","_")+"_"+host.strip().replace(".", "_") ),
                'target': host.strip()}
            command = self.prepare_command( command_label ,command_keys )
            base, filename = os.path.split(command_keys['output']) # Resume file already exists
            if not self.args.noResume and self.find_files(base, filename + ".*").__len__() > 0:
                logger.verbose("scan_hosts() - RESUME - output file already exists: "
                               + command_keys['output'])
            else:
                self.phase_commands.append(command)
                logger.debug("scan_hosts() - command : " + command)

        #results = pool.map(self.execute_scan, self.phase_commands)
        for _ in bar(pool.imap_unordered(self.execute_command, self.phase_commands), expected_size=len(self.phase_commands)):
            pass
        pool.close()
        pool.join()
Example #8
def check_vm_connectivity(env, os_conn, vm_keypair=None, timeout=4 * 60):
    """Check that all vms can ping each other and public ip"""
    ping_plan = {}
    exc = []

    def check(args):
        server, ips_to_ping = args
        try:
            check_ping_from_vm(env, os_conn, server, vm_keypair, ips_to_ping,
                               timeout=timeout)
        except AssertionError as e:
            return e

    servers = os_conn.get_servers()
    for server1 in servers:
        ips_to_ping = [settings.PUBLIC_TEST_IP]
        for server2 in servers:
            if server1 == server2:
                continue
            ips_to_ping += os_conn.get_nova_instance_ips(
                server2).values()
        ping_plan[server1] = ips_to_ping
    p = Pool(len(ping_plan))
    for result in p.imap_unordered(check, ping_plan.items()):
        if result is not None:
            exc.append(result)
    if len(exc) > 0:
        raise MultipleAssertionErrors(exc)
Example #9
def get_all_packages(save_to, batch_size=1000):

    num_exceptions = 0
    collected_data = []

    pool = ThreadPool(multiprocessing.cpu_count())
    file_batches = _batch_list(batch_size, all_packages)

    print('Getting all packages...')
    for batch_results, n_exceptions in tqdm(
            pool.imap_unordered(_get_packages_worker, file_batches),
            total=len(all_packages) // batch_size + 1):

        # Add to global list
        collected_data.extend(batch_results)
        num_exceptions += n_exceptions

    pool.close()
    pool.join()

    # Log and save at completion
    print(
        f'There were {num_exceptions} exceptions out of {len(all_packages)} requests.'
    )
    print(f'Saving data to /data...')

    with open(save_to, 'w', encoding='utf-8') as f:
        ujson.dump(
            {
                "data": collected_data,
                "timestamp": time.time(),
                "pypi_api_url": api_url + "/<PACKAGE_NAME>/json",
            }, f)

    print(f'Saved data to: {save_to}')
Example #10
    def _initialize_len(self):
        """

        """
        print("Initializing Stream")
        if self.jobs == 1:
            lengths = list(
                tqdm(map(self._get_len, enumerate(self.filenames)),
                     total=len(self.filenames),
                     file=sys.stdout,
                     desc="Post Counter"))
        else:
            mp = Threads(self.jobs)
            lengths = list(
                tqdm(mp.imap_unordered(self._get_len,
                                       enumerate(self.filenames)),
                     total=len(self.filenames),
                     file=sys.stdout,
                     desc="Post Counter"))
            _ = mp.close()
        if self.kind == "post":
            self.len_ = sum([i[1] for i in lengths])
        else:
            self.len_ = len([i for i in lengths if i[1] > 0])
        self.filenames = [self.filenames[i[0]] for i in lengths if i[1] > 0]
Example #11
def add_items_concurrently(users):
    print("Sending add item requests...")
    p = Pool(len(users))
    start = time.time()
    for response in p.imap_unordered(add_item, users):
        print("{} (Time elapsed: {}s)".format(response,
                                              int(time.time() - start)))
Example #12
def login_concurrently(users):
    print("Sending login requests...")
    p = Pool(len(users))
    start = time.time()
    for response in p.imap_unordered(login, users):
        print("{} (Time elapsed: {}s)".format(response,
                                              int(time.time() - start)))
Example #13
def processJobs(jobs, concurrentTasks, sortOutput=False):
    job_count = len(jobs)
    logging.info("Processing {} job(s) with a concurrency of {}".format(
        job_count, concurrentTasks))

    if RANDOMIZE_JOBS: shuffle(jobs)

    pool = Pool(concurrentTasks)
    try:
        job_progress = 0
        for x in tqdm(pool.imap_unordered(worker, jobs), total=len(jobs)):
            job_progress += 1
            logging.info("{} out of {} staged jobs remaining".format(
                job_count - job_progress, job_count))
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        printAndLog(
            "\nReceived keyboard interrupt. Cleaning up and exiting...")
        pool.terminate()
        cleanup()
        sys.exit(1)
    except SystemExit:
        pool.terminate()
        sys.exit(1)
    if sortOutput: cleanup()
    print("\n")
Example #14
def apply_tokenizer(filenames,
                    cache_dir,
                    min_n=1,
                    max_n=1,
                    min_date=None,
                    max_date=None,
                    remove_retweets=False,
                    jobs=4):
    """

    """
    ## Tokenizer
    helper = partial(load_and_tokenize,
                     min_n=min_n,
                     max_n=max_n,
                     min_date=min_date,
                     max_date=max_date,
                     remove_retweets=remove_retweets,
                     cache_dir=cache_dir,
                     pretokenized=False)
    ## Initialize Pool
    mp = Pool(jobs)
    filenames = list(
        tqdm(mp.imap_unordered(helper, filenames),
             desc="Tokenizer",
             total=len(filenames),
             file=sys.stdout))
    _ = mp.close()
    ## Filename Map
    filenames = dict((y, x) for x, y in filenames)
    ## Return Filenames
    return filenames
Example #15
def scrape(array, function, threads):
    # Define the number of threads
    pool = ThreadPool(threads)
    # Tell the user what is happening
    print(
        f"Scraping {len(array)} items using {function} on {threads} threads.")
    # Calls function() and adds the filesize returned each call to an array called filesizes

    result = (pool.imap_unordered(function, array))
    pool.close()

    # Display progress as the scraper runs its processes
    while (len(array) > 1):
        completed = result._index

        # Break out of the loop if all tasks are done or if there is only one task
        if (completed == len(array)):
            sys.stdout.flush()
            sys.stdout.write('\r' + "")
            sys.stdout.flush()
            break

        # Avoid a ZeroDivisionError
        if completed > 0:
            sys.stdout.flush()
            sys.stdout.write(
                '\r' +
                f"{completed/len(array)*100:.0f}% done. {len(array)-completed} left. "
            )
            sys.stdout.flush()
        sys.stdout.flush()

    pool.join()
    return list(result)
Example #16
    def load(cls, docs):
        """Force load the provided docs to read from file system."""
        if not docs:
            return

        pod = docs[0].pod

        def load_func(doc):
            """Force the doc to read the source file."""
            # pylint: disable=pointless-statement
            doc.has_serving_path()  # Using doc fields forces file read.

        with pod.profile.timer('DocsLoader.load'):
            if ThreadPool is None or len(docs) < cls.MIN_POOL_COUNT:
                for doc in docs:
                    load_func(doc)
                return
            pool_size = min(cls.MAX_POOL_SIZE, len(docs) * cls.POOL_RATIO)
            pool_size = int(round(pool_size))
            thread_pool = ThreadPool(pool_size)
            results = thread_pool.imap_unordered(load_func, docs)
            # Loop results to make sure that the threads are all processed.
            for _ in results:
                pass
            thread_pool.close()
            thread_pool.join()
Example #17
def main(args):
    print(args)
    pool = Pool()
    protein_name = os.path.splitext(os.path.basename(args.file))[0]
    with open(args.settings, 'rb') as f:
        reader = csv.reader(f, delimiter=';')
        header = reader.next()
        col_idx = dict(itertools.izip(header, xrange(len(header))))
        # Now we can get a column index by name: `col_idx['Age']`
        settings_list = [row for row in reader]
    
    commands = list()
    for row in settings_list:
        dab_shift = int(row[col_idx['DAB shift']])
        hem_shift = int(row[col_idx['HEM shift']])
        fileout = os.path.join(args.out, protein_name + "_d%d-h%d.csv" % (dab_shift, hem_shift))
        shstr = "python2 cli_hpa.py %s %f --dab-shift %d --hem-shift %d --mp-disable --quiet --out %s" % (
            args.file, args.scale, dab_shift, hem_shift, fileout)
        commands.append(shstr)
    print(commands)
#     quit()
    for i, returncode in enumerate(pool.imap_unordered(partial(subprocess.call, shell=True), commands)):
        print("Let's play! %d" % i)
        if returncode != 0:
            print("%d command failed: %d" % (i, returncode))
Example #18
def _maybe_convert_set(target_csv):
    def one_sample(sample):
        if is_audio_file(sample):
            sample = os.path.join(target_csv, sample)

            y, sr = librosa.load(sample, sr=16000)

            # Trim the beginning and ending silence
            yt, index = librosa.effects.trim(y)  # pylint: disable=unused-variable

            duration = librosa.get_duration(yt, sr)
            if duration > MAX_SECS or duration < MIN_SECS:
                os.remove(sample)
            else:
                librosa.output.write_wav(sample, yt, sr)

    samples = sorted(os.listdir(target_csv))

    num_samples = len(samples)

    print(f"Converting wav files to {SAMPLE_RATE}hz...")
    pool = Pool(cpu_count())
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
        bar.update(i)
    bar.update(num_samples)
    pool.close()
    pool.join()
Example #19
def main():
    start = 0
    end = 5000
    problems = download_tasks_with_tags()
    problems_batch = problems['result']['problems'][start:end]
    thread_pool = ThreadPool(10)
    list(thread_pool.imap_unordered(process_one_problem, problems_batch))
Example #20
class Attack():
    def __init__(self, template, start, end, payload=""):
        self.template = template
        self.start = start
        self.end = end
        self.payload = payload
        self.range = int(end.bit_length() / 4)
        self.pool = Pool(5)

    def hex(self, num):
        num = f"{num:x}"
        return str(num).zfill(self.range)

    def f(self, l):
        t = self.template.format(self.hex(l))
        x = gzdeflate(unhexlify(t))
        return (x, t)

    def attack(self):
        for result in self.pool.imap_unordered(self.f,
                                               range(self.start,
                                                     self.end + 1)):
            if self.payload.encode() in result[0]:
                print("{},{}".format(self.payload, result[1]))
                return result[1]
Example #21
    def download_all(self, threads=32) -> None:
        '''Handles multiprocessing using ThreadPool; sends items from a list to a function and gets the results as a list'''
        pool = ThreadPool(threads)
        lst = self.get_urls()
        print(
            f"Downloading {len(lst)} items using {self.download_zip} in {threads} processes."
        )
        result = (pool.imap_unordered(self.download_zip, lst))
        pool.close()

        # Display progress as the scraper runs its processes
        while (len(lst) > 1):
            completed = result._index
            # Break out of the loop if all tasks are done or if there is only one task
            if (completed == len(lst)):
                sys.stdout.flush()
                sys.stdout.write('\r' + "")
                sys.stdout.flush()
                break
            # Avoid a ZeroDivisionError
            if completed > 0:
                sys.stdout.flush()
                sys.stdout.write(
                    '\r' +
                    f"{completed/len(lst)*100:.0f}% done. {len(lst)-completed} left. "
                )
                sys.stdout.flush()
            sys.stdout.flush()
        pool.join()
        return list(result)
Example #22
    def render_images(self, annotations, predictions, images_dir):
        """Runs render script to render images and store them into images_dir

        Args:
            annotations (list of tuples: (formula, file_idx, folder_path)): Ground-truth formula
            predictions (list of tuples: (formula, file_idx, folder_path)): Predicted formula
        """
        out_path_gold = os.path.join(images_dir, 'images_gold')
        out_path_pred = os.path.join(images_dir, 'images_pred')
        for dir_ in [out_path_gold, out_path_pred]:
            if not os.path.exists(dir_):
                os.makedirs(dir_)
        annotations = [(elem[0], elem[1], out_path_gold)
                       for elem in annotations]
        predictions = [(elem[0], elem[1], out_path_pred)
                       for elem in predictions]
        lines = annotations + predictions
        print('Creating render pool with {} threads'.format(self.num_threads))
        pool = ThreadPool(self.num_threads)
        print('Jobs running...')
        pairs_images_rendered = 0
        for num, _ in enumerate(pool.imap_unordered(render_routine, lines)):
            if num % (PRINT_FREQ * 2) == 0 and num != 0:
                pairs_images_rendered += PRINT_FREQ
                # 2x PRINT_FREQ because images are rendered by pairs (original + predicted)
                print('{} / {} images rendered'.format(pairs_images_rendered,
                                                       len(lines) // 2))
        print('All images rendered')
        pool.close()
        pool.join()
        return out_path_gold, out_path_pred
Example #23
    def ComputeTimelineBasedMetrics(self):
        assert not self._current_page_run, 'Cannot compute metrics while running.'

        def _GetCpuCount():
            try:
                return multiprocessing.cpu_count()
            except NotImplementedError:
                # Some platforms can raise a NotImplementedError from cpu_count()
                logging.warn('cpu_count() not implemented.')
                return 8

        runs_and_values = self._FindRunsAndValuesWithTimelineBasedMetrics()
        if not runs_and_values:
            return

        # Note that this is speculatively halved as an attempt to fix
        # crbug.com/953365.
        threads_count = min(_GetCpuCount() / 2 or 1, len(runs_and_values))
        pool = ThreadPool(threads_count)
        try:
            for result in pool.imap_unordered(_ComputeMetricsInPool,
                                              runs_and_values):
                self._AddPageResults(result)
        finally:
            pool.terminate()
            pool.join()
Example #24
    def _http_requests_pool(self, urls, workers=10, chunk=None):
        """Generator function to request urls in chunks"""
        # From cpython
        if chunk is None:
            chunk, extra = divmod(len(urls), workers * 4)
            if extra:
                chunk += 1
            if len(urls) == 0:
                chunk = 0

        if len(urls) == 1:
            yield self._http_requests_single(urls[0])
        else:
            pool = ThreadPool(workers)

            try:
                for work in pool.imap_unordered(self._http_requests_single,
                                                urls, chunk):
                    yield work
            except Exception as e:
                if not self._silent:
                    logger.error("Failed to yield request: %s" % e)
            finally:
                pool.close()
                pool.join()
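For intuition, a small standalone sketch of the chunk-size heuristic used above (the example counts are illustrative):

def auto_chunk(n_urls, workers=10):
    # Mirrors the heuristic above: aim for roughly four chunks per worker.
    chunk, extra = divmod(n_urls, workers * 4)
    if extra:
        chunk += 1
    if n_urls == 0:
        chunk = 0
    return chunk

print(auto_chunk(130))   # divmod(130, 40) == (3, 10) -> chunk size 4
print(auto_chunk(1000))  # divmod(1000, 40) == (25, 0) -> chunk size 25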
Example #25
    def render_images(self, annotations, predictions, images_dir):
        """Runs render script to render images and store them into images_dir

        Args:
            annotations (str): Ground-truth formula
            predictions (str): Predicted formula
        """
        out_path_gold = os.path.join(images_dir, 'images_gold')
        out_path_pred = os.path.join(images_dir, 'images_pred')
        for dir_ in [out_path_gold, out_path_pred]:
            if not os.path.exists(dir_):
                os.makedirs(dir_)
        lines_gold = [(ann.label, ann.identifier, out_path_gold)
                      for ann in annotations]
        lines_pred = [(pred.label, pred.identifier, out_path_pred)
                      for pred in predictions]
        lines = lines_gold + lines_pred
        logging.info('Creating render pool with %s threads', self.num_threads)
        pool = ThreadPool(self.num_threads)
        logging.info('Jobs running...')
        pairs_images_rendered = 0
        for num, _ in enumerate(pool.imap_unordered(render_routine, lines)):
            if num % (PRINT_FREQ * 2) == 0 and num != 0:
                pairs_images_rendered += PRINT_FREQ
                # 2x PRINT_FREQ because images are rendered by pairs (original + predicted)
                print_info("{} / {} images rendered".format(
                    pairs_images_rendered,
                    len(lines) // 2))
        print_info("All images rendered")
        pool.close()
        pool.join()
Example #26
    def run(self):
        mkdir_p(self.intermediate_folder)
        mkdir_p(self.output_folder)

        color_values = self._extract_colors()
        self.logger.debug('Found {} unique colors: {}'.format(
            len(color_values), color_values))

        manifest = {}

        def render_color(color):
            file_name = self._export_stl(color)
            manifest[file_name] = ColoredStlExporter.parse_openscad_color(
                color)

        pool = Pool()
        for _ in pool.imap_unordered(render_color, color_values):
            # Consume results as they occur so any exception is rethrown
            pass
        pool.close()
        pool.join()

        with open(os.path.join(self.output_folder, 'manifest.json'),
                  'wb') as f:
            f.write(json.dumps(manifest, indent=4))
Example #27
def get_durations(paths, print_detail=True):
    duration_all = 0
    duration_book = defaultdict(list)

    pool = Pool()
    iterator = pool.imap_unordered(get_duration, paths)
    for dataset, duration in tqdm(iterator, total=len(paths)):
        duration_all += duration
        duration_book[dataset].append(duration)

    total_count = 0
    for book, duration in duration_book.items():
        if book:
            time = second_to_hour(sum(duration))
            file_count = len(duration)
            total_count += file_count

            if print_detail:
                print(" [*] Duration of {}: {} (file #: {})". \
                        format(book, time, file_count))

    print(" [*] Total Duration : {} (file #: {})". \
            format(second_to_hour(duration_all), total_count))
    print()
    return duration_all
Example #28
    def get_continue_cut_multiprocessing(self,
                                         data,
                                         multiprocessing_type=1
                                         ):  # default 1 = multiprocessing, otherwise multithreading
        logging.info('Multiprocessing version: optimal binning of continuous variables in progress...')
        self.save_data(data)
        if multiprocessing_type == 1:
            logging.info('Multiprocessing enabled, optimal binning in progress...')
            pool = Pool(multiprocessing.cpu_count())  # number of processes is typically the CPU count
        else:
            logging.info('Multithreading enabled, optimal binning in progress...')
            pool = ThreadPool(multiprocessing.cpu_count() *
                              2)  # number of threads is typically 2x the CPU count
        cols = [col for col, col_type in self.col_type if col_type == 1]
        # pool.imap_unordered(self.get_cut_all_not_null_multiprocessing, cols)
        for i in tqdm(pool.imap_unordered(
                self.get_cut_all_not_null_multiprocessing, cols),
                      total=len(cols),
                      leave=False):
            pass
        pool.close()
        pool.join()
        self.transform_cut_points_list()

        self.del_data()
        logging.info('Multiprocessing version: optimal binning of continuous variables complete!')
Example #29
def create_accounts_concurrently(users):
    print("Sending create account requests...")
    p = Pool(min(len(users), 5 * multiprocessing.cpu_count()))
    start = time.time()
    for response in p.imap_unordered(create_account, users):
        print("{} (Time elapsed: {}s)".format(response,
                                              int(time.time() - start)))
Example #30
def vectorize_files(filenames,
                    min_date=None,
                    max_date=None,
                    min_n=1,
                    max_n=1,
                    remove_retweets=False,
                    jobs=4,
                    pretokenized=False):
    """

    """
    ## Initialize Helper
    vectorizer = partial(_vectorize_file,
                         min_date=min_date,
                         max_date=max_date,
                         min_n=min_n,
                         max_n=max_n,
                         remove_retweets=remove_retweets,
                         pretokenized=pretokenized)
    ## Vectorize using Multiprocessing
    mp = Pool(jobs)
    results = list(
        tqdm(mp.imap_unordered(vectorizer, filenames),
             total=len(filenames),
             desc="Vectorizing Files",
             file=sys.stdout))
    _ = mp.close()
    ## Parse Results
    filenames = [r[0] for r in results]
    X = vstack(r[1] for r in results)
    return filenames, X
Example #31
    def getoutput_ManyJobs(self, listOfJobids):
        """Waits for a job to complete and then returns its standard output
        and standard error data if the files were given default names.
        """

        pool = Pool()
        for (jobid, hasFinished) in pool.imap_unordered(
                self.waitUntilSignalOfEnd,
                tuple(jobid for jobid in listOfJobids)):
            if hasFinished:
                print('return logs of job', jobid, file=sys.stderr)
            else:
                print('job',
                      jobid,
                      'has not finished, over max allowed running time',
                      file=sys.stderr)
            signalOfEndFileName = self.signalOfEndFileName % str(jobid)
            try:
                os.remove(signalOfEndFileName)
            except:
                pass
            outFileName = self.outFileName % str(jobid)
            errFileName = self.errFileName % str(jobid)
            yield jobid, outFileName, errFileName

        os.remove(self.wrapperExecFileName)
Example #32
def ComputeTimelineBasedMetrics(results):
    """Compute TBMv2 metrics on all story runs in parallel."""
    assert not results.current_story_run, 'Cannot compute metrics while running.'

    def _GetCpuCount():
        try:
            return multiprocessing.cpu_count()
        except NotImplementedError:
            # Some platforms can raise a NotImplementedError from cpu_count()
            logging.warn('cpu_count() not implemented.')
            return 8

    available_runs = list(run for run in results.IterRunsWithTraces()
                          if run.tbm_metrics)
    if not available_runs:
        return

    # Note that this is speculatively halved as an attempt to fix
    # crbug.com/953365.
    threads_count = min(_GetCpuCount() / 2 or 1, len(available_runs))
    pool = ThreadPool(threads_count)
    metrics_runner = lambda run: _ComputeMetricsInPool(run, results.label,
                                                       results.upload_bucket)

    try:
        for result in pool.imap_unordered(metrics_runner, available_runs):
            results.AddMetricPageResults(result)
    finally:
        pool.terminate()
        pool.join()
Example #33
def main():
    urls = [
      'http://www.python.org',
      'https://stackoverflow.com/',
      'https://css-tricks.com/',
      'https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference',
      'https://dev.twitter.com/',
      'https://d3js.org/',
      'https://www.heroku.com/',
      'https://docs.pytest.org/en/latest/',
      'https://www.djangoproject.com/',
      'https://pudding.cool/',
      'https://caniuse.com/',
      'http://svgpocketguide.com/book/',
      'https://www.w3.org/TR/SVG/intro.html',
      ]

    pool = Pool()
    start = time.time()
    for x, y in pool.imap_unordered(url_name, urls):
        index = urls.index(y)
        log.info("{}s (sleep: {}) (#{} in array) for {})"
                 .format(int(time.time() - start), x, index, y))
    pool.close()
    pool.join()
Example #34
    def parse_page(self, html):
        """
            Parse the log-in page and extract links.
            Args:
                html: log-in page content
        """

        parser = LinkParser()
        parser.feed(html)

        if not len(LINKS):
            print '\n no links extracted from log-in page\n'
            sys.exit(1)

        print '\n %i links found in first page ...' % len(LINKS)

        start = timer()
        pool = ThreadPool(NUMBER_THREADS)
        results = pool.imap_unordered(self.parse_page_links, LINKS)
        pool.close()
        pool.join()

        print '\n %i links found in spidered pages' % len(LINKS)
        print '\n link searches: %s secs\n' % str.format(
            '{0:.3f}', (timer() - start))
Example #35
def download_external_resources(container,
                                urls,
                                timeout=60,
                                progress_report=lambda url, done, total: None):
    failures = {}
    replacements = {}
    data_uri_map = {}
    with TemporaryDirectory('editor-download') as tdir:
        pool = Pool(10)
        with closing(pool):
            for ok, result in pool.imap_unordered(
                    partial(download_one, tdir, timeout, progress_report,
                            data_uri_map), urls):
                if ok:
                    url, suggested_filename, downloaded_file, mt = result
                    with lopen(downloaded_file, 'rb') as src:
                        name = container.add_file(suggested_filename,
                                                  src,
                                                  mt,
                                                  modify_name_if_needed=True)
                    replacements[url] = name
                else:
                    url, err = result
                    failures[url] = err
    return replacements, failures
Example #36
    def _general_processor(
            cls: Type[Artwork],
            item_ids: List[int]) -> Tuple[List[Artwork], List[int]]:
        util.log(texts.ARTWORK_ID_PROCESSING, start=os.linesep, inform=True)
        total = len(item_ids)
        successes = []
        fails = []
        pool = Pool()

        def process_item(item_id_):
            try:
                successes.append(cls(item_id_))
            except ArtworkError:
                fails.append(item_id_)

        for index, item_id in enumerate(
                pool.imap_unordered(process_item, item_ids), 1):
            util.print_progress(index,
                                total,
                                msg=texts.GUI_ID_PROCESSING_HEADING)
        msg = texts.ARTWORK_ID_PROCESS_RESULT.format(total=total,
                                                     successes=len(successes),
                                                     fails=len(fails))
        util.print_done(msg)
        return successes, fails
Example #37
    def _http_requests_pool(self, urls, workers=10, chunk=None):
        """Generator function to request urls in chunks"""
        # From cpython
        if chunk is None:
            chunk, extra = divmod(len(urls), workers * 4)
            if extra:
                chunk += 1
            if len(urls) == 0:
                chunk = 0

        if self.ssl_verify:
            session = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                          ca_certs=certifi.where())
        else:
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
            session = urllib3.PoolManager()
        part = partial(self._http_requests_urllib3, session=session)

        if len(urls) == 1:
            yield part(urls[0])
        else:
            pool = ThreadPool(workers)

            try:
                for work in pool.imap_unordered(part, urls, chunk):
                    yield work
            except Exception as e:
                logger.error(u"Failed to yield request: %s" % e)
            finally:
                pool.close()
                pool.join()
Example #38
def process(lst: list, function, processes: int):
    """Handles multiprocessing using ThreadPool; sends items from a list to a function and gets the results as a list"""
    # Define the number of processes, use less than or equal to the defined value
    count_threads = min(processes, len(lst))
    if count_threads == 0:
        return []
    pool = ThreadPool(count_threads)

    # Tell the user what is happening
    print(f"Copying {len(lst)} items using {function} in {count_threads} processes.")

    # Calls function() and returns True for success and False for fail each call to a lst
    result = (pool.imap_unordered(function, lst))
    pool.close()

    # Display progress as the scraper runs its processes
    while (len(lst) > 1):
        completed = result._index

        # Break out of the loop if all tasks are done or if there is only one task
        if (completed == len(lst)):
            sys.stdout.flush()
            sys.stdout.write('\r' + "")
            sys.stdout.flush()
            break

        # Avoid a ZeroDivisionError
        if completed > 0:
            sys.stdout.flush()
            sys.stdout.write('\r' + f"{completed/len(lst)*100:.0f}% done. {len(lst)-completed} left. ")
            sys.stdout.flush()
        sys.stdout.flush()

    pool.join()
    return list(result)
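A brief usage sketch for the process() helper above; the copy_item worker and the job list are illustrative assumptions, and the snippet's own imports (sys, ThreadPool) are assumed to be present in the module.

import shutil

def copy_item(pair):
    # Hypothetical worker: copy one (src, dst) pair, returning True on success.
    src, dst = pair
    try:
        shutil.copy2(src, dst)
        return True
    except OSError:
        return False

jobs = [('a.txt', 'backup/a.txt'), ('b.txt', 'backup/b.txt')]
results = process(jobs, copy_item, 4)
print(sum(results), 'of', len(results), 'items copied')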
Example #39
def img_rescaler(dir_in, extension_in, threads=1):
    """ 
    Import an image, rescale it to normal UBYTE (0-255, 8 bit) range, and re-save it.
    
    """

    dir_out = os.path.join(dir_in, "rescaled")
    
    total_files = 0
    for path, folder, filename in os.walk(dir_in):
        if dir_out not in path:
            for f in filename:
                if f.endswith(extension_in):
                    total_files += 1
    print("\nYou have {} images to analyze".format(total_files))
    
    for path, folder, filename in os.walk(dir_in):
        if dir_out not in path:   # Don't run in the output directory.

            # Make directory for saving objects
            subpath = path[len(dir_in)+1:]
            if not os.path.exists(os.path.join(dir_out, subpath)):
                os.mkdir(os.path.join(dir_out, subpath))

            # What we'll do:
            global _core_fn  # bad form for Pool.map() compatibility
            def _core_fn(filename):
                if filename.endswith(extension_in):
                    # count progress.

                    path_in = os.path.join(path, filename)
                    subpath_in = os.path.join(subpath, filename) # for printing purposes
                    path_out = os.path.join(dir_out, subpath, filename)

                    if os.path.exists(path_out): #skip
                        print("\nALREADY ANALYZED: {}. Skipping...\n".format(subpath_in))

                    else: #(try to) do it
                        try:
                            img = io.imread(path_in)  # load image
                            img = img_as_ubyte(img / np.max(img))
                            io.imsave(path_out, img)
                        except:
                            print("Couldn't analyze {}".format(subpath_in))
                return()
            
            # run it
            sleep(1)  # to give everything time to  load
            thread_pool = Pool(threads)
            # Work on _core_fn (and give progressbar)
            # Consume the iterator so the progress bar advances and worker errors surface.
            list(tqdm.tqdm(thread_pool.imap_unordered(_core_fn,
                                                      filename,
                                                      chunksize=1),
                           total=total_files))
            # finish
            thread_pool.close()
            thread_pool.join()
    return()
Example #40
def calc_factorials(max_int=100, pool_size=8, threads=True, chunk_size=10):
    
    if threads:
        pool = ThreadPool(pool_size)
    else:
        pool = ProcessPool(pool_size)
        
    results = pool.imap_unordered(factorial_calc, range(max_int), chunk_size)

    
    return results
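A hedged usage sketch for calc_factorials above: the returned iterator is lazy, so the caller has to consume it to drive the computation. The factorial_calc worker and the pool aliases below are assumptions about what the surrounding module provides.

import math
from multiprocessing.pool import ThreadPool
from multiprocessing import Pool as ProcessPool  # assumed aliases used above

def factorial_calc(n):
    # Assumed worker: return the input together with its factorial.
    return n, math.factorial(n)

for n, value in calc_factorials(max_int=20, pool_size=4, threads=True, chunk_size=5):
    print(n, value)  # results arrive in completion order, not input order

Note that the pool created inside calc_factorials is never closed, so its workers stay alive for the lifetime of the process.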
Example #41
def main():
    input_filename, output_dir, n_threads = parse_args()

    if not os.path.isdir(output_dir):
        print("Output directory {} does not exist".format(output_dir))
        sys.exit()

    with open(input_filename) as input_file:
        reader = csv.reader(input_file)
        header_row = next(reader)
        rows = list(reader)
    try:
        row_idx_image_id = header_row.index('ImageId')
        row_idx_url = header_row.index('URL')
        row_idx_x1 = header_row.index('x1')
        row_idx_y1 = header_row.index('y1')
        row_idx_x2 = header_row.index('x2')
        row_idx_y2 = header_row.index('y2')
    except ValueError as e:
        print('One of the columns was not found in the source file: ',
              e.message)

    rows = [(row[row_idx_image_id], row[row_idx_url], float(row[row_idx_x1]),
             float(row[row_idx_y1]), float(row[row_idx_x2]),
             float(row[row_idx_y2])) for row in rows]

    if n_threads > 1:
        pool = ThreadPool(n_threads)
        partial_get_images = partial(get_image, output_dir=output_dir)
        for i, _ in enumerate(pool.imap_unordered(partial_get_images, rows),
                              1):
            sys.stderr.write('\rDownloaded {0} images'.format(i))
        pool.close()
        pool.join()
    else:
        failed_to_download = set()
        for idx in range(len(rows)):
            row = rows[idx]
            if not download_image(image_id=row[0],
                                  url=row[1],
                                  x1=float(row[2]),
                                  y1=float(row[3]),
                                  x2=float(row[4]),
                                  y2=float(row[5]),
                                  output_dir=output_dir):
                failed_to_download.add(row[row_idx_image_id])
            sys.stdout.write('\rDownloaded {0} images'.format(idx + 1))
            sys.stdout.flush()

        print()
        if failed_to_download:
            print('\nUnable to download images with the following IDs:')
            for image_id in failed_to_download:
                print(image_id)
Example #42
def run_task_multi_thread(action_function, files, action_label, nb_threads=2, offset=0):
    """Run given action on every files using a threading pool.
       It uses a progress bar instead of a usual verbose log.
    """
    pool = Pool(processes=nb_threads)
    items = [(file, action_function) for file in files[offset:]]
    pool_iterable = pool.imap_unordered(run_single_task, items)
    progress_bar_items = tqdm(total=len(items),
                              iterable=pool_iterable,
                              unit='images',
                              desc='{0: <30}'.format(action_label))
    for item in progress_bar_items:
        pass
Example #43
def match(bot, opponent):
	# List of match's results
	results = []

	# List of matches to perform
	matches = [[bot, opponent]] * args.count

	# Threads the matches and collect results
	pool = Pool(args.threads)
	for match in pool.imap_unordered(perform, matches):
		results.append(match)

	return stat_create(results)
Example #44
class ScannerPool:
    # @classmethod
    # def getPool(cls):
    #     if "pool" not in cls.__dict__ or cls.pool is None:
    #         logger.info("Threads pool created with %d threads" % THREAD_NUMBER)
    #         cls.pool = Pool(THREAD_NUMBER)
    #         return cls.pool

    def __init__(self):
        self.pool = Pool(THREAD_NUMBER)


    def map(self, *args, **kwargs):
        return self.pool.imap_unordered(*args, **kwargs)
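A minimal usage sketch; the Pool import, THREAD_NUMBER, and the probe worker below are assumptions, since the snippet does not show the module's imports.

from multiprocessing.dummy import Pool  # assumed: a thread pool, as the commented-out log message suggests

THREAD_NUMBER = 8  # assumed module-level constant

def probe(host):
    # Hypothetical worker standing in for a real scan.
    return host, len(host)

scanner = ScannerPool()
for host, score in scanner.map(probe, ['10.0.0.1', '10.0.0.2', '10.0.0.3']):
    print(host, score)  # results arrive in completion order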
Example #45
def build_common(out_name='common.a', build_dir='temp_build/temp_build', num_parallel=1):
    compiler = os.environ.get('CXX', 'g++')
    ar = os.environ.get('AR', 'ar')
    libtool = os.environ.get('LIBTOOL', 'libtool')
    cflags = os.environ.get('CFLAGS', '') + os.environ.get('CXXFLAGS', '')

    for file in COMMON_FILES:
        outfile = os.path.join(build_dir, os.path.splitext(file)[0] + '.o')
        outdir = os.path.dirname(outfile)
        if not os.path.exists(outdir):
            print('mkdir', outdir)
            os.makedirs(outdir)

    def build_one(file):
        outfile = os.path.join(build_dir, os.path.splitext(file)[0] + '.o')
        if os.path.exists(outfile):
            return

        cmd = '{cc} -fPIC -c {cflags} {args} {includes} {infile} -o {outfile}'.format(
            cc=compiler,
            cflags=cflags,
            args=' '.join(ARGS),
            includes=' '.join('-I' + i for i in INCLUDES),
            infile=file,
            outfile=outfile,
        )
        print(cmd)
        subprocess.check_call(shlex.split(cmd))
        return outfile

    pool = Pool(num_parallel)
    obj_files = list(pool.imap_unordered(build_one, COMMON_FILES))

    if sys.platform.startswith('darwin'):
        cmd = '{libtool} -static -o {outfile} {infiles}'.format(
            libtool=libtool,
            outfile=out_name,
            infiles=' '.join(obj_files),
        )
        print(cmd)
        subprocess.check_call(shlex.split(cmd))
    else:
        cmd = '{ar} rcs {outfile} {infiles}'.format(
            ar=ar,
            outfile=out_name,
            infiles=' '.join(obj_files)
        )
        print(cmd)
        subprocess.check_call(shlex.split(cmd))
Example #46
def download_external_resources(container, urls, timeout=60, progress_report=lambda url, done, total: None):
    failures = {}
    replacements = {}
    with TemporaryDirectory('editor-download') as tdir:
        pool = Pool(10)
        with closing(pool):
            for ok, result in pool.imap_unordered(partial(download_one, tdir, timeout, progress_report), urls):
                if ok:
                    url, suggested_filename, downloaded_file, mt = result
                    with lopen(downloaded_file, 'rb') as src:
                        name = container.add_file(suggested_filename, src, mt, modify_name_if_needed=True)
                    replacements[url] = name
                else:
                    url, err = result
                    failures[url] = err
    return replacements, failures
Example #47
    def test_threaded(self):
        # add three more short subchains for threads to test on
        for ident in 'ghijklmno':
            obj = make_mock_relationship('test_db', 'schema', ident)
            self.cache.add(make_relation('dbt', 'schema', ident))

        self.cache.add_link(make_relation('dbt', 'schema', 'a'),
                            make_relation('dbt', 'schema', 'g'))
        self.cache.add_link(make_relation('dbt', 'schema', 'g'),
                            make_relation('dbt', 'schema', 'h'))
        self.cache.add_link(make_relation('dbt', 'schema', 'h'),
                            make_relation('dbt', 'schema', 'i'))

        self.cache.add_link(make_relation('dbt', 'schema', 'a'),
                            make_relation('dbt', 'schema', 'j'))
        self.cache.add_link(make_relation('dbt', 'schema', 'j'),
                            make_relation('dbt', 'schema', 'k'))
        self.cache.add_link(make_relation('dbt', 'schema', 'k'),
                            make_relation('dbt', 'schema', 'l'))

        self.cache.add_link(make_relation('dbt', 'schema', 'a'),
                            make_relation('dbt', 'schema', 'm'))
        self.cache.add_link(make_relation('dbt', 'schema', 'm'),
                            make_relation('dbt', 'schema', 'n'))
        self.cache.add_link(make_relation('dbt', 'schema', 'n'),
                            make_relation('dbt', 'schema', 'o'))

        pool = ThreadPool(4)
        results = list(pool.imap_unordered(self._target, ('b', 'g', 'j', 'm')))
        pool.close()
        pool.join()
        # at a minimum, we expect each table to "see" itself, its parent ('a'),
        # and the unrelated table ('a')
        min_expect = {
            'b': {'a', 'b', 'e'},
            'g': {'a', 'g', 'e'},
            'j': {'a', 'j', 'e'},
            'm': {'a', 'm', 'e'},
        }

        for ident, relations in results:
            seen = set(r.identifier for r in relations)
            self.assertTrue(min_expect[ident].issubset(seen))

        self.assert_has_relations(set('abgjme'))
Example #48
def render_rotation(output_folder, num_frames, start_frame, variables):
    def render_frame(i):
        angle = 135 + i * 360 / num_frames
        openscad.run(
            'splitflap.scad',
            os.path.join(output_folder, 'frame_%05d.png' % (start_frame + i)),
            output_size = [320, 240],
            camera_translation = [0, 0, 0],
            camera_rotation = [60, 0, angle],
            camera_distance = 600,
            variables = variables,
            colorscheme = 'Nature',
        )
    pool = Pool()
    for _ in pool.imap_unordered(render_frame, range(num_frames)):
        # Consume results as they occur so any exception is rethrown
        pass
    pool.close()
    pool.join()
Example #49
    def getoutput_ManyJobs(self, listOfJobids):
        """Waits for a job to complete and then returns its standard output
        and standard error data if the files were given default names.
        """

        pool = Pool()
        for (jobid, hasFinished) in pool.imap_unordered(self.waitUntilSignalOfEnd, tuple(jobid for jobid in listOfJobids)):
            if hasFinished:
                print >> sys.stderr, 'return logs of job', jobid
            else:
                print >> sys.stderr, 'job', jobid, 'has not finished, over max allowed running time'
            signalOfEndFileName = self.signalOfEndFileName % str(jobid)
            try:
                os.remove(signalOfEndFileName)
            except:
                pass
            outFileName = self.outFileName % str(jobid)
            errFileName = self.errFileName % str(jobid)
            yield jobid, outFileName, errFileName

        os.remove(self.wrapperExecFileName)
Example #50
    def run(self):
        mkdir_p(self.intermediate_folder)
        mkdir_p(self.output_folder)

        color_values = self._extract_colors()
        self.logger.debug('Found {} unique colors: {}'.format(len(color_values), color_values))

        manifest = {}

        def render_color(color):
            file_name = self._export_stl(color)
            manifest[file_name] = ColoredStlExporter.parse_openscad_color(color)

        pool = Pool()
        for _ in pool.imap_unordered(render_color, color_values):
            # Consume results as they occur so any exception is rethrown
            pass
        pool.close()
        pool.join()

        with open(os.path.join(self.output_folder, 'manifest.json'), 'wb') as f:
            f.write(json.dumps(manifest, indent=4))
Example #51
# http://stackoverflow.com/questions/16675803/learning-python-and-threading-i-think-my-code-runs-infinitely-help-me-find-bug

import json
import urllib2
from collections import Counter
from multiprocessing.dummy import Pool # use threads
import time

def get_name(url):
    try:
        return json.load(urllib2.urlopen(url))['gender']
    except Exception:
        return None # error

start = time.time()
urls = ('http://graph.facebook.com/%d' % i for i in xrange(200))
p = Pool(5) # 5 concurrent connections
first_names = Counter(p.imap_unordered(get_name, urls))
print first_names.most_common()
print 'It took %s s' % (time.time() - start)
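For reference, a sketch of the same pattern under Python 3 using only the standard library (whether graph.facebook.com still serves a 'gender' field without authentication is not guaranteed):

import json
import time
from collections import Counter
from multiprocessing.dummy import Pool  # use threads
from urllib.request import urlopen

def get_name(url):
    try:
        return json.load(urlopen(url))['gender']
    except Exception:
        return None  # error

start = time.time()
urls = ('http://graph.facebook.com/%d' % i for i in range(200))
p = Pool(5)  # 5 concurrent connections
first_names = Counter(p.imap_unordered(get_name, urls))
print(first_names.most_common())
print('It took %s s' % (time.time() - start))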
Example #52
    for d in [1,2]:
        if int(run_log['hv dia%d'%d]) < 0:
            bias+='M'
        else:
            bias+='P'

    conf = ' -c %s/conf/converter%s.conf'%(pwd,bias)
    out = ' -o  /data/psi_2015_05/root/run%d.root'%run
    cmd += inp + conf + out
    print cmd

    commands.append((run,cmd))

exit
pool = Pool(nProcesses)
it = pool.imap_unordered(partial(call, shell=True), [c[1] for c in commands])
failures = []
complete = []

for i, returncode in enumerate(it):
    # print multiprocessing.active_children()
    if returncode != 0:
        print("Command '%s'  failed: %d" % (commands[i], returncode))
        failures.append(commands[i][0])
    else:
        complete.append(commands[i][0])
        print("Command '%s'  completed: %d" % (commands[i], returncode))
print 'completed:',complete

print 'Failures:',failures
Example #53
# Example function that takes a record and returns some components of the genbank accessions
def get_record(record):
    try:
        handle = Entrez.efetch(db="protein",id=record, retmode="xml")
        record = Entrez.read(handle)
        organism = record[0]["GBSeq_source"]
        taxon =  record[0]["GBSeq_taxonomy"]
    except:
        return record,'error'
    return organism, taxon


# For counting iterations
z=0
total = len(recordList)

# Pool(n) will return n separate threads.
pool = Pool(20) # at most 20 concurrent downloads

# Open a file for writing:
with open("/Users/jimbo/Desktop/example.txt", "wb") as f:
    # Call imap_unordered on the pool of processors you opened, and pass a function(x) and a list of x's
    for org,tax in pool.imap_unordered(get_record, recordList):
        # Write output line by line as results come in from pool.
        f.write(org+"\t"+tax+"\n")
        
        # Interactive output to check status
        z += 1
        if z%1000==0:
            print '{0} down, {1} to go'.format(z, total-z)
Example #54
def _maybe_convert_set(audio_dir, input_tsv):
    output_csv =  path.join(audio_dir,os.path.split(input_tsv)[-1].replace('tsv', 'csv'))
    print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)

    # Get audiofile path and transcript for each sentence in tsv
    samples = []
    with open(input_tsv) as input_tsv_file:
        reader = csv.DictReader(input_tsv_file, delimiter='\t')
        for row in reader:
            samples.append((row['path'], row['sentence']))

    # Keep track of how many samples are good vs. problematic
    counter = { 'all': 0, 'too_short': 0, 'too_long': 0 }
    lock = RLock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        """ Take a audio file, and optionally convert it to 16kHz WAV """
        mp3_filename = path.join(audio_dir, sample[0])
        if not path.splitext(mp3_filename.lower())[1] == '.mp3':
            mp3_filename += ".mp3"
        # Storing wav files next to the mp3 ones - just with a different suffix
        wav_filename = path.splitext(mp3_filename)[0] + ".wav"
        _maybe_convert_wav(mp3_filename, wav_filename)
        frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
        file_size = path.getsize(wav_filename)
        with lock:
            if int(frames/SAMPLE_RATE*1000/10/2) < len(str(sample[1])):
                # Excluding samples that are too short to fit the transcript
                counter['too_short'] += 1
            elif frames/SAMPLE_RATE > MAX_SECS:
                # Excluding very long samples to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # This one is good - keep it for the target CSV
                rows.append((wav_filename, file_size, sample[1]))
            counter['all'] += 1

    print("Importing mp3 files...")
    pool = Pool(cpu_count())
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
        bar.update(i)
    bar.update(num_samples)
    pool.close()
    pool.join()

    with open(output_csv, 'w') as output_csv_file:
        print('Writing CSV file for DeepSpeech.py as: ', output_csv)
        writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()
        bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
        for filename, file_size, transcript in bar(rows):
            writer.writerow({ 'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript })

    print('Imported %d samples.' % (counter['all'] - counter['too_short'] - counter['too_long']))
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
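
# The snippet above references several module-level names that are not shown
# (FIELDNAMES, SAMPLE_RATE, MAX_SECS, SIMPLE_BAR). A minimal sketch of
# plausible definitions; these are assumptions, not the original values:
import progressbar

FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
SAMPLE_RATE = 16000   # sample rate of the converted WAV files, in Hz
MAX_SECS = 10         # longest clip to keep, in seconds
SIMPLE_BAR = ['Progress ', progressbar.Bar(), ' ',
              progressbar.SimpleProgress(), ' ', progressbar.ETA()]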
Example #55
0
def main(dataset_dir, tests_file, only,
         host, port, user, password,
         client_opt='api', server_log=None, interval=30, force=False,
         concurrent=1, aggregate=False, verbose=0, just_print=False):

    setup_logging(verbose)

    if just_print:
        logging.warning(
            "This is a simulation,"
            " no command will actually be performed!")

    # remove trailing `/` if present, otherwise
    # `basename(dataset_dir)` returns the empty string
    if dataset_dir.endswith('/'):
        dataset_dir = dataset_dir[:-1]

    # collect datasets to be tested
    test_params = load_test_params(dataset_dir, tests_file)
    datasets_to_test = collect_datasets(dataset_dir, test_params, only)
    if not datasets_to_test:
        abort("No datasets to test!")
    logging.info("Will test datasets: %r", datasets_to_test)

    existing_experiments = list_experiment_names(host, port, user, password)

    # actual code to run the tests
    def do_test_dataset(dataset_path):
        name, params = get_experiment_data(dataset_path, test_params)
        experiment_name = params.pop('name')
        params['client'] = client_opt
        client = make_client(client_opt, host, port, user, password, experiment_name)

        # check if an experiment with this name already exists
        if experiment_name in existing_experiments:
            if force:
                with timing(
                        "deleting old experiment `{}` ..."
                        .format(experiment_name)):
                    if not just_print:
                        delete_experiment(host, port, user, password, experiment_name)
            else:
                logging.error(
                    "Experiment `%s` already exists."
                    " Remove it before re-running this test.",
                    experiment_name)
                return

        # start actual testing
        with _Testsuite(name, **params) as suite:
            run = Runner(suite, just_print, server_log)

            try:
                # create experiment
                run(client.create_experiment(
                    params['workflow_type'], params['microscope_type'],
                    params['plate_format'], params['plate_acquisition_mode']))

                for plate in params['plates']:
                    # create plate(s)
                    run(client.create_plate(plate))

                    # create and upload acquisition(s)
                    for acquisition in params['acquisitions'][plate]:
                        acquisition_dir = join(
                            dataset_path,
                            'plates', plate,
                            'acquisitions', acquisition)
                        run(client.create_acquisition(plate, acquisition))
                        run(client.upload_microscope_files(plate, acquisition, acquisition_dir))

                workflow_description_path = join(dataset_path, params.get('workflow_description_path', 'workflow_description.yaml'))
                run(client.upload_workflow_description_file(workflow_description_path))

                jterator_project_path = join(dataset_path, params.get('jterator_project_path', 'jterator'))
                run(client.upload_jterator_project_files(jterator_project_path))

                with suite.new_test_case("Running workflow") as case:
                    if not just_print:
                        run_workflow(
                            case, host, port, user, password,
                            experiment_name, server_log, interval)

            except Runner.Abort as err:
                logging.warning("%s", err)

            return suite

    # do (possibly) parallel processing
    report = Report(basename(dataset_dir))
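    # Pool(processes=None) falls back to one worker per CPU core, so passing
    # concurrent=0 effectively means "use all available cores".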
    proc = Pool(processes=(concurrent or None))
    suites = proc.imap_unordered(do_test_dataset, datasets_to_test)
    errors = False
    for suite in suites:
        if not suite:
            # `do_test_dataset` errored out
            continue
        report.add_test_suite(suite)
        if not aggregate:
            if not just_print:
                write_junit_xml(report, tests_file, dataset_dir, suite.name)
            report.print_terminal_output()
            if report.errored > 0 or report.failed > 0:
                errors = True
            report.reset()
    if aggregate:
        if not just_print:
            write_junit_xml(report, tests_file, dataset_dir)
        report.print_terminal_output()
        if report.errored > 0 or report.failed > 0:
            errors = True
        report.reset()

    if errors:
        return 2
    else:
        return 0
Example #56
0
def mergeFilesByRegion(filesByRegion, grid, outputDir):
    # Merge a set of files by region into the specified dir
    # Key is up/down/nominal etc
    N = 0
    filesToWrite = {}
    for r in filesByRegion:
        for key in filesByRegion[r]:
            if filesByRegion[r][key] == []:
                if key == "Nominal":
                    print ("WARNING: no input files for region {0} key {1}".format(r, key))
                continue

            filePrefix = "%s_%s" % (r, grid)
            filename = os.path.join(outputDir, "%s.root" % (filePrefix))
            if os.path.exists(filename):
                print ("Output file {0} exists - skipping".format(os.path.basename(filename)))
                continue

            filesToWrite[filename] = {"region": r, "files": filesByRegion[r][key]}
            N += 1

    # Got anything?
    if filesToWrite == {}:
        return

    # build the pool arguments
    args = []
    for filename in filesToWrite:
        N -= 1
        args.append((filename, filesToWrite[filename]["files"], False, filesToWrite[filename]["region"], N))

    pool = ThreadPool(8, init_worker)
    try:
        # results = pool.map(mergeFiles, args)
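        # imap_unordered hands each entry of `args` to mergeFiles as a single
        # 5-element tuple (it does not unpack it the way starmap would), so
        # mergeFiles must unpack the tuple itself; a sketch of a compatible
        # signature follows this example.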
        results = pool.imap_unordered(mergeFiles, args)
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating workers")
        pool.terminate()
        pool.join()

    return

    # Below is legacy code relying on hadd; it is unreachable (after the return above) and slated for removal

    for r in filesByRegion:
        for key in filesByRegion[r]:
            if filesByRegion[r][key] == []:
                continue

            N -= 1

            # Merge the files in chunks of 50, and then merge these chunks

            # The whole idea behind this exercise is to avoid exceeding the
            # maximum length of a command allowed in bash.

            filePrefix = "%s_%s" % (r, grid)
            filename = os.path.join(outputDir, "%s.root" % (filePrefix))
            if os.path.exists(filename):
                print ("Output file {0} exists - skipping".format(os.path.basename(filename)))
                continue

            mergeFiles(filename, filesByRegion[r][key])

            # fileMerger = ROOT.TFileMerger()
            # fileMerger.OutputFile(filename)
            # for f in filesByRegion[r][key]:
            #    fileMerger.AddFile(f)
            # fileMerger.Merge()

            # i=1
            # print("Attempting to make file {0}".format(filename))
            # for subset in chunks(filesByRegion[r][key], 50):
            #    print("Merging subset {0:d}...".format(i))
            #    filename = os.path.join(outputDir, "%s_%03d.root" % (filePrefix, i) )
            #    outputFiles.append(filename)
            #
            #    if len(subset) == 1:
            #        shutil.copy(subset[0], filename)
            #    else:
            #        cmd = "hadd -f %s %s" % (filename, " ".join(subset))
            #        subprocess.call(cmd, shell=True)
            #
            #    i+=1

            # print("Merging all subsets")
            # filename = os.path.join(outputDir, "%s.root" % (filePrefix) )

            # if len(outputFiles) == 1:
            #    # only 1 file, so just rename it
            #    os.rename(outputFiles[0], filename)
            # else:
            #    cmd = "hadd -f %s %s" % (filename, " ".join(outputFiles))
            #    subprocess.call(cmd, shell=True)

            # print("Done merging subsets; removing temporary files")
            # for f in outputFiles:
            #    if not os.path.exists(f): continue
            #    os.remove(f)

            print ("=> Created file for {0}; {1} files remaining".format(r, N))
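
# mergeFiles is defined elsewhere in the original module; a minimal sketch of
# a compatible signature (an assumption, not the original implementation) that
# unpacks the tuples built above and merges with ROOT's TFileMerger, mirroring
# the commented-out legacy code:
import ROOT

def mergeFiles(job):
    # job is the 5-element tuple appended to `args` above; treating the third
    # element (False) as an overwrite flag is an assumption.
    filename, inputFiles, force, region, remaining = job
    merger = ROOT.TFileMerger()
    merger.OutputFile(filename, force)
    for f in inputFiles:
        merger.AddFile(f)
    merger.Merge()
    print("=> Created file for {0}; {1} files remaining".format(region, remaining))
    return filename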
Example #57
0
class RenderLocaleBatch(object):
    """Handles the rendering and threading of the controllers."""

    BATCH_DEFAULT_SIZE = 300  # Default number of documents in a batch.

    def __init__(self, jinja_env, profile, tick=None, batch_size=None):
        self.batch_size = batch_size or self.BATCH_DEFAULT_SIZE
        self.jinja_env = jinja_env
        self.profile = profile
        self.tick = tick
        self.batches = [[]]
        self._is_loading = False
        self._is_rendering = False
        self._results = None
        self._thread_pool = None

    def __len__(self):
        count = 0
        for batch in self.batches:
            count = count + len(batch)
        return count

    def _get_batch(self):
        # Ensure that batch is not over the max size.
        batch = self.batches[len(self.batches) - 1]
        if len(batch) >= self.batch_size:
            self.batches.append([])
            batch = self.batches[len(self.batches) - 1]
        return batch

    def add(self, controller, *args, **kwargs):
        """Add an item to be rendered to the batch."""
        batch = self._get_batch()

        batch.append({
            'controller': controller,
            'jinja_env': self.jinja_env,
            'args': args,
            'kwargs': kwargs,
        })

    def load_start(self, source_dir):
        """Start the batches loading."""
        self._thread_pool = ThreadPool(len(self.batches))
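        # Note: in the standard multiprocessing pools the third positional
        # argument of imap_unordered is chunksize; if load_func needs
        # source_dir, binding it with functools.partial is the usual way to
        # pass it through.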
        self._results = self._thread_pool.imap_unordered(
            load_func, self.batches, source_dir)
        self._is_loading = True

    def load_finish(self):
        """Finish in progress batches loading."""
        if not self._is_loading:
            raise RenderNotStartedError('Loading was never started')

        load_errors = []
        loaded_docs = []

        for batch_result in self._results:
            load_errors = load_errors + batch_result.load_errors
            loaded_docs = loaded_docs + batch_result.loaded_docs
            if self.tick:
                for _ in batch_result.load_errors:
                    self.tick()
                for _ in batch_result.loaded_docs:
                    self.tick()
            for result in batch_result.loaded_docs:
                self.profile.add_timer(result.load_timer)

        self._thread_pool.close()
        self._thread_pool.join()
        self._is_loading = False

        return loaded_docs, load_errors

    def load_sync(self, source_dir):
        """Synchronous, non-threaded loading."""
        load_errors = []
        loaded_docs = []

        for batch in self.batches:
            batch_result = load_func(batch, source_dir, tick=self.tick)
            load_errors = load_errors + batch_result.load_errors
            loaded_docs = loaded_docs + batch_result.loaded_docs

        return loaded_docs, load_errors

    def render_start(self):
        """Start the batches rendering."""
        self._thread_pool = ThreadPool(len(self.batches))
        self._results = self._thread_pool.imap_unordered(
            render_func, self.batches)
        self._is_rendering = True

    def render_finish(self):
        """Finish in progress batches rendering."""
        if not self._is_rendering:
            raise RenderNotStartedError('Rendering was never started')

        render_errors = []
        rendered_docs = []

        for batch_result in self._results:
            render_errors = render_errors + batch_result.render_errors
            rendered_docs = rendered_docs + batch_result.rendered_docs
            if self.tick:
                for _ in batch_result.render_errors:
                    self.tick()
                for _ in batch_result.rendered_docs:
                    self.tick()
            for result in batch_result.rendered_docs:
                self.profile.add_timer(result.render_timer)

        self._thread_pool.close()
        self._thread_pool.join()
        self._is_rendering = False

        return rendered_docs, render_errors

    def render_sync(self):
        """Synchronous, non-threaded rendering."""
        render_errors = []
        rendered_docs = []

        for batch in self.batches:
            batch_result = render_func(batch, tick=self.tick)
            render_errors = render_errors + batch_result.render_errors
            rendered_docs = rendered_docs + batch_result.rendered_docs

        return rendered_docs, render_errors
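
# load_func and render_func are not part of this snippet; the class above only
# relies on their results exposing loaded_docs/load_errors (and the rendered
# equivalents). A minimal sketch of such result containers, with assumed names
# rather than the originals:
import collections

LoadBatchResult = collections.namedtuple(
    'LoadBatchResult', ['loaded_docs', 'load_errors'])
RenderBatchResult = collections.namedtuple(
    'RenderBatchResult', ['rendered_docs', 'render_errors'])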
Example #58
0
    sigscan = "std_T2tt"
    if len(sys.argv) > 2:
        sigscan = sys.argv[2]

    print "Doing limits from cards in ", carddir

    ext_cards = os.listdir(carddir)
    ext_cards = filter(lambda x : sigscan in x and "bin1.txt" in x, ext_cards)

    # Strip the fixed 9-character prefix and trailing "_bin1.txt" (9 characters)
    # to get the signal-point names; keep points whose third '_'-separated value (presumably a mass) is below 1500.
    sigs = [x[9:-9] for x in ext_cards]
    sigs = [s for s in sigs if int(s.split('_')[2]) < 1500]

    pool = ThreadPool(40)
    cards = []

    for combined in pool.imap_unordered(combine_cards, sigs):
        cards.append(combined)

    if not cards:
        cards = os.listdir(combineddir)
        cards = filter(lambda x : '.txt' in x and '.log' not in x, cards)
        cards = [combineddir+'/'+c for c in cards]
        # print cards

    os.system('mkdir -p '+limitdir)
    limits = []
    for result in pool.imap_unordered(run_asymptotic, cards):
        limits.append(result)

    print limits
Example #59
0
def _maybe_convert_sets(target_dir, extracted_data):
    extracted_dir = path.join(target_dir, extracted_data)
    # override existing CSV with normalized one
    target_csv_template = os.path.join(target_dir, ARCHIVE_DIR_NAME + '_' + ARCHIVE_NAME.replace('.zip', '_{}.csv'))
    if os.path.isfile(target_csv_template):
        return

    ogg_root_dir = os.path.join(extracted_dir, ARCHIVE_NAME.replace('.zip', ''))

    # Get audiofile path and transcript for each sentence in tsv
    samples = []
    glob_dir = os.path.join(ogg_root_dir, '**/*.ogg')
    for record in glob(glob_dir, recursive=True):
        record_file = record.replace(ogg_root_dir + os.path.sep, '')
        samples.append((record_file, os.path.splitext(os.path.basename(record_file))[0]))

    # Keep track of how many samples are good vs. problematic
    counter = {'all': 0, 'failed': 0, 'invalid_label': 0, 'too_short': 0, 'too_long': 0}
    lock = RLock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        """Take an audio file and optionally convert it to a 16 kHz WAV."""
        ogg_filename = path.join(ogg_root_dir, sample[0])
        # Storing wav files next to the ogg ones - just with a different suffix
        wav_filename = path.splitext(ogg_filename)[0] + ".wav"
        _maybe_convert_wav(ogg_filename, wav_filename)
        file_size = -1
        if path.exists(wav_filename):
            file_size = path.getsize(wav_filename)
            frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
        label = label_filter(sample[1])
        with lock:
            if file_size == -1:
                # Excluding samples that failed upon conversion
                counter['failed'] += 1
            elif label is None:
                # Excluding samples that failed on label validation
                counter['invalid_label'] += 1
            elif int(frames/SAMPLE_RATE*1000/10/2) < len(str(label)):
                # Excluding samples that are too short to fit the transcript
                counter['too_short'] += 1
            elif frames/SAMPLE_RATE > MAX_SECS:
                # Excluding very long samples to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # This one is good - keep it for the target CSV
                rows.append((wav_filename, file_size, label))
            counter['all'] += 1

    print("Importing ogg files...")
    pool = Pool(cpu_count())
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
        bar.update(i)
    bar.update(num_samples)
    pool.close()
    pool.join()

    with open(target_csv_template.format('train'), 'w') as train_csv_file:  # 80%
        with open(target_csv_template.format('dev'), 'w') as dev_csv_file:  # 10%
            with open(target_csv_template.format('test'), 'w') as test_csv_file:  # 10%
                train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)
                train_writer.writeheader()
                dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)
                dev_writer.writeheader()
                test_writer = csv.DictWriter(test_csv_file, fieldnames=FIELDNAMES)
                test_writer.writeheader()

                for i, item in enumerate(rows):
                    transcript = validate_label(item[2])
                    if not transcript:
                        continue
                    wav_filename = os.path.join(ogg_root_dir, item[0].replace('.ogg', '.wav'))
                    i_mod = i % 10
                    if i_mod == 0:
                        writer = test_writer
                    elif i_mod == 1:
                        writer = dev_writer
                    else:
                        writer = train_writer
                    writer.writerow(dict(
                        wav_filename=wav_filename,
                        wav_filesize=os.path.getsize(wav_filename),
                        transcript=transcript,
                    ))

    print('Imported %d samples.' % (counter['all'] - counter['failed'] - counter['too_short'] - counter['too_long']))
    if counter['failed'] > 0:
        print('Skipped %d samples that failed upon conversion.' % counter['failed'])
    if counter['invalid_label'] > 0:
        print('Skipped %d samples that failed on transcript validation.' % counter['invalid_label'])
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' % counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' % (counter['too_long'], MAX_SECS))
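
# label_filter is not shown above; a minimal sketch, assuming it builds on the
# validate_label helper the snippet already uses plus an allowed-character set
# (both assumptions, not the original implementation):
ALLOWED_CHARS = set("abcdefghijklmnopqrstuvwxyz '")

def label_filter(label):
    # Normalize first; reject transcripts containing characters outside the
    # allowed set by returning None, which is what the caller checks for.
    label = validate_label(label)
    if label is None:
        return None
    if any(ch not in ALLOWED_CHARS for ch in label.lower()):
        return None
    return label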