Example #1
File: gen.py Project: dahlia/blog
 def __init__(self,
              pool: multiprocessing.Pool,
              post_files: typing.Iterable[pathlib.Path],
              base_url: str=None,
              feed_url: str=None,
              outdate_epoch: typing.Optional[datetime.datetime]=None):
     self.logger = logging.getLogger('Blog')
     self.pool = pool
     self.base_url = base_url
     self._feed_url = feed_url
     self.logger.info('Loading posts...')
     self.posts = list(map(Post, post_files))
     # Loading published dates
     list(pool.imap_unordered(self._get_published_at, self.posts))
     # Loading titles
     list(pool.imap_unordered(operator.attrgetter('title'), self.posts))
     self.posts.sort(key=self._get_published_at)
     self.canon_posts = [p for p in self.posts if p.canon]
     self.logger.info('Total %d posts are loaded.', len(self.posts))
     self.current_base_path = './'
     self.outdate_epoch = outdate_epoch
     self.jinja2_env = Environment(loader=FileSystemLoader('templates'),
                                   extensions=['jinja2.ext.with_'],
                                   autoescape=True)
     self.jinja2_env.globals.update(
         blog=self,
         href_for=self.resolve_relative_url,
         outdate_epoch=self.outdate_epoch
     )
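Example #1 wraps imap_unordered in list() only to force the per-post work to run in the pool; the yielded values themselves are discarded. For reference, a minimal self-contained sketch of the usual consume-as-completed pattern (the square worker is hypothetical, not part of the project above):

from multiprocessing import Pool

def square(n):
    # hypothetical CPU-bound task
    return n, n * n

if __name__ == "__main__":
    with Pool(processes=4) as pool:
        # results arrive in completion order, not submission order
        for n, sq in pool.imap_unordered(square, range(10)):
            print(n, sq)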
Example #2
    def update(self, date):
        """Update index components (and weight) for the **same** day before market open."""
        CMD = sql.CMD1.format(date=date)
        self.logger.debug('Executing command:\n{}', CMD)
        self.cursor.execute(CMD)
        df1 = pd.DataFrame(list(self.cursor))
        if len(df1) == 0:
            self.logger.warning('No records found for {} on {}', self.db.index_components.name, date)
            return

        df1.columns = ['dname', 'market', 'sid']
        df1.dname = ['SH'+dname if mkt == 83 else 'SZ'+dname for mkt, dname in zip(df1.market, df1.dname)]
        df1.index = df1.sid

        CMD = sql.CMD2.format(date=date)
        self.logger.debug('Executing command:\n{}', CMD)
        self.cursor.execute(CMD)
        try:
            df2 = pd.DataFrame(list(self.cursor))
            df2.columns = ['dname', 'market', 'sid', 'weight']
            df2.dname = ['SH'+dname if mkt == 83 else 'SZ'+dname for mkt, dname in zip(df2.market, df2.dname)]
            df2.index = df2.sid
        except Exception:
            df2 = None

        grouped = df1.groupby('dname')
        pool = Pool(self.threads)
        pool.imap_unordered(worker, [(date, dname, _df1, df2) for dname, _df1 in grouped], self.threads)
        pool.close()
        pool.join()

        self.logger.info('UPSERT documents for {} indice into (c: [{}]) of (d: [{}]) on {}', len(grouped), COLLECTION.name, self.db.name, date)
Example #3
def make_epoch(n, train_true, train_false, val_true, val_false):
    n = n[0]
    train_false = list(train_false)
    val_false = list(val_false)
    np.random.shuffle(train_false)
    np.random.shuffle(val_false)

    n_train_true = len(train_true)
    n_val_true = len(val_true)

    train_epoch = train_true + train_false[:n_train_true*2] #*2 to account for 1 flip directions
    val_epoch = val_true + val_false[:n_val_true*2]

    train_epoch = combine_tups(train_epoch)
    val_epoch = combine_tups(val_epoch)

    print "Epoch {0} n files {1}&{2}".format(n, len(train_epoch), len(val_epoch))
    pool = Pool(processes=12)
    train_epoch_data = list(itertools.chain.from_iterable(pool.imap_unordered(load_data, train_epoch)))
    print "Epoch {0} done loading train".format(n)

    val_epoch_data = list(itertools.chain.from_iterable(pool.imap_unordered(load_data, val_epoch)))
    print "Epoch {0} done loading validation".format(n)
    pool.close()

    np.random.shuffle(train_epoch_data)
    return train_epoch_data, val_epoch_data
def main():
    args = docopt(__doc__)
    feature_name = args['<feature_name>']
    assert feature_name == 'words'
    assert args['<experimentset_name>'] in EXPERIMENT_SETS, '<experimentset_name> must be one of %s' % str(EXPERIMENT_SETS.keys())
    c = get_config()
    experiment_set = EXPERIMENT_SETS[args['<experimentset_name>']](feature_name=feature_name)

    print "Computing foreground group sums using %d cores..." % c.num_cores
    pool = Pool(c.num_cores, init_worker)
    fg_groups = experiment_set.list_foreground_groups()
    cache = {}
    try:
        for group_name, sum_vector in progress.bar(pool.imap_unordered(ComputeForegroundGroupSumCallable(experiment_set), fg_groups), label="Progress ", expected_size=len(fg_groups)):
            cache[group_name] = sum_vector
    except KeyboardInterrupt:
        print "Terminating pool.."
        pool.terminate()
        pool.join()

    print "Computing background sums..."
    bg_groups = experiment_set.list_background_groups()
    for g in bg_groups:
        sum_vector = experiment_set.compute_background_group_sum(g, cache)
        cache[g] = sum_vector

    print "Saving sums to ZODB..."
    zodb_root = open_zodb(read_only=False)
    if getattr(zodb_root, 'group_sums', None) is None:
        zodb_root.group_sums = BTrees.OOBTree.OOBTree()
        transaction.commit()
    if feature_name not in zodb_root.group_sums:
        zodb_root.group_sums[feature_name] = BTrees.OOBTree.OOBTree()
        transaction.commit()
    for k, v in cache.iteritems():
        zodb_root.group_sums[feature_name][k] = v
    transaction.commit()


    print "Creating output db tables..."
    create_db(c.resultsdb_url)
    session_out = open_db(c.resultsdb_url)

    print "Computing overrepresentation using %d cores..." % c.num_cores
    exps = experiment_set.list_experiments()
    cls = experiment_set.result_table_class()
    try:
        for fg, bg, results in progress.bar(pool.imap_unordered(ComputeOverrepresentedWordsCallable(experiment_set), exps), label="Progress ", expected_size=len(exps)):
            for w, odds, pval in results:
                c = cls(foreground_group_name=fg, background_group_name=bg, word=w, odds=odds, pval=pval)
                session_out.add(c)
    except KeyboardInterrupt:
        print "Terminating pool.."
        pool.terminate()
        pool.join()

    print "Committing..."
    session_out.commit()
    print "Done"
def main() :
    print "Title here"
    multiprocessing.freeze_support()
    PROCESSES = 4
    print '\r\n\tCreating pool with:\t%d processes' % PROCESSES
    pool = Pool(PROCESSES)
    print '\tNo. of cpu\'s present:\t%d cores' % cpu_count()
    procList = ["process1", "process2", "process3", "process4", "process5", "process6" ]
    for Name in procList :
        pool.imap_unordered(multi_run_wrapper,[(Name,variable1,variable2,variable3, variable4,variable5,variable6)])
    pool.close();    pool.join()
    def handle(self, *args, **options):
        user_count = options['count'] + 1
        users = range(1, user_count)
        versions = list(Version.objects.select_related('app', 'platform').filter_by_enabled())

        job_size = int(user_count / (cpu_count() or 1 * 2)) or 1
        job_data = [users[i:i + job_size] for i in range(0, len(users), job_size)]

        pool = Pool()
        pool.imap_unordered(partial(run_worker, versions=versions), job_data)
        pool.close()
        pool.join()
Example #7
 def _go(self, num_procs, chunk_size = None):
     '''
     This is the equivalent of the main method. It will 
     create the processes and the pipeline between item generators -> 
     mappers -> a reducer.
     '''
     pool = None
     try:
         print('Initiating...', file=sys.stderr)
 
         igen = self.item_generator()
         reducer = self.reducer()
         mapper = self.mapper()
         
         if (num_procs > 1):
             print('Using %d processes' %num_procs, file=sys.stderr)
             
             def igen_helper():
                 '''
                 Helper generator to pass the mapper object
                 to each slave process
                 '''
                 for key, item in igen:
                     yield (mapper, key, item)
             
             pool = Pool(num_procs)
             results = None
             if not chunk_size:
                 results = pool.imap_unordered(_processor_helper, 
                                               igen_helper())
             else:
                 results = pool.imap_unordered(_processor_helper, 
                                               igen_helper(), chunk_size)
             
             for key, value in results:
                 reducer(key, value)
 
         else:
             print('Using one mapper only', file = sys.stderr)
                 
             for key, item in igen:
                 value = mapper(key, item)
                 reducer(key, value)
                 
         self.finalize()
         print('Done.', file = sys.stderr)
     finally:
         if pool:
             pool.close()
             pool.join()
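The _go method above bundles the mapper together with each item so that a module-level helper can run it inside the worker processes. A minimal sketch of the same map-then-reduce shape, with hypothetical names standing in for item_generator, mapper and reducer:

from multiprocessing import Pool

def _apply(job):
    # module-level so it can be pickled and sent to workers
    mapper, key, item = job
    return key, mapper(key, item)

def word_length(key, item):
    # hypothetical mapper
    return len(item)

if __name__ == "__main__":
    items = [("a", "spam"), ("b", "eggs"), ("c", "ham")]
    totals = {}
    with Pool(2) as pool:
        jobs = ((word_length, key, item) for key, item in items)
        for key, value in pool.imap_unordered(_apply, jobs):
            totals[key] = value  # trivial reducer
    print(totals)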
Example #8
def main():
    idir, ofile, dffile = _parse_cmdline()

    print u'Loading doc-freqs file {}...'.format(dffile)
    with open(dffile, 'rb') as f:
        df = pickle.load(f)    

    print u'Reading input directory: {}'.format(idir)
    jobs = _load_jobs(idir, df)

    # Do the work.
    pool = Pool(4)
    njobs = len(jobs)

    try:
        import sys
        with codecs.open(ofile, 'wb') as pf:
            pickle.dump(njobs, pf)
            results = pool.imap_unordered(worker, jobs)
            for i, result in enumerate(results, 1):
                pickle.dump(result, pf)
                per = 100 * (float(i) / njobs)
                sys.stdout.write(u'\rPercent Complete: {:2.3f}%'.format(per))
                sys.stdout.flush()
            sys.stdout.write(u'\rPercent Complete: 100%    \n')
            sys.stdout.flush()

    except KeyboardInterrupt:
        sys.stdout.write(u'\rPercent Complete: {:2.3f}%    \n'.format(per))
        sys.stdout.write(u'Shutting down.\n')
        sys.stdout.flush()
        sys.exit()

    print u'Complete!'
Example #9
def store_contents(data_path, save_path, preprocess, num_workers=None):
    """Preprocess and store a corpus of documents in sqlite.

    Args:
        data_path: Root path to directory (or directory of directories) of files
          containing json encoded documents (must have `id` and `text` fields).
        save_path: Path to output sqlite db.
        preprocess: Path to file defining a custom `preprocess` function. Takes
          in and outputs a structured doc.
        num_workers: Number of parallel processes to use when reading docs.
    """
    if os.path.isfile(save_path):
        raise RuntimeError('%s already exists! Not overwriting.' % save_path)

    logger.info('Reading into database...')
    conn = sqlite3.connect(save_path)
    c = conn.cursor()
    c.execute("CREATE TABLE documents (id PRIMARY KEY, text);")

    workers = ProcessPool(num_workers, initializer=init, initargs=(preprocess,))
    files = [f for f in iter_files(data_path)]
    count = 0
    with tqdm(total=len(files)) as pbar:
        for pairs in tqdm(workers.imap_unordered(get_contents, files)):
            count += len(pairs)
            c.executemany("INSERT INTO documents VALUES (?,?)", pairs)
            pbar.update()
    logger.info('Read %d docs.' % count)
    logger.info('Committing...')
    conn.commit()
    conn.close()
Example #10
    def compress_file(self, corpus, np=4, separator=None):
        """
        Construct WLZW patterns out of a corpus; parallelism is an option.
        @param corpus - string, file path of the corpus
        @param np - number of processes; if np = 1 the algorithm runs serially
        @param separator - the separator string between doc id and document; pass None if no doc id is given
        @return set, the final set containing all frequent patterns
        """

        # if only one process, no need for parallelization
        if np == 1:
            return set(_compress_file((corpus, 0, np, separator)))

        p = Pool(processes=np)
        l = []
        for i in range(0, np):
            l.append((corpus, i, np, separator))
        result = p.imap_unordered(_compress_file, l, 1)

        if np == 1:
            final_set = result.next()
        else:
            final_set = _union(result)

        return final_set
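compress_file splits a single corpus among np workers and unions their partial pattern sets. A hedged sketch of that split-work-then-union shape, with a hypothetical worker standing in for _compress_file:

from multiprocessing import Pool

def patterns_in_slice(args):
    # hypothetical stand-in for _compress_file: each worker scans one slice
    data, start, step = args
    return {x % 7 for x in data[start::step]}

if __name__ == "__main__":
    data = list(range(100))
    np_workers = 4
    with Pool(np_workers) as pool:
        parts = pool.imap_unordered(patterns_in_slice,
                                    [(data, i, np_workers) for i in range(np_workers)], 1)
        final_set = set().union(*parts)
    print(sorted(final_set))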
Example #11
 def imap_unordered(self, func, iterable, chunksize=1):
     '''Same as SGEPool.imap, except that the results are unordered.
     Rather than blocking to ensure the correct order, all jobs are polled and
     results are returned as soon as they are done.
     '''
     if not self.use_grid_engine:
         workerPool = Pool(initializer=self.initializer, initargs=self.initargs)
         for val in workerPool.imap_unordered(func, iterable, chunksize):
             yield val
         return  # the local Pool handled everything; do not fall through to the grid-engine path
     iterable = iter(iterable)
     allJobs = self._submit_jobs(func, iterable, 'map', chunksize)
     interval = 3
     while len(allJobs) > 0:
         doneJobs = []
         for job in allJobs:
             if job.isFinished():
                 doneJobs.append(job)
                 for data in self._getData(job.outputFile):
                     yield data
                 os.remove(job.inputFile)      # BUG: these files aren't removed if there is an exception raised
                 os.remove(job.outputFile)
         for job in doneJobs:
             allJobs.remove(job)
         if len(doneJobs) == 0:
             # no jobs are done yet-- wait for a while for them to finish
             time.sleep(interval)
             interval = min( 2 * interval, .001 )
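The docstring above describes polling the jobs and yielding each result as soon as it is done rather than preserving submission order. For comparison only (this is not the SGEPool implementation), the standard library's concurrent.futures offers the same drain-as-completed behaviour through as_completed:

from concurrent.futures import ProcessPoolExecutor, as_completed

def work(n):
    # hypothetical task
    return n * n

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=4) as ex:
        futures = [ex.submit(work, n) for n in range(8)]
        for fut in as_completed(futures):  # completion order, like imap_unordered
            print(fut.result())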
Example #12
def main():
    parser = argparse.ArgumentParser(description='Analyze a bandersnatch mirror.')
    parser.add_argument('--json',
                       help='save raw data to a json file',
                       default=None)
    args = parser.parse_args()
    concurrency = 8
    root = "/var/spool/pypi/web/packages/source/"
    p = Pool()
    results = {}
    try:
        try:
            for path, result in \
                p.imap_unordered(analyse_sdist, yield_packages(root)):
                results[path] = result
            p.close()
        except:
            p.terminate()
            raise
    finally:
        p.join()
    if args.json:
        with open(args.json, 'wb') as f:
            f.write(json.dumps(results))
    pprint.pprint(results)
Example #13
def run_committee(graph, eweights, signs, tree_kind='rst', train_vertices=.1,
                  size=13, degree_function=None, threshold_function=None):
    global GRAPH, EWEIGHTS, SIGNS, VTRAIN
    GRAPH, EWEIGHTS, SIGNS = graph, eweights, signs
    if isinstance(train_vertices, float):
        num_revealed = int(train_vertices*len(graph))
        train_vertices = random.sample(list(graph.keys()), num_revealed)
    VTRAIN = train_vertices
    tree_kind = tree_kind.lower()
    assert tree_kind in ['rst', 'bfs', 'stg'], tree_kind
    if tree_kind == 'rst':
        args = size*[(get_rst, {'fake': None}), ]
    if tree_kind == 'bfs':
        degrees = sorted(((node, len(adj)) for node, adj in graph.items()),
                         key=lambda x: x[1])
        args = [(get_bfs, {'root': _[0]}) for _ in degrees[-size:]]
    if tree_kind == 'stg':
        func_dict = {'degree_function': degree_function,
                     'threshold_function': threshold_function}
        args = size*[(get_stg, func_dict), ]
    num_threads = min(6, size)
    pool = Pool(num_threads)
    res = list(pool.imap_unordered(predict, args,
                                   chunksize=size//num_threads))
    preds, gold = [_[1] for _ in res], res[0][0]
    return gold, majority_vote(preds)
Example #14
def create_database(f, db):
    global t0
    t0 = walltime()

    P = Pool(NUMPROCESSES)
    it = P.imap_unordered(process_data, raw_data(f), chunksize=100)

    con = connect(db)
    con.execute('''
        CREATE TABLE polygons(
            rowid INTEGER PRIMARY KEY,
            vertices TEXT,
            num_vertices INTEGER,
            volume REAL,
            num_points INTEGER,
            num_interior INTEGER,
            num_border INTEGER,
            width INTEGER,
            length INTEGER,
            symm INTEGER)
    ''')

    con.executemany('INSERT INTO polygons VALUES (?,?,?,?,?,?,?,?,?,?)', it)
    con.commit()
    con.close()
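create_database hands the lazy imap_unordered iterator straight to executemany, so rows are written as workers finish instead of being collected in memory first. A small self-contained sketch of that streaming pattern, using an in-memory sqlite database and a hypothetical row-building worker:

import sqlite3
from multiprocessing import Pool

def make_row(n):
    # hypothetical per-item worker
    return (n, n * n)

if __name__ == "__main__":
    con = sqlite3.connect(":memory:")
    con.execute("CREATE TABLE squares(n INTEGER, sq INTEGER)")
    with Pool(4) as pool:
        rows = pool.imap_unordered(make_row, range(1000), chunksize=50)
        con.executemany("INSERT INTO squares VALUES (?, ?)", rows)
    con.commit()
    print(con.execute("SELECT COUNT(*) FROM squares").fetchone())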
Example #15
def all_links(root='http://ailev.livejournal.com/', nb=10, path='post-list.txt'):
    print('Fetch calendar entries')
    days = list_days(root_url=root)
    print('There are {} days with entries'.format(len(days)))
    t0 = time()

    pool = Pool(processes=nb)
    it = pool.imap_unordered(list_posts, days)
    work = list(tqdm(it, total=len(days)))
    pool.close()
    pool.join()

    links = []
    for x in work:
        if x:
            links.extend(x)
    # there may be duplicates, don't know why, so this is a quick workaround
    links = list(set(links))
    links.sort()

    with open(path, 'w') as fout:
        fout.writelines(x + '\n' for x in links)

    t1 = time()
    print('Done for {}s'.format(t1 - t0))
    return links
Example #16
def normalize_all_words(l):
  r = dict()
  processes = Pool(max(1, cpu_count()-1))
  for s in l.values():
    for w, wn in processes.imap_unordered(normalize_word, s):
      r[w] = wn
  return r
Example #17
def build_level_database(input_path):
    print "Generating or regenerating level database."
    print "This may take a long time if a lot of files need to be scanned."
    paths = [p.strip() for p in find(input_path)]

    levels = []
    pool = Pool(processes=4)

    ratio = 100.0 / len(paths)
    processed = 0
    last_percent = 0
    for data in pool.imap_unordered(process_level, paths):
        processed += 1
        percent = int(processed * ratio)
        if percent > last_percent:
            last_percent = percent
            print "... {}%".format(percent)
        
        if not data:
            continue
        levels.append(data)

    db = {
        "levels" : levels,
        "version" : DATABASE_VERSION,
    }

    with open("level_db.pickle", "w") as outfile:
        pickle.dump(db, outfile)
    def _read_multi(self, graphs, n_jobs, batch_size):
        """
        like read_single but with multiple processes
        """

        if n_jobs > 1:
            pool = Pool(processes=n_jobs)
        else:
            pool = Pool()

        # extract_c_and_i = lambda batch,args: [ extract_cores_and_interfaces(  [y]+args ) for y in batch ]

        results = pool.imap_unordered(extract_cips,
                                      self._multi_process_argbuilder(graphs, batch_size=batch_size))

        # the resulting chips can now be put intro the grammar
        jobs_done = 0
        for batch in results:
            for exci in batch:
                if exci:  # exci might be None because the grouper fills up with empty problems
                    for exci_result_per_node in exci:
                        for cip in exci_result_per_node:
                            self._add_core_interface_data(cip)
                jobs_done += 1
                if jobs_done == self.multiprocess_jobcount and self.mp_prepared:
                    pool.terminate()
        pool.close()
        pool.join()
Example #19
def parallel_iter(processes, f, inputs):
    """
    Return a parallel iterator.

    INPUT:

    - ``processes`` -- integer
    - ``f`` -- function
    - ``inputs`` -- an iterable of pairs (args, kwds)

    OUTPUT:

    - iterator over values of ``f`` at ``args,kwds`` in some random order.

    EXAMPLES::

        sage: def f(x): return x+x
        sage: import sage.parallel.multiprocessing_sage
        sage: v = list(sage.parallel.multiprocessing_sage.parallel_iter(2, f, [((2,), {}), ((3,),{})]))
        sage: v.sort(); v
        [(((2,), {}), 4), (((3,), {}), 6)]
    """
    from twisted.internet import reactor   # do not delete this (!)  -- see trac 8785

    if processes == 0: processes = ncpus.ncpus()
    p = Pool(processes)
    fp = pickle_function(f)

    result = p.imap_unordered(call_pickled_function, [ (fp, t) for t in inputs ])
    for res in result:
        yield res
    p.close()
    p.join()
Example #20
def main():
    p = Pool(200)
    mgr= Manager()
    with open('ip1024', 'r') as f:
        proxyDictList = ['proxy_user:wYFzbwTfpR@'+line.strip()+':17102' for line in f.readlines()]
    http = httpUtil(iplist=mgr.list(proxyDictList),key1='Price + Shipping')

    partial_merge = functools.partial(getData,http=http,mutex=mgr.Lock())

    with open(sys.argv[2], 'w') as file_out:
        for output, line in p.imap_unordered(partial_merge, open(sys.argv[1], "r+")):
            if(output==-2):
                file_out.write("%s\t%s\t%s\t%s\t%s\n" % (line.rstrip('\r\n'), 'missed2', 'N','N','-4'))
                continue
            if(output==-1):
                file_out.write("%s\t%s\t%s\t%s\t%s\n" % (line.rstrip('\r\n'), 'missed1', 'N','N','-4'))
                continue
            try:
                s=str("%s\t%s\t%s\t%s\t%s \n"  % \
                      (line.rstrip('\r\n'), output['price1'], output['price2'], output['shipping'], output['output']))
                file_out.write(s)
            except Exception:
                file_out.write(line)

    p.close()
    p.join()
Example #21
def run(asmb_fn, options):
    if multiproc_exception is None and options.cpus > 1:
        work_units = []
    asmb_input=IMP.multifit.read_settings(asmb_fn)
    asmb_input.set_was_used(True)
    em_map=asmb_input.get_assembly_header().get_dens_fn()
    resolution=asmb_input.get_assembly_header().get_resolution()
    spacing=asmb_input.get_assembly_header().get_spacing()
    origin=asmb_input.get_assembly_header().get_origin()
    for i in range(asmb_input.get_number_of_component_headers()):
        fits_fn=asmb_input.get_component_header(i).get_transformations_fn()
        pdb_fn=asmb_input.get_component_header(i).get_filename()
        f = Fitter(em_map, spacing, resolution, origin, asmb_input.get_assembly_header().get_threshold(),pdb_fn, fits_fn, options.angle,options.num,options.angle_voxel)
        if multiproc_exception is None and options.cpus > 1:
            work_units.append(f)
        else:
            if options.cpus > 1:
                options.cpus = 1
                print >> sys.stderr, """
The Python 'multiprocessing' module (available in Python 2.6 and later) is
needed to run on multiple CPUs, and could not be found
(Python error: '%s').
Running on a single processor.""" % multiproc_exception
            f.run()
    if multiproc_exception is None and options.cpus > 1:
        # No point in spawning more processes than components
        nproc = min(options.cpus, asmb_input.get_number_of_component_headers())
        p = Pool(processes=nproc)
        out = list(p.imap_unordered(do_work, work_units))
def main():
  total = len(sys.argv)

  if total < 3:
    print "Utilization: python apply_distance.py <shape_file> <shape_polygon_field> <input_csv_file> <output_csv_file>"
    exit(0)

  pool = Pool(processes=cpu_count())


  idata = read_input(str(sys.argv[1]),str(sys.argv[2]),str(sys.argv[3]))

  num_tasks = len(idata)  

  #imap
  responses = pool.imap_unordered(process_distance, idata)

  while (True):
    completed = responses._index
    if (completed == num_tasks): break
    percent = (float(completed)/float(num_tasks))*100
    print "%.3f" % percent," % complete. ", "Waiting for", num_tasks-completed, "tasks to complete..."
    time.sleep(2)


  pool.close()

  responses = [x for x in responses if x is not None]

  idata = write_to_csv(str(sys.argv[4]),responses)
def process_sessions_real(coordinators,
                          updates_directory,
                          index_filename,
                          pickle_root,
                          result_pickle_root,
                          num_workers=None):
    if num_workers != 0:
        pool = Pool(processes=num_workers)

    session_context_manager = SessionContextManager()
    session_context_manager.declare_persistent_state(
            'filenames_processed', set, None)
    session_context_manager.declare_persistent_state(
            'last_sequence_number_processed', return_negative_one, None)
    for coordinator in coordinators:
        for name, (init_func, merge_func) \
                in coordinator.persistent_state.iteritems():
            session_context_manager.declare_persistent_state(
                    name, init_func, merge_func)
        for name, (init_func, merge_func) \
                in coordinator.ephemeral_state.iteritems():
            session_context_manager.declare_ephemeral_state(
                    name, init_func, merge_func)

    print 'Preparing processors'
    process_args = []
    index = UpdatesIndex(index_filename)
    for session in index.sessions:
        processors = []
        for coordinator in coordinators:
            processors.append(coordinator.create_processor(session))
        update_files = index.session_data(session)
        process_args.append((session,
                             session_context_manager,
                             pickle_root,
                             result_pickle_root,
                             processors,
                             update_files,
                             updates_directory))

    print 'Processing sessions'
    global_context = GlobalContext()
    if num_workers == 0:
        for args in process_args:
            pickle_path = process_session_wrapper(args)
            session_context = session_context_manager.load_context(pickle_path)
            session_context_manager.merge_contexts(session_context, global_context)
            del session_context
    else:
        results = pool.imap_unordered(process_session_wrapper, process_args)
        for pickle_path in results:
            session_context = session_context_manager.load_context(pickle_path)
            session_context_manager.merge_contexts(session_context, global_context)
            del session_context
        pool.close()
        pool.join()

    print 'Post-processing'
    for coordinator in coordinators:
        coordinator.finished_processing(global_context)
Example #24
def get_kmer_counts(input, output, k, ns, nprocs, verbose):
    """Analyse kmers. Multiprocessing enabled"""
    #define base2digit dict for 4-char seq
    base2digit = {"A": "0", "C": "1", "G": "2", "T": "3"}    
    if ns:
        #change to 5-char seq if Ns in seq
        base2digit = {"A": "0", "C": "1", "G": "2", "N": "3", "T": "4"}
    #init mer counts
    #255 for uint8 #65,535 for uint16 or #4,294,967,295 for uint32 
    merCounts = np.zeros(len(base2digit)**k/2, dtype='uint16')
    #start pool #maxtasksperchild=1000)
    p = Pool(nprocs, initializer=init_args, initargs=(k, ns, base2digit)) 
    #process reads
    for i, ids in enumerate(p.imap_unordered(seq2mers, SeqIO.parse(input, 'fastq'), \
                                             chunksize=100), 1):
        if not i%1e4:
            sys.stderr.write(" %s [%s Mb]\r"%(i, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024))
        for mid in ids:
            merCounts[mid] += 1
    sys.stderr.write(" %s [%s Mb]\n"%(i, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024))
    #get mer freq
    maxCount    = merCounts.max()
    if maxCount < 100:
        maxCount = 100
    occurencies = [0]*maxCount
    for c in merCounts:
        occurencies[c-1] += 1
    #write to file
    output.write("\n".join("%s\t%s"%xy for xy in enumerate(occurencies,1))+"\n")
    return occurencies
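Like Examples #9 and #33, this snippet passes per-worker constants through Pool(initializer=..., initargs=...) instead of shipping them with every task. A minimal sketch of that pattern with hypothetical names:

from multiprocessing import Pool

_table = None  # populated once per worker process by the initializer

def init_worker(table):
    global _table
    _table = table

def encode(word):
    # hypothetical task that uses the per-worker table
    return word, sum(_table.get(ch, 0) for ch in word)

if __name__ == "__main__":
    table = {"A": 0, "C": 1, "G": 2, "T": 3}
    with Pool(2, initializer=init_worker, initargs=(table,)) as pool:
        for word, code in pool.imap_unordered(encode, ["GATTACA", "ACGT"]):
            print(word, code)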
def initJobsDirs(jobs_dir, include_wt, displacement, debug):
    """ initializes the jobDirs variable """
    pdbToProtein = { pdbPath.split('/')[-1]: protein
                     for protein, pdbPath in pdbs.iteritems() }

    roots = list()
    filelists = list()
    for root, dirs, files in os.walk(jobs_dir, followlinks=True):
        roots.append(root)
        filelists.append(files)
        # checkJobsDir(pdbToProtein, root, files, include_wt, displacement, debug)

    doers = Pool(cpu_count())
    jobs = zip([pdbToProtein]*len(roots),
               roots,
               filelists,
               [include_wt]*len(roots),
               [displacement]*len(roots),
               [debug]*len(roots))

    print("Loading {} evaluators...{}".format(len(roots),
                                              datetime.datetime.now()))
    sys.stdout.flush()
    with click.progressbar(doers.imap_unordered(checkJobsDir, jobs),
                           length=len(roots), label='Running',
                           file=sys.stderr) as progbar:
        for j in progbar:
            pass
    print("Done...{}".format(datetime.datetime.now()))
Example #26
def main(argv):
    """Go Main Go."""
    scenario = int(argv[1])
    lengths = load_lengths(scenario)
    dates = determine_dates(sys.argv)
    huc12s = find_huc12s(scenario)
    precip = load_precip(dates)
    jobs = []
    for huc12 in huc12s:
        jobs.append([scenario, huc12, lengths[huc12], dates, precip[huc12]])

    # Begin the processing work now!
    # NB: Usage of a ThreadPool here ended in tears (so slow)
    pool = Pool()
    totalinserts = 0
    totalskipped = 0
    totaldeleted = 0
    for huc12, inserts, skipped, deleted in tqdm(
            pool.imap_unordered(do_huc12, jobs), total=len(jobs),
            disable=(not sys.stdout.isatty())):
        if inserts is None:
            print("ERROR: huc12 %s returned 0 data" % (huc12,))
            continue
        totalinserts += inserts
        totalskipped += skipped
        totaldeleted += deleted
    print("env2database.py inserts: %s skips: %s deleted: %s" % (totalinserts,
                                                                 totalskipped,
                                                                 totaldeleted))
    update_metadata(scenario, dates)
Example #27
def subconfigure(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('--list', type=str,
        help='File containing a list of subconfigures to run')
    parser.add_argument('--skip', type=str,
        help='File containing a list of Subconfigures to skip')
    parser.add_argument('subconfigures', type=str, nargs='*',
        help='Subconfigures to run if no list file is given')
    args, others = parser.parse_known_args(args)
    subconfigures = args.subconfigures
    if args.list:
        subconfigures.extend(open(args.list, 'rb').read().splitlines())
    if args.skip:
        skips = set(open(args.skip, 'rb').read().splitlines())
        subconfigures = [s for s in subconfigures if s not in skips]

    if not subconfigures:
        return 0

    ret = 0
    # One would think using a ThreadPool would be faster, considering
    # everything happens in subprocesses anyways, but no, it's actually
    # slower on Windows. (20s difference overall!)
    pool = Pool(min(len(subconfigures), cpu_count()))
    for relobjdir, returncode, output in \
            pool.imap_unordered(run, subconfigures):
        print prefix_lines(output, relobjdir)
        sys.stdout.flush()
        ret = max(returncode, ret)
        if ret:
            break
    pool.close()
    pool.join()
    return ret
Example #28
    def func(n_cores = 1, **kwargs):
        """
        Function for command line action

        **Arguments:**
            :*n_cores*: Number of cores to use
        """
        from multiprocessing import Pool

        block_generator   = block_generator_class(**kwargs)
        block_accumulator = block_accumulator_class(
          preexisting_slice = block_generator.preexisting_slice,
          incoming_slice    = block_generator.incoming_slice,
          outputs           = block_generator.outputs,
          **kwargs)

        if n_cores == 1:                # Serial
            for block in block_generator:
                block()
                block_accumulator.send(block)
        else:                           # Parallel (processes)
            pool = Pool(n_cores)
            for block in pool.imap_unordered(pool_director, block_generator):
                pass
                block_accumulator.send(block)
            pool.close()
            pool.join()

        block_accumulator.close()

        block_acceptor = Block_Acceptor(outputs = block_accumulator.outputs,
                           **kwargs)
        block_acceptor.send(block_accumulator)
        block_acceptor.close()
Example #29
 def __iter__(self):
     '''Return OcgCollection objects from the cache or directly from
     source data.
     
     yields
     
     OcgCollection'''
     
     ## simple iterator for serial operations
     if self.serial:
         it = itertools.imap(get_collection,self._iter_proc_args_())
     ## use a multiprocessing pool returning unordered geometries
     ## for the parallel case
     else:
         pool = Pool(processes=self.nprocs)
         it = pool.imap_unordered(get_collection,
                                  self._iter_proc_args_())
     ## the iterator return from the Pool requires calling its 'next'
     ## method and catching the StopIteration exception
     while True:
         try:
             yld = it.next()
             yield(yld)
         except StopIteration:
             break
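The explicit while True / next() / StopIteration loop above is the older Python 2 idiom; the object returned by imap_unordered is an ordinary iterator, so a plain for loop does the same job, as in this sketch (hypothetical stand-in for get_collection):

from multiprocessing import Pool

def get_collection(arg):
    # hypothetical stand-in for the real collection builder
    return arg * 2

if __name__ == "__main__":
    with Pool(processes=2) as pool:
        for coll in pool.imap_unordered(get_collection, range(5)):
            print(coll)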
    def cluster_estimator_similarity(_c_trace):

        from multiprocessing import Pool

        pool = Pool(processes=6)

        # pack = number_of_samples
        # dim = _c_trace.shape
        # incr = int(dim[0]/pack)

        # number_of_samples =

        bins = range(0, number_of_samples, pairwise_number_in_pack)

        if bins[-1] <> number_of_samples:
            bins.append(number_of_samples)

        a = [_c_trace[bins[i] : bins[i + 1]] for i in range(len(bins[:-1]))]

        import pararell_methods

        start = time.time()

        total_matrix = sum(pool.imap_unordered(pararell_methods.pararell_calc_ne, a))
        pool.close()
        pool.join()
        end = time.time()
        print end - start

        return total_matrix
def calculateCorrelationforOntology(aspect, matrix_type):
    print("\n\nSemantic similarity correlation calculation for aspect:" +
          aspect + " using matrix:" + matrix_type + " ...\n")
    #Clear lists before each aspect
    similarity_list[:] = []
    proteinListNew[:] = []

    similarityMatrixNameDict = {}
    similarityMatrixNameDict[
        "All"] = "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix.csv"
    similarityMatrixNameDict[
        "500"] = "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"
    similarityMatrixNameDict[
        "Sparse"] = "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"
    similarityMatrixNameDict[
        "200"] = "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv"

    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(
        human_proteinSimilarityMatrix.columns, inplace=True)
    proteinList = human_proteinSimilarityMatrix.columns

    #proteinListNew is referenced using a Manager
    for prot in proteinList:
        proteinListNew.append(prot)
    if matrix_type == "Sparse":
        #sparsified_similarities = np.load("SparsifiedSimilarites_for_highest_500.npy")
        sparsified_similarity_coordinates = np.load(
            "../data/auxilary_input/SparsifiedSimilarityCoordinates_" +
            aspect + "_for_highest_500.npy")
        protParamList = sparsified_similarity_coordinates
    else:
        i = range(len(proteinList))
        j = range(len(proteinList))
        protParamList = list(itertools.product(i, j))
    protParamListNew = []
    # Prepare parameters for parallel processing these parameters will be
    # used concurrently by different processes
    for tup in tqdm(protParamList):
        i = tup[0]
        j = tup[1]

        if matrix_type == "Sparse":
            protein1 = proteinListNew[i]
            protein2 = proteinListNew[j]
            real = human_proteinSimilarityMatrix.loc[protein1, protein2]
            tupNew = (tup[0], tup[1], aspect, real)
            protParamListNew.append(tupNew)
        else:
            if j > i:
                protein1 = proteinListNew[i]
                protein2 = proteinListNew[j]
                real = human_proteinSimilarityMatrix.loc[protein1, protein2]
                tupNew = (tup[0], tup[1], aspect, real)
                protParamListNew.append(tupNew)

    total_task_num = len(protParamListNew)
    pool = Pool()
    similarity_listRet = []
    #parallelSimilarityPartial = partial(parallelSimilarity,protein_embedding_type)
    for similarity_listRet in tqdm(pool.imap_unordered(parallelSimilarity,
                                                       protParamListNew),
                                   total=total_task_num,
                                   position=0,
                                   leave=True):
        pass
        #time.sleep(0.1)
    pool.close()
    pool.join()

    real_distance_list = [value[0] for value in similarity_listRet]
    cosine_distance_list = [value[1] for value in similarity_listRet]
    manhattan_distance_list = [value[2] for value in similarity_listRet]
    euclidian_distance_list = [value[3] for value in similarity_listRet]

    distance_lists = [
        real_distance_list, cosine_distance_list, manhattan_distance_list,
        euclidian_distance_list
    ]
    if detailed_output:
        report_detailed_distance_scores(representation_name, matrix_type,
                                        aspect, distance_lists)

    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)

    #print("Cosine Correlation for "+aspect+" is " + str(cosineCorr))
    #print("Manhattan Correlation for "+aspect+" is " + str(manhattanCorr))
    #print("Euclidian Correlation for "+aspect+" is " + str(euclidianCorr))

    return (cosineCorr, manhattanCorr, euclidianCorr)
Example #32
        'ngram_range': [(1, 2)]
    }
    models = {}
    if args.nproc == 1:
        for idx, ambig_terms_batch in \
                enumerate(batch_iter(all_ambigs_pmids, 10)):
            pickle_name = 'gilda_ambiguities_hgnc_mesh_%d.pkl' % idx
            models = learn_batch(ambig_terms_batch)
            with open(pickle_name, 'wb') as fh:
                pickle.dump(models, fh)
    else:
        pool = Pool(args.nproc)
        fun = functools.partial(learn_model, params=param_grid)
        pkl_idx = 0
        models = {}
        for count, model in enumerate(
                pool.imap_unordered(fun, all_ambigs_pmids, chunksize=10)):
            print('#### %d ####' % count)
            if model is None:
                print('Model is None, skipping')
            else:
                models[model['ambig'][0].text] = model
            if (count + 1) % 100 == 0:
                pickle_name = 'gilda_ambiguities_hgnc_mesh_%d.pkl' % pkl_idx
                with open(pickle_name, 'wb') as fh:
                    pickle.dump(models, fh)
                pkl_idx += 1
                models = {}
        pool.close()
        pool.join()
Example #33
def main():
    args = get_parser().parse_args()

    # TODO convert to logging

    sys.stderr.write("* Initializing reads file search.\n")
    fast5_reads = fast5utils.iterate_fast5_reads(
        args.input_folder,
        limit=args.limit,
        strand_list=args.input_strand_list,
        recursive=args.recursive)

    if args.scaling is not None:
        sys.stderr.write("* Loading read scaling parameters from {}.\n".format(
            args.scaling))
        all_read_params = get_per_read_params_dict_from_tsv(args.scaling)
        input_read_ids = frozenset(rec[1] for rec in fast5_reads)
        scaling_read_ids = frozenset(all_read_params.keys())
        sys.stderr.write("* {} / {} reads have scaling information.\n".format(
            len(input_read_ids & scaling_read_ids), len(input_read_ids)))
        fast5_reads = [
            rec for rec in fast5_reads if rec[1] in scaling_read_ids
        ]
    else:
        all_read_params = {}

    sys.stderr.write("* Calling reads.\n")
    nbase, ncalled, nread, nsample = 0, 0, 0, 0
    t0 = time.time()
    progress = Progress(quiet=args.quiet)
    startcharacter = '@' if args.fastq else '>'
    initargs = [
        args.device, args.model, args.chunk_size, args.overlap,
        all_read_params, args.alphabet, args.max_concurrent_chunks, args.fastq,
        args.qscore_scale, args.qscore_offset, args.beam, args.posterior,
        args.temperature
    ]
    pool = Pool(args.jobs, initializer=worker_init, initargs=initargs)
    with open_file_or_stdout(args.output) as fh:
        for read_id, basecall, qstring, read_nsample in \
                pool.imap_unordered(worker, fast5_reads):
            if basecall is not None and len(basecall) > 0:
                fh.write("{}{}\n{}\n".format(
                    startcharacter, read_id,
                    basecall[::-1] if args.reverse else basecall))
                nbase += len(basecall)
                ncalled += 1
                if args.fastq:
                    fh.write("+\n{}\n".format(
                        qstring[::-1] if args.reverse else qstring))

            nread += 1
            nsample += read_nsample
            progress.step()
    total_time = time.time() - t0

    sys.stderr.write("* Called {} reads in {:.2f}s\n".format(
        nread, int(total_time)))
    sys.stderr.write("* {:7.2f} kbase / s\n".format(nbase / total_time /
                                                    1000.0))
    sys.stderr.write("* {:7.2f} ksample / s\n".format(nsample / total_time /
                                                      1000.0))
    sys.stderr.write("* {} reads failed.\n".format(nread - ncalled))

    #quantized_model(model)
    return
Example #34
def main():
    if args.perl:
        eval_fn_list = list(glob.glob(args.pred))
    else:
        eval_fn_list = [
            eval_fn for eval_fn in glob.glob(args.pred)
            if not (args.lazy_eval and Path(eval_fn + ".rouge").exists())
        ]
    eval_fn_list = list(
        filter(lambda fn: not (fn.endswith('.post') or fn.endswith('.rouge')),
               eval_fn_list))

    if args.only_eval_best:
        best_epoch_dict = {}
        for dir_path in set(Path(fn).parent for fn in eval_fn_list):
            fn_save = os.path.join(dir_path, 'save_best.dev')
            if Path(fn_save).exists():
                with open(fn_save, 'r') as f_in:
                    __, o_name, __ = f_in.read().strip().split('\n')
                    epoch = o_name.split('.')[1]
                    best_epoch_dict[dir_path] = epoch
        new_eval_fn_list = []
        for fn in eval_fn_list:
            dir_path = Path(fn).parent
            if dir_path in best_epoch_dict:
                if Path(fn).name.split('.')[1] == best_epoch_dict[dir_path]:
                    new_eval_fn_list.append(fn)
        eval_fn_list = new_eval_fn_list

    logger.info("***** Evaluation: %s *****", ','.join(eval_fn_list))
    num_pool = max(1, min(args.processes, len(eval_fn_list)))
    logger.info(args.processes, len(eval_fn_list), num_pool)
    p = Pool(num_pool)
    r_list = p.imap_unordered(process_eval, eval_fn_list)
    r_list = sorted([(fn, scores) for fn, scores in r_list],
                    key=lambda x: x[0])
    rg2_dict = {}
    for fn, scores in r_list:
        logger.info(fn)
        if args.perl:
            print(rouge_results_to_str(scores))
        else:
            rg2_dict[fn] = scores['rouge-2']['f']
            print("ROUGE-1: {}\tROUGE-2: {}\tROUGE-L: {}\n".format(
                scores['rouge-1']['f'], scores['rouge-2']['f'],
                scores['rouge-l']['f']))
            with open(fn + ".rouge", 'w') as f_out:
                f_out.write(
                    json.dumps({
                        'rg1': scores['rouge-1']['f'],
                        'rg2': scores['rouge-2']['f']
                    }))
    p.close()
    p.join()

    if args.save_best:
        # find best results
        group_dict = {}
        for k, v in rg2_dict.items():
            d_name, o_name = Path(k).parent, Path(k).name
            if (d_name not in group_dict) or (v > group_dict[d_name][1]):
                group_dict[d_name] = (o_name, v)
        # compare and save the best result
        for k, v in group_dict.items():
            fn = os.path.join(k, 'save_best.' + args.split)
            o_name_s, rst_s = v
            should_save = True
            if Path(fn).exists():
                with open(fn, 'r') as f_in:
                    rst_f = float(f_in.read().strip().split('\n')[-1])
                if rst_s <= rst_f:
                    should_save = False
            if should_save:
                with open(fn, 'w') as f_out:
                    f_out.write('{0}\n{1}\n{2}\n'.format(k, o_name_s, rst_s))
Example #35
    def update(self, tree, parallel=True):
        # type: (Iterable[Tuple[Text, Optional[Text], bool]], bool) -> bool
        """Update the manifest given an iterable of items that make up the updated manifest.

        The iterable must either generate tuples of the form (SourceFile, True) for paths
        that are to be updated, or (path, False) for items that are not to be updated. This
        unusual API is designed as an optimisation meaning that SourceFile items need not be
        constructed in the case we are not updating a path, but the absence of an item from
        the iterator may be used to remove defunct entries from the manifest."""

        logger = get_logger()

        changed = False

        # Create local variable references to these dicts so we avoid the
        # attribute access in the hot loop below
        data = self._data

        types = data.type_by_path()
        remaining_manifest_paths = set(types)

        to_update = []

        for path, file_hash, updated in tree:
            path_parts = tuple(path.split(os.path.sep))
            is_new = path_parts not in remaining_manifest_paths

            if not updated and is_new:
                # This is kind of a bandaid; if we ended up here the cache
                # was invalid but we've been using it anyway. That's obviously
                # bad; we should fix the underlying issue that we sometimes
                # use an invalid cache. But at least this fixes the immediate
                # problem
                raise InvalidCacheError

            if not updated:
                remaining_manifest_paths.remove(path_parts)
            else:
                assert self.tests_root is not None
                source_file = SourceFile(self.tests_root,
                                         path,
                                         self.url_base,
                                         file_hash)

                hash_changed = False  # type: bool

                if not is_new:
                    if file_hash is None:
                        file_hash = source_file.hash
                    remaining_manifest_paths.remove(path_parts)
                    old_type = types[path_parts]
                    old_hash = data[old_type].hashes[path_parts]
                    if old_hash != file_hash:
                        hash_changed = True
                        del data[old_type][path_parts]

                if is_new or hash_changed:
                    to_update.append(source_file)

        if to_update:
            logger.debug("Computing manifest update for %s items" % len(to_update))
            changed = True


        # 25 items was derived experimentally (2020-01) to be approximately the
        # point at which it is quicker to create a Pool and parallelize update.
        pool = None
        if parallel and len(to_update) > 25 and cpu_count() > 1:
            # On Python 3 on Windows, using >= MAXIMUM_WAIT_OBJECTS processes
            # causes a crash in the multiprocessing module. Whilst this enum
            # can technically have any value, it is usually 64. For safety,
            # restrict manifest regeneration to 48 processes on Windows.
            #
            # See https://bugs.python.org/issue26903 and https://bugs.python.org/issue40263
            processes = cpu_count()
            if sys.platform == "win32" and processes > 48:
                processes = 48
            pool = Pool(processes)

            # chunksize set > 1 when more than 10000 tests, because
            # chunking is a net-gain once we get to very large numbers
            # of items (again, experimentally, 2020-01)
            chunksize = max(1, len(to_update) // 10000)
            logger.debug("Doing a multiprocessed update. CPU count: %s, "
                "processes: %s, chunksize: %s" % (cpu_count(), processes, chunksize))
            results = pool.imap_unordered(compute_manifest_items,
                                          to_update,
                                          chunksize=chunksize
                                          )  # type: Iterator[Tuple[Tuple[Text, ...], Text, Set[ManifestItem], Text]]
        else:
            results = map(compute_manifest_items, to_update)

        for result in results:
            rel_path_parts, new_type, manifest_items, file_hash = result
            data[new_type][rel_path_parts] = manifest_items
            data[new_type].hashes[rel_path_parts] = file_hash

        # Make sure to terminate the Pool, to avoid hangs on Python 3.
        # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool
        if pool is not None:
            pool.terminate()

        if remaining_manifest_paths:
            changed = True
            for rel_path_parts in remaining_manifest_paths:
                for test_data in data.values():
                    if rel_path_parts in test_data:
                        del test_data[rel_path_parts]

        return changed
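The manifest update chooses chunksize = max(1, len(to_update) // 10000) because batching many small tasks per inter-process message is a net gain on very large inputs. A minimal sketch of passing chunksize to imap_unordered (the classify task is hypothetical, not wpt code):

from multiprocessing import Pool, cpu_count

def classify(n):
    # hypothetical cheap per-item task
    return n % 3

if __name__ == "__main__":
    items = list(range(100000))
    chunksize = max(1, len(items) // 10000)  # same heuristic as above
    counts = [0, 0, 0]
    with Pool(cpu_count()) as pool:
        for label in pool.imap_unordered(classify, items, chunksize=chunksize):
            counts[label] += 1
    print(counts)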
Example #36
class BatchRunnerMP(BatchRunner):
    """ Child class of BatchRunner, extended with multiprocessing support. """

    def __init__(self, model_cls, nr_processes=None, **kwargs):
        """ Create a new BatchRunnerMP for a given model with the given
        parameters.

        model_cls: The class of model to batch-run.
        nr_processes: int
                      the number of separate processes the BatchRunner
                      should start, all running in parallel.
        kwargs: the kwargs required for the parent BatchRunner class
        """
        if nr_processes is None:
            # identify the number of processors available on users machine
            available_processors = cpu_count()
            self.processes = available_processors
            print("BatchRunner MP will use {} processors.".format(self.processes))
        else:
            self.processes = nr_processes

        super().__init__(model_cls, **kwargs)
        self.pool = Pool(self.processes)

    def _make_model_args_mp(self):
        """Prepare all combinations of parameter values for `run_all`
        Because the multiprocessing worker must be a @staticmethod, this takes different input than the parent method, hence the similar helper function
        Returns:
            List of list with the form:
            [[model_object, dictionary_of_kwargs, max_steps, iterations]]
        """
        total_iterations = self.iterations
        all_kwargs = []

        count = len(self.parameters_list)
        if count:
            for params in self.parameters_list:
                kwargs = params.copy()
                kwargs.update(self.fixed_parameters)
                # run each iterations specific number of times
                for iter in range(self.iterations):
                    kwargs_repeated = kwargs.copy()
                    all_kwargs.append([self.model_cls, kwargs_repeated, self.max_steps, iter])

        elif len(self.fixed_parameters):
            count = 1
            kwargs = self.fixed_parameters.copy()
            all_kwargs.append(kwargs)

        total_iterations *= count

        return all_kwargs, total_iterations

    @staticmethod
    def _run_wrappermp(iter_args):
        """
        Python multiprocessing requires the @staticmethod decorator here; this is
        primarily to ensure functionality on Windows and does not affect macOS or Linux

        :param iter_args: List of arguments for model run
            iter_args[0] = model object
            iter_args[1] = key word arguments needed for model object
            iter_args[2] = maximum number of steps for model
            iter_args[3] = number of times to run the model for stochastic/random variation with the same parameters
        :return:
            tuple of param values which serves as a unique key for model results
            model object
        """

        model_i = iter_args[0]
        kwargs = iter_args[1]
        max_steps = iter_args[2]
        iteration = iter_args[3]

        # instantiate version of model with correct parameters
        model = model_i(**kwargs)
        while model.running and model.schedule.steps < max_steps:
            model.step()

        # add iteration number to dictionary to make unique_key
        kwargs["iteration"] = iteration

        # convert kwargs dict to tuple to  make consistent
        param_values = tuple(kwargs.values())

        return param_values, model

    def _result_prep_mp(self, results):
        """
        Helper Function
        :param results: Takes results dictionary from Processpool and single processor debug run and fixes format to
        make compatible with BatchRunner Output
        :updates model_vars and agents_vars so consistent across all batchrunner
        """
        # Take results and convert to dictionary so dataframe can be called
        for model_key, model in results.items():
            if self.model_reporters:
                self.model_vars[model_key] = self.collect_model_vars(model)
            if self.agent_reporters:
                agent_vars = self.collect_agent_vars(model)
                for agent_id, reports in agent_vars.items():
                    agent_key = model_key + (agent_id,)
                    self.agent_vars[agent_key] = reports
            if hasattr(model, "datacollector"):
                if model.datacollector.model_reporters is not None:
                    self.datacollector_model_reporters[model_key] = model.datacollector.get_model_vars_dataframe()
                if model.datacollector.agent_reporters is not None:
                    self.datacollector_agent_reporters[model_key] = model.datacollector.get_agent_vars_dataframe()

        # Make results consistent
        if len(self.datacollector_model_reporters.keys()) == 0:
            self.datacollector_model_reporters = None
        if len(self.datacollector_agent_reporters.keys()) == 0:
            self.datacollector_agent_reporters = None

    def run_all(self):
        """
        Run the model at all parameter combinations and store results,
        overrides run_all from BatchRunner.
        """

        run_iter_args, total_iterations = self._make_model_args_mp()
        # register the process pool and init a queue
        # store results in ordered dictionary
        results = {}

        if self.processes > 1:
            with tqdm(total_iterations, disable=not self.display_progress) as pbar:
                for params, model in self.pool.imap_unordered(self._run_wrappermp, run_iter_args):
                    results[params] = model
                    pbar.update()

                self._result_prep_mp(results)
        # For debugging model due to difficulty of getting errors during multiprocessing
        else:
            for run in run_iter_args:
                params, model_data = self._run_wrappermp(run)
                results[params] = model_data

            self._result_prep_mp(results)

        # Close multi-processing
        self.pool.close()

        return (getattr(self, "model_vars", None), getattr(self, "agent_vars", None),
                getattr(self, "datacollector_model_reporters", None),
                getattr(self, "datacollector_agent_reporters", None))
Example #37
def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[],
                                   alpha=0.5, method="OTD",
                                   base=math.e, exp_power=2, proc=cpu_count(), chunksize=None, cache_maxsize=1000000):
    """
    Compute Ricci curvature for edges in the given edge list.

    :param G: A NetworkX graph.
    :param weight: The edge weight used to compute Ricci curvature. Default: "weight".
    :param edge_list: The list of edges to compute Ricci curvature, set to [] to run for all edges in G. Default: [].
    :param alpha: The parameter for the discrete Ricci curvature, ranging from 0 to 1.
                    It is the share of mass left on the original node.
                    E.g. for x -> y, alpha = 0.4 means 0.4 stays on x and 0.6 is spread evenly to x's neighbors.
                    Default: 0.5.
    :param method: Transportation method, "OTD" for Optimal Transportation Distance (Default),
                                          "ATD" for Average Transportation Distance.
                                          "Sinkhorn" for OTD approximated Sinkhorn distance.
    :param base: Base variable for weight distribution. Default: math.e.
    :param exp_power: Exponential power for weight distribution. Default: 2.
    :param proc: Number of processes used for multiprocessing. Default: cpu_count().
    :param chunksize: Chunk size for multiprocessing, set None for auto decide. Default: None.
    :param cache_maxsize: Max size for LRU cache for pairwise shortest path computation.
                            Set this to None for unlimited cache. Default: 1000000.

    :return: output: A dictionary of edge Ricci curvature. E.g.: {(node1, node2): ricciCurvature}.
    """

    if not nx.get_edge_attributes(G, weight):
        print('Edge weight not detected in graph, use "weight" as default edge weight.')
        for (v1, v2) in G.edges():
            G[v1][v2][weight] = 1.0

    # ---set as global variables for multiprocessing use---
    global _Gk
    global _alpha
    global _weight
    global _method
    global _base
    global _exp_power
    global _proc
    global _cache_maxsize
    # -------------------------------------------------------

    _Gk = nk.nxadapter.nx2nk(G, weightAttr=weight)
    _alpha = alpha
    _weight = weight
    _method = method
    _base = base
    _exp_power = exp_power
    _proc = proc
    _cache_maxsize = cache_maxsize

    # Construct nx to nk dictionary
    nx2nk_ndict, nk2nx_ndict = {}, {}
    for idx, n in enumerate(G.nodes()):
        nx2nk_ndict[n] = idx
        nk2nx_ndict[idx] = n

    if edge_list:
        args = [(nx2nk_ndict[source], nx2nk_ndict[target]) for source, target in edge_list]
    else:
        args = [(nx2nk_ndict[source], nx2nk_ndict[target]) for source, target in G.edges()]

    # Start computing edge Ricci curvature
    t0 = time.time()

    p = Pool(processes=_proc)

    # Decide chunksize following method in map_async
    if chunksize is None:
        chunksize, extra = divmod(len(args), proc * 4)
        if extra:
            chunksize += 1

    # Compute Ricci curvature for edges
    result = p.imap_unordered(_wrap_compute_single_edge, args, chunksize=chunksize)
    p.close()
    p.join()

    # Convert edge index from nk back to nx for final output
    output = {}
    for rc in result:
        for k in list(rc.keys()):
            output[(nk2nx_ndict[k[0]], nk2nx_ndict[k[1]])] = rc[k]

    logger.info("%8f secs for Ricci curvature computation." % (time.time() - t0))

    return output
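
A minimal usage sketch for the function above: edge curvatures for NetworkX's karate-club graph, with the OTD method and two worker processes. Since that graph has no weight attribute, the function falls back to a uniform weight of 1.0 as shown in the code.

# Hedged sketch: assumes the surrounding module (networkit adapter, _wrap_compute_single_edge,
# logger) is available, since the function relies on them.
import networkx as nx

G = nx.karate_club_graph()
curvatures = _compute_ricci_curvature_edges(G, alpha=0.5, method="OTD", proc=2)
for (u, v), k in list(curvatures.items())[:5]:
    print("(%d, %d) -> %.4f" % (u, v, k))
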
Пример #38
0
def downloadReport(i, remove=False):
    '''
    Download the 2017 corporate social responsibility report as a PDF; the parameter i is the
    row index of the company in dfDownload.
    '''
    filename = r"./pdf/{}{}.pdf".format(dfDownload["code"].iloc[i],
                                        dfDownload["title"].iloc[i])
    if os.path.exists(filename):
        if remove:
            os.remove(filename)
            print("{} removed".format(filename))
        else:
            print("{} exists".format(filename))
            return None
    response = getWeb(dfDownload["reportDownload"].iloc[i],
                      proxies=proxies,
                      Return="",
                      sleep=True,
                      sleepMultiply=3)
    with open(filename, "wb") as f:
        f.write(response.content)
    print("{} downloaded successfully".format(filename))
    return response


from multiprocessing import Pool

pool = Pool()
res = pool.imap_unordered(downloadReport, range(dfDownload.shape[0]))
resultPDF = [item for item in res]
pool.close()
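
As written above, the pool is started at module import time. A minimal sketch of the same download loop behind a __main__ guard, passing the remove flag through functools.partial; dfDownload and downloadReport are assumed to be defined as above.

# Hedged sketch: dfDownload and downloadReport are assumed to be defined as above.
from functools import partial
from multiprocessing import Pool

if __name__ == "__main__":
    with Pool() as pool:
        download = partial(downloadReport, remove=False)
        for _ in pool.imap_unordered(download, range(dfDownload.shape[0])):
            pass
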
Пример #39
0
def db_MapDB(params):
    params = utils.load_paramDict(params)
    params['dbtype'] = params.get('dbtype', 'bowtie2')
    db_columns = [
        c for c in params['db_columns'] + params['metadata_columns'] +
        params['taxa_columns'] if c not in ('sha256',)
    ]

    assert params.get('seqlist', None) is not None, 'seqlist is required. '

    data = utils.load_database(**params)

    if params['seqlist'] in ('stdin', '-', ''):
        fin = sys.stdin
    else:
        fin = open(params['seqlist'])
    glist = pd.read_csv(fin, delimiter='\t', dtype='str')
    fin.close()

    mapdb = params['MapDB']
    mapdb = os.path.join(params['bowtie_db'], mapdb)
    start_id = 0

    indices = {i: 1 for i in glist['index'].tolist()}

    if len(glob.glob(mapdb + '.*')) > 0:
        assert params.get('mode', '') in (
            'overwrite', 'append'
        ), 'Old database with same name present. You have to use a new name with "MapDB=", or choose between "mode=overwrite" and "mode=append".'
        if params.get('mode', '') == 'overwrite':
            for fname in glob.glob(mapdb + '.*'):
                os.unlink(fname)
        elif params.get('mode', '') == 'append':
            for fname in glob.glob(mapdb + '.*.taxa.gz'):
                i = int(fname.rsplit('.', 3)[1])
                if i >= start_id:
                    start_id = i + 1
                with gzip.open(fname) as fin:
                    for line in fin:
                        indices[line.strip().split()[1]] = 2
    data = data.set_index('index', drop=False)
    data['size'] = data['size'].astype(int)
    data = data.loc[[i for i, t in indices.iteritems()
                     if t == 1]].sort_values(by=['size'], ascending=[False])
    min_file_num = int(np.ceil(
        np.sum(data['size']).astype(float) / 3800000000))

    buckets = [[0, []] for n in xrange(min_file_num)]
    id = -1
    for index, size, file_path, url_path in data[[
            'index', 'size', 'file_path', 'url_path'
    ]].as_matrix():
        size, done = int(size), 0
        for id in range(id + 1, len(buckets)) + range(id + 1):
            b = buckets[id]
            if b[0] + size <= 3800000000:
                b[0] += size
                b[1].append([index, size, file_path, url_path])
                done = 1
                break
        if done == 0:
            buckets.append([size, [[index, size, file_path, url_path]]])
    if params['dbtype'] == 'bowtie2':
        pool = Pool(min(params['n_thread'], len(buckets)))
        result = pool.imap_unordered(create_db, [[
            params['bowtie2_build'], mapdb, start_id + id, bucket[1],
            params['dbtype']
        ] for id, bucket in enumerate(buckets)])
    else:
        result = map(create_db, [[
            params['malt_build'], mapdb, start_id + id, bucket[1],
            params['dbtype']
        ] for id, bucket in enumerate(buckets)])
    for r in result:
        if r[2] != 0:
            print 'Database {0}.{1} FAILED with code {2}!'.format(*r)

    with open(mapdb + '.info', 'w') as fout:
        for id, bucket in enumerate(buckets):
            for b, _, _, _ in bucket[1]:
                fout.write('{0}\t{1}\n'.format(b, id + start_id))
    print 'Done'

if __name__ == '__main__':
    db_MapDB(
        dict([[k.strip() for k in arg.split('=', 1)]
              for arg in sys.argv[1:]]))
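
The bucket loop above packs genomes into databases capped at 3,800,000,000 bases using a round-robin first-fit scan. A simplified, standalone sketch of the same idea (plain first-fit, largest items first), with illustrative (name, size) pairs:

# Hedged sketch of the bucketing used above, reduced to (name, size) pairs.
CAP = 3800000000

def pack_first_fit(items):
    buckets = []  # each bucket is [total_size, [(name, size), ...]]
    for name, size in sorted(items, key=lambda x: -x[1]):  # largest first, as above
        for bucket in buckets:
            if bucket[0] + size <= CAP:
                bucket[0] += size
                bucket[1].append((name, size))
                break
        else:
            buckets.append([size, [(name, size)]])
    return buckets

print(pack_first_fit([("a", 3000000000), ("b", 2000000000), ("c", 1500000000)]))
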
Пример #40
0
    temp = temp[temp.end_sta_lon.apply(lambda x: npy.ceil(x) in [-72,-73,-74,-75])]
    
    temp.bike_id = temp.bike_id.fillna('0')
    temp.user = temp.user.fillna(method='bfill')
    # Fill missing birth fields with 0
    temp.birth = temp.birth.fillna(0)
    temp.gender = temp.gender.fillna(method='ffill')
    temp = temp.dropna(axis=0)
    
    #Write output to a file
    file_name = str(file).split('.')[0]
    temp.to_csv(path+file_name+'_cleaned.csv',encoding=ftype,header=True,index=False)
    


if __name__ == '__main__':
    
    f = open('C:/Users/Naveen/Downloads/Springboard/GitHub/new_york_citibikes/data/rides/extracted.json','r')
    file_names = json.load(f)
    f.close()
    names = list(file_names.keys())
    
    #Multiprocessing
    t = time.time()
    p = Pool()
    #Display progress
    for i, _ in enumerate(p.imap_unordered(clean, names), 1):
        sys.stderr.write('\rdone {0:%}'.format(i/len(names)))
    p.close()
    p.join()
    print("Completed in.....", time.time()-t)
Пример #41
0
def preprocess():
    """Run preprocessing process and compute statistics for normalizing."""
    config = parse_and_config()

    dataset_processor = {
        "ljspeech": LJSpeechProcessor,
        "kss": KSSProcessor,
        "libritts": LibriTTSProcessor,
        "baker": BakerProcessor,
        "thorsten": ThorstenProcessor,
    }

    dataset_symbol = {
        "ljspeech": LJSPEECH_SYMBOLS,
        "kss": KSS_SYMBOLS,
        "libritts": LIBRITTS_SYMBOLS,
        "baker": BAKER_SYMBOLS,
        "thorsten": THORSTEN_SYMBOLS,
    }

    dataset_cleaner = {
        "ljspeech": "english_cleaners",
        "kss": "korean_cleaners",
        "libritts": None,
        "baker": None,
        "thorsten": "german_cleaners",
    }

    logging.info(f"Selected '{config['dataset']}' processor.")
    processor = dataset_processor[config["dataset"]](
        config["rootdir"],
        symbols=dataset_symbol[config["dataset"]],
        cleaner_names=dataset_cleaner[config["dataset"]],
    )

    # check output directories
    build_dir = lambda x: [
        os.makedirs(os.path.join(config["outdir"], x, y), exist_ok=True)
        for y in ["raw-feats", "wavs", "ids", "raw-f0", "raw-energies"]
    ]
    build_dir("train")
    build_dir("valid")

    # save pretrained-processor to feature dir
    processor._save_mapper(
        os.path.join(config["outdir"], f"{config['dataset']}_mapper.json"),
        extra_attrs_to_save={"pinyin_dict": processor.pinyin_dict}
        if config["dataset"] == "baker" else {},
    )

    # build train test split
    if config["dataset"] == "libritts":
        train_split, valid_split, _, _ = train_test_split(
            processor.items,
            [i[-1] for i in processor.items],
            test_size=config["test_size"],
            random_state=42,
            shuffle=True,
        )
    else:
        train_split, valid_split = train_test_split(
            processor.items,
            test_size=config["test_size"],
            random_state=42,
            shuffle=True,
        )
    logging.info(f"Training items: {len(train_split)}")
    logging.info(f"Validation items: {len(valid_split)}")

    get_utt_id = lambda x: os.path.split(x[1])[-1].split(".")[0]
    train_utt_ids = [get_utt_id(x) for x in train_split]
    valid_utt_ids = [get_utt_id(x) for x in valid_split]

    # save train and valid utt_ids to track later
    np.save(os.path.join(config["outdir"], "train_utt_ids.npy"), train_utt_ids)
    np.save(os.path.join(config["outdir"], "valid_utt_ids.npy"), valid_utt_ids)

    # define map iterator
    def iterator_data(items_list):
        for item in items_list:
            yield processor.get_one_sample(item)

    train_iterator_data = iterator_data(train_split)
    valid_iterator_data = iterator_data(valid_split)

    p = Pool(config["n_cpus"])

    # preprocess train files and get statistics for normalizing
    partial_fn = partial(gen_audio_features, config=config)
    train_map = p.imap_unordered(
        partial_fn,
        tqdm(train_iterator_data,
             total=len(train_split),
             desc="[Preprocessing train]"),
        chunksize=10,
    )
    # init scaler for multiple features
    scaler_mel = StandardScaler(copy=False)
    scaler_energy = StandardScaler(copy=False)
    scaler_f0 = StandardScaler(copy=False)

    id_to_remove = []
    for result, mel, energy, f0, features in train_map:
        if not result:
            id_to_remove.append(features["utt_id"])
            continue
        save_features_to_file(features, "train", config)
        # partial fitting of scalers
        if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
            id_to_remove.append(features["utt_id"])
            continue
        scaler_mel.partial_fit(mel)
        scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
        scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

    if len(id_to_remove) > 0:
        np.save(
            os.path.join(config["outdir"], "train_utt_ids.npy"),
            [i for i in train_utt_ids if i not in id_to_remove],
        )
        logging.info(
            f"removed {len(id_to_remove)} utterances because of too many outliers or bad MFA extraction"
        )

    # save statistics to file
    logging.info("Saving computed statistics.")
    scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"),
                   (scaler_f0, "_f0")]
    save_statistics_to_file(scaler_list, config)

    # preprocess valid files
    partial_fn = partial(gen_audio_features, config=config)
    valid_map = p.imap_unordered(
        partial_fn,
        tqdm(valid_iterator_data,
             total=len(valid_split),
             desc="[Preprocessing valid]"),
        chunksize=10,
    )
    for *_, features in valid_map:
        save_features_to_file(features, "valid", config)
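
The scalers above are fitted incrementally as feature chunks stream out of the pool. A self-contained sketch of that StandardScaler.partial_fit pattern on synthetic data (shapes are illustrative):

# Hedged sketch: streaming standardization with partial_fit, as used above.
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(copy=False)
for _ in range(10):                       # e.g. one chunk per processed utterance
    chunk = np.random.randn(100, 80)      # (frames, n_mels)-shaped features
    scaler.partial_fit(chunk)

print(scaler.mean_.shape, scaler.scale_.shape)   # per-dimension statistics: (80,), (80,)
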
Пример #42
0
def create_length_distributions(db_path, cores, qnames=False, region=False):
    '''
	input: db_path - path to the database from hoobari's patch; cores - number of cores to use for
	multiprocessing; db_prefix - if hoobari's patch was run for many different regions, it creates
	many DBs with the same prefix. Since all those databases are required in this step, the prefix
	is also required.
	output: a tuple of two pandas dataframes containing fragment length distributions, calculated
	using *all* the databases.
	'''

    # if the number of cores was specified - use it. if not, and there's more than 1 core, use all cores
    # except for one.
    # if cores:
    # 	pool = Pool(int(cores))
    # else:
    # 	if cpu_count() > 1:
    # 		pool = Pool(cpu_count() - 1)
    # 	else:
    # 		pool = Pool(1)
    pool = Pool(int(cores))

    db_path = os.path.abspath(db_path)
    if os.path.isfile(db_path):
        db_files = [db_path]
    elif os.path.isdir(db_path):
        if region:
            db_files_loc = db_path
            db_files = [
                os.path.join(db_files_loc, f) for f in os.listdir(db_files_loc)
                if f.endswith('.db')
            ]
        else:
            sys.exit(
                'If no database file is specified, a region must be specified using hoobari -r chrN:NNN-NNN'
            )
    else:
        sys.exit(
            'Please specify a database file or directory using hoobari -d LOCATION, '
            'and a region using hoobari -r chrN:NNN-NNN')

    # run the function get_fetal_and_shared_lengths for each path in db_files
    # pooled_results = pool.map(get_fetal_and_shared_lengths, db_files)
    get_qnames_and_alleles_with_args = partial(get_fetal_and_shared_lengths,
                                               qnames=qnames)
    # pooled_results = pool.map(get_qnames_and_alleles_with_args, db_files) #TODO: use pool.imap_unordered(func, iterable[, chunksize])
    # pool.close()
    # pool.join()

    # create two lists, one with all the shared fragments results, and one for the fetal fragments results

    con = db.Variants(db_files[0], probe=False)
    try:
        shared_lengths = con.getSharedLengths()
        fetal_lengths = con.getFetalLengths()
    except:
        sys.exit(str(db_path))

    if qnames:
        fetal_qnames_set, shared_qnames_set = con.getFetalSharedQnames()

    for tup in pool.imap_unordered(get_qnames_and_alleles_with_args,
                                   db_files[1:], 100):
        shared_lengths = shared_lengths.add(tup[0], fill_value=0)
        fetal_lengths = fetal_lengths.add(tup[1], fill_value=0)
        if qnames:
            fetal_qnames_set.update(tup[3])
            shared_qnames_set.update(tup[2])

    # for db_path in db_files[1:]:
    # 	con = db.Variants(db_path, probe=False)
    # 	shared_lengths = shared_lengths.add(con.getSharedLengths(), fill_value=0)
    # 	fetal_lengths = fetal_lengths.add(con.getFetalLengths(), fill_value=0)
    # 	if qnames:
    # 		tup = con.getFetalSharedQnames()
    # 		shared_qnames_set.update(tup[1])
    # 		fetal_qnames_set.update(tup[0])

    if qnames:
        with open('shared_qnames_list.txt', 'w') as f:
            for q in shared_qnames_set:
                print(q, file=f)
        with open('fetal_qnames_list.txt', 'w') as f:
            for q in fetal_qnames_set:
                print(q, file=f)

    # with pd.option_context('display.max_rows', None):
    # pulled_lengths = shared_lengths.add(fetal_lengths, fill_value=0)
    # pulled_lengths = pulled_lengths[pulled_lengths.index < 1001]
    # pulled_lengths_densities = pulled_lengths / pulled_lengths.sum()
    # printverbose('pulled_lengths_densities')
    # lpulled = list(pulled_lengths_densities['COUNT(length)'])
    # lpulled = list(pulled_lengths_densities['COUNT(length)'])
    # printverbose(lpulled)

    # zeros = pd.DataFrame(np.zeros((1001,1)))

    # printverbose('maternal lengths')
    # maternal_lengths = shared_lengths - fetal_lengths
    # maternal_lengths = maternal_lengths.add(zeros, fill_value=0)
    # maternal_lengths = maternal_lengths.fillna(0).clip(lower = 0)
    # maternal_lengths = maternal_lengths[maternal_lengths.index < 1001]
    # printverbose(list(maternal_lengths['COUNT(length)']))

    # printverbose('fetal lengths')
    # fetal_lengths = fetal_lengths.add(zeros, fill_value=0)
    # fetal_lengths = fetal_lengths[fetal_lengths.index < 1001]
    # printverbose(list(fetal_lengths['COUNT(length)']))

    return (shared_lengths, fetal_lengths)
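
Each worker returns per-length count tables, and the loop above merges them with DataFrame.add(..., fill_value=0) so lengths missing from one table are treated as zero. A tiny self-contained sketch of that accumulation:

# Hedged sketch: accumulating count tables with fill_value=0, as in the loop above.
import pandas as pd

total = pd.DataFrame({"count": [5, 2]}, index=[100, 150])
partial_counts = pd.DataFrame({"count": [1, 4]}, index=[150, 200])

total = total.add(partial_counts, fill_value=0)
print(total)   # 100 -> 5.0, 150 -> 3.0, 200 -> 4.0
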
Пример #43
0
def main():
    parser = argparse.ArgumentParser(description='Compares all entries in a '
                                     'fasta file using MASH')

    main_options = parser.add_argument_group('Main options')
    main_options.add_argument('-i',
                              '--input_references',
                              dest='inputfile',
                              nargs='+',
                              required=True,
                              help='Provide the input fasta files to parse.')
    main_options.add_argument('-o',
                              '--output',
                              dest='output_tag',
                              required=True,
                              help='Provide an output tag.')
    main_options.add_argument('-t',
                              '--threads',
                              dest='threads',
                              default="1",
                              help='Provide the number of threads to be used. '
                              'Default: 1.')

    mash_options = parser.add_argument_group('MASH related options')
    mash_options.add_argument(
        '-k',
        '--kmers',
        dest='kmer_size',
        default="21",
        help='Provide the number of k-mers to be provided to mash '
        'sketch. Default: 21.')
    mash_options.add_argument('-p',
                              '--pvalue',
                              dest='pvalue',
                              default="0.05",
                              help='Provide the p-value to '
                              'consider a distance '
                              'significant. Default: '
                              '0.05.')
    mash_options.add_argument('-md',
                              '--mashdist',
                              dest='mashdistance',
                              default="0.1",
                              help='Provide the maximum mash '
                              'distance to be parsed to '
                              'the matrix. Default: 0.1.')

    other_options = parser.add_argument_group('Other options')
    other_options.add_argument('-rm',
                               '--remove',
                               dest='remove',
                               action='store_true',
                               help='Remove any temporary '
                               'files and folders not '
                               'needed (not present '
                               'in results '
                               'subdirectory).')
    other_options.add_argument('-hist',
                               '--histograms',
                               dest='histograms',
                               action='store_true',
                               help='Checks the '
                               'distribution of '
                               'distances values  '
                               'plotting histograms')
    args = parser.parse_args()

    threads = args.threads
    kmer_size = args.kmer_size
    pvalue = args.pvalue
    mashdist = args.mashdistance

    ## lists all fastas given to argparser
    fastas = [
        f for f in args.inputfile
        if f.endswith((".fas", ".fasta", ".fna", ".fsa", ".fa"))
    ]

    ## creates output directory tree
    output_tag = args.output_tag.replace("/", "")  ## in case the user gives an
    # output tag that is already a folder
    mother_directory = output_tree(fastas[0], output_tag)

    ## checks if multiple fastas are provided or not avoiding master_fasta
    # function
    print("***********************************")
    print("Creating main database...\n")
    main_fasta, sequence_info = master_fasta(fastas, output_tag,
                                             mother_directory)

    #########################
    ### genera block here ###
    #########################

    ## runs mash related functions
    print("***********************************")
    print("Sketching reference...\n")
    ref_sketch = sketch_references(main_fasta, output_tag, threads, kmer_size,
                                   mother_directory)

    ## breaks master fasta into multiple fastas with one genome each
    print("***********************************")
    print("Making temporary files for each genome in fasta...\n")
    genomes = genomes_parser(main_fasta, output_tag, mother_directory)

    ## This must be multiprocessed since it is extremely fast to do mash
    # against one plasmid sequence
    print("***********************************")
    print("Sketching genomes and running mash distances...\n")

    pool = Pool(int(threads))  # Create a multiprocessing Pool
    mp = pool.imap_unordered(
        partial(multiprocess_mash, ref_sketch, main_fasta, output_tag,
                kmer_size, mother_directory),
        genomes)  # process genomes iterable with pool

    ## loop to print a nice progress bar
    try:
        for _ in tqdm.tqdm(mp, total=len(genomes)):
            pass
    except:
        print("progress will not be tracked because of 'reasons'... check if "
              "you have tqdm package installed.")
    pool.close()
    pool.join()  ## needed in order for the process to end before the
    # remaining options are triggered
    # print
    # print "Finished MASH... uf uf uf!"

    ## Makes distances matrix csv file
    # print
    # print "***********************************"
    # print "Creating distance matrix..."
    # print
    lists_traces = mash_distance_matrix(mother_directory, sequence_info,
                                        pvalue, mashdist, threads)

    ## remove master_fasta
    if args.remove:
        # print "***********************************"
        # print "Removing temporary files and folders..."
        # print
        os.remove(main_fasta)
        for d in os.listdir(mother_directory):
            if d != "results":
                shutil.rmtree(os.path.join(mother_directory, d))

    ## Histograms
    if args.histograms:
        # print "***********************************"
        # print "Outputing histograms..."
        # print
        plot_histogram(lists_traces, output_tag, mother_directory)
Пример #44
0
        dict(type=decoder_to_pipeline_prefix[args.decoder] + 'Decode')
    ]
    dataset = build_dataset(cfg.data[args.split],
                            dict(test_mode=(args.split != 'train')))

    # prepare for checking
    if os.path.exists(args.output_file):
        # remove existing output file
        os.remove(args.output_file)
    pool = Pool(args.num_processes)
    lock = Manager().Lock()
    worker_fn = partial(_do_check_videos, lock, dataset, args.output_file)
    ids = range(len(dataset))

    # start checking
    for _ in tqdm(pool.imap_unordered(worker_fn, ids), total=len(ids)):
        pass
    pool.close()
    pool.join()

    # print results and release resources
    with open(args.output_file, 'r') as f:
        num_corrupted = len(f.readlines())
    print(f'Checked {len(dataset)} videos, '
          f'{num_corrupted} is/are corrupted/missing.')

    if args.remove_corrupted_videos:
        print('Start deleting corrupted videos')
        cnt = 0
        with open(args.output_file, 'r') as f:
            for line in f:
                if os.path.exists(line.strip()):
Пример #45
0
        if len(times[x]) == runs and x not in printed_res:
            human_r = str(
                datetime.timedelta(seconds=sum(times[x]) / len(times[x])))
            print(
                str(x).ljust(padding),
                str(round(statistics.mean(times[x]), 5)).ljust(padding),
                str(round(statistics.stdev(times[x]), 5)).ljust(padding),
                human_r.ljust(padding))
            printed_res.add(x)
    sys.stdout.flush()


args = parse_args()
times = defaultdict(list)
graphs = defaultdict(list)
printed_res = set()

arg_list = []
for x in range(args.s, args.e + 1, args.d):
    arg_list += [x] * args.r
pool = Pool(args.p)
print_res(times, args.r)
for x, g, t in pool.imap_unordered(gen_graph, arg_list):
    times[x].append(t)
    graphs[x].append(g)
    print_res(times, args.r)
    if args.S:
        nx.write_graphml(
            g,
            "%s/internet-AS-graph-%d-%d.graphml" % (args.S, x, len(graphs[x])))
Пример #46
0
                                                 pblh[ lista ], num_pbl[ lista ] ) ) )
        elif DAY and (not explicitAerosol):
            namelist_iter = iter( np.column_stack( ( pres0[ lista ], level[ lista ], case[ lista ], nzp[ lista ], dz[ lista ], q_inv[ lista ], tpot_inv[ lista ], lwp[ lista ], tpot_pbl[ lista ], \
                                                 pblh[ lista ], num_pbl[ lista ], cntlat[ lista ] ) ) )
        elif (not DAY) and explicitAerosol:
            namelist_iter = iter( np.column_stack( ( pres0[ lista ], level[ lista ], case[ lista ], nzp[ lista ], dz[ lista ], q_inv[ lista ], tpot_inv[ lista ], lwp[ lista ], tpot_pbl[ lista ], \
                                                 pblh[ lista ], num_ks[ lista ], num_as[ lista ], num_cs[ lista ], dpg_as[ lista ] ) ))
        elif DAY and explicitAerosol:
            namelist_iter = iter( np.column_stack( ( pres0[ lista ], level[ lista ], case[ lista ], nzp[ lista ], dz[ lista ], q_inv[ lista ], tpot_inv[ lista ], lwp[ lista ], tpot_pbl[ lista ], \
                                                 pblh[ lista ], num_ks[ lista ], num_as[ lista ], num_cs[ lista ], dpg_as[ lista ], cntlat[ lista ] ) ) )

        sound_in_iter = iter( np.column_stack( ( pres0[ lista ], windprofile[ lista ], case[ lista ], q_inv[ lista ], tpot_inv[ lista ], q_pbl[ lista ], tpot_pbl[ lista ], pblh[ lista ], \
                                                   dz[ lista ], nzp[ lista ] ) ) )

        # run as unordered parallel processes
        for k in pool.imap_unordered(write_namelist, namelist_iter):
            pass
        for i in pool.imap_unordered(write_sound_in, sound_in_iter):
            pass


#def dycoms():
#    call(['rm','-rf', rootfolder+'*'])
#    case = 'dycoms'
#    q_inv = 4.45
#    tpot_inv = 6.7
#    q_pbl = 9.45
#    tpot_pbl = 288.3
#    pblh = 795.
#    write_sound_in( case, q_inv, tpot_inv, q_pbl, tpot_pbl, pblh)
#    write_namelist( case, 20., 660. )
Пример #47
0
def _extract_features_parallel_per_kind(kind_to_df_map,
                                        column_id, column_value,
                                        default_fc_parameters,
                                        kind_to_fc_parameters=None,
                                        chunksize=defaults.CHUNKSIZE,
                                        n_processes=defaults.N_PROCESSES, show_warnings=defaults.SHOW_WARNINGS,
                                        disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                                        impute_function=defaults.IMPUTE_FUNCTION):
    """
    Parallelize the feature extraction per kind.

    :param kind_to_df_map: The time series to compute the features for in our internal format
    :type kind_to_df_map: dict of pandas.DataFrame

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the class:`ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value), will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param chunksize: The size of one chunk for the parallelisation
    :type chunksize: None or int

    :param n_processes: The number of processes to use for parallelisation.
    :type n_processes: int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param impute_function: None, if no imputing should happen or the function to call for imputing.
    :type impute_function: None or function

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    partial_extract_features_for_one_time_series = partial(_extract_features_for_one_time_series,
                                                           column_id=column_id,
                                                           column_value=column_value,
                                                           default_fc_parameters=default_fc_parameters,
                                                           kind_to_fc_parameters=kind_to_fc_parameters,
                                                           show_warnings=show_warnings)
    pool = Pool(n_processes)

    if not chunksize:
        chunksize = _calculate_best_chunksize(kind_to_df_map, n_processes)

    total_number_of_expected_results = len(kind_to_df_map)
    extracted_features = tqdm(pool.imap_unordered(partial_extract_features_for_one_time_series, kind_to_df_map.items(),
                                                  chunksize=chunksize), total=total_number_of_expected_results,
                              desc="Feature Extraction", disable=disable_progressbar)
    pool.close()

    # Concatenate all partial results
    result = pd.concat(extracted_features, axis=1, join='outer').astype(np.float64)

    # Impute the result if requested
    if impute_function is not None:
        impute_function(result)

    pool.join()
    return result
Пример #48
0
                               ph_sub['n_cases'], ph_sub['n_controls'])

    # bake in globals
    ldsc_h2_map = partial(ldsc_h2_part,
                          wd=wd,
                          ld_ref_panel=ld_ref_panel,
                          ld_w_panel=ld_w_panel,
                          ld_frq_panel=ld_frq_panel,
                          ss_bucket=ss_bucket,
                          sex_group=str(sex_group))

    # dispatch
    print "Starting ldsc..."
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    pool = Pool(num_proc)
    results = pool.imap_unordered(ldsc_h2_map, iter_args)
    pool.close()
    pool.join()

    ####
    # Load output to dataframe
    print "Processing results..."
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    ####
    dat = pd.concat(results)
    # dat = pd.DataFrame(index=ph_list, columns=col_ord)
    # for res in results:
    #     dat.update(pd.DataFrame(data=res, index=pd.Series(res[0][0]), columns=col_ord))

    ####
    # write results to file
Пример #49
0
def main():
    """
        Description: Main function
    """

    # Argument parsing
    args = parse_arguments()

    # Create the directory if it does not exist.
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Creating word list
    lang_dict = load_dict(args.language)

    # Create font (path) list
    if len(args.font_type) == 0:
        fonts = load_fonts(args.language)
    else:
        font = args.font_type

    # Creating synthetic sentences (or word)
    strings = []

    if args.use_wikipedia:
        strings = create_strings_from_wikipedia(args.length, args.count,
                                                args.language)
    elif args.input_file != '':
        strings = create_strings_from_file(args.input_file, args.count)
    elif args.random_sequences:
        strings = create_strings_randomly(args.length, args.random, args.count,
                                          args.include_letters,
                                          args.include_numbers,
                                          args.include_symbols, args.language)
        # Set a name format compatible with special characters automatically if they are used
        if args.include_symbols or True not in (args.include_letters,
                                                args.include_numbers,
                                                args.include_symbols):
            args.name_format = 2
    else:
        strings = create_strings_from_dict(args.length, args.random,
                                           args.count, lang_dict)

    string_count = len(strings)

    p = Pool(args.thread_count)
    if len(args.font_type) == 0:
        for _ in tqdm(p.imap_unordered(
                FakeTextDataGenerator.generate_from_tuple,
                zip([i for i in range(0, string_count)], strings, [
                    fonts[random.randrange(0, len(fonts))]
                    for _ in range(0, string_count)
                ], [args.output_dir] * string_count, [args.format] *
                    string_count, [args.extension] * string_count,
                    [args.skew_angle] * string_count, [args.random_skew] *
                    string_count, [args.blur] * string_count,
                    [args.random_blur] * string_count,
                    [args.background] * string_count,
                    [args.distorsion] * string_count,
                    [args.distorsion_orientation] * string_count,
                    [args.handwritten] * string_count, [args.name_format] *
                    string_count, [args.width] * string_count,
                    [args.alignment] * string_count,
                    [args.text_color] * string_count)),
                      total=args.count):
            pass
    else:
        for _ in tqdm(p.imap_unordered(
                FakeTextDataGenerator.generate_from_tuple,
                zip([i for i in range(0, string_count)], strings,
                    [font for _ in range(0, string_count)], [args.output_dir] *
                    string_count, [args.format] * string_count,
                    [args.extension] * string_count, [args.skew_angle] *
                    string_count, [args.random_skew] * string_count,
                    [args.blur] * string_count, [args.random_blur] *
                    string_count, [args.background] * string_count,
                    [args.distorsion] * string_count,
                    [args.distorsion_orientation] * string_count,
                    [args.handwritten] * string_count, [args.name_format] *
                    string_count, [args.width] * string_count,
                    [args.alignment] * string_count,
                    [args.text_color] * string_count)),
                      total=args.count):
            pass
    p.terminate()

    if args.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args.output_dir, "labels.txt"),
                  'w',
                  encoding="utf8") as f:
            for i in range(string_count):
                file_name = str(i) + "." + args.extension
                f.write("{} {}\n".format(file_name, strings[i]))
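
The long zip calls above repeat every scalar argument string_count times via list multiplication. An equivalent construction with itertools.repeat avoids materializing those lists; the sketch below is reduced to a few illustrative fields and is not the generator's actual argument order.

# Hedged sketch: building per-string argument tuples with itertools.repeat
# instead of `[value] * string_count` (field set reduced for illustration).
from itertools import repeat

def build_args(strings, fonts, output_dir, fmt, extension):
    return zip(range(len(strings)),   # zip stops at the shortest iterable
               strings,
               fonts,                 # one font per string
               repeat(output_dir),    # lazily repeated for every string
               repeat(fmt),
               repeat(extension))

for args in build_args(["foo", "bar"], ["a.ttf", "b.ttf"], "out", 64, "jpg"):
    print(args)
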
Пример #50
0
def testmain(**argdict):
    argdict = defaultdict(lambda: None, argdict)
    scriptdir = os.path.dirname(os.path.realpath(sys.argv[0]))+"/"
    samplefilename = argdict["samplefile"]
    
    sampledata = samplefile(argdict["samplefile"])
    trnafile = argdict["trnafile"]
    logfile = argdict["logfile"]
    mapfile = argdict["mapfile"]
    bowtiedb = argdict["bowtiedb"]
    lazycreate = argdict["lazy"]
    minnontrnasize = argdict["minnontrnasize"]
    bamdir = argdict["bamdir"]
    trnamapfile = argdict["trnamapfile"]
    if bamdir is None:
        bamdir = "./"
    
    if "cores" in argdict:
        cores = int(argdict["cores"])
    else:
        cores = min(8,cpu_count())
    #sys.exit()
    print >>sys.stderr,"cores: "+str(cores)
    workingdir = bamdir
    #samplefile = open(args.samplefile)
    
    samples = sampledata.getsamples()
    
    trnafile = trnafile
    print >>sys.stderr, "logging to "+logfile
    if logfile and lazycreate:
        logfile = open(logfile,'a')
        print >>logfile, "New mapping"
    elif logfile:
        logfile = open(logfile,'w')
    else:
        logfile = sys.stderr

    unmaps = defaultdict(int)
    singlemaps = defaultdict(int)
    multimaps = defaultdict(int)
    totalreads = defaultdict(int)
    
    if not os.path.isfile(bowtiedb+".fa"):
        print >>sys.stderr, "No bowtie2 database "+bowtiedb
        sys.exit(1)
    badsamples = list()
    for samplename in samples:
        bamfile = workingdir+samplename
        
        if lazycreate and os.path.isfile(bamfile+".bam"):   
            if not checkheaders(bamfile+".bam", sampledata.getfastq(samplename)):
                badsamples.append(bamfile+".bam")

                
            
        else:
            if os.path.isfile(bamfile+".bam"):

                if not checkheaders(bamfile+".bam", sampledata.getfastq(samplename)):
                    badsamples.append(bamfile+".bam")
    
    if len(badsamples) > 0:
        print >>sys.stderr, "Bam files "+",".join(badsamples)+" does not match fq files"
        print >>sys.stderr, "Aborting"
        sys.exit(1)               
    #'samtools sort -T '+tempfile.gettempdir()+"/"+outfile+'temp - -o '+outfile+'.bam'
    tempfilesover = list()
    missingfqfiles = list()
    for samplename in samples:
        #redundant but ensures compatibility
        bamfile = workingdir+samplename
        temploc = os.path.basename(bamfile)
        #print >>sys.stderr, "***"
        #print >>sys.stderr, samplename+'temp'
        
        for currfile in os.listdir(tempfile.gettempdir()):
            #
            if currfile.startswith(samplename+'temp'):
                tempfilesover.append(currfile)
        fqfile = sampledata.getfastq(samplename)
        if not os.path.isfile(fqfile):
            missingfqfiles.append(fqfile)
    if len(tempfilesover) > 0:
        for currfile in tempfilesover:
            print >>sys.stderr, tempfile.gettempdir() +"/"+ currfile + " temp bam files exists"
        print >>sys.stderr, "these files must be deleted to proceed"
        sys.exit(1)
    if len(missingfqfiles) > 0:
        print >>sys.stderr, ",".join(missingfqfiles) + " fastq files missing"
        sys.exit(1)
    mapresults = dict()
    multithreaded = True
    if multithreaded:
        mapargs = list()
        print >>sys.stderr, cores
        mappool = Pool(processes=cores)
        mapsamples = list()
        for samplename in samples:
            bamfile = workingdir+samplename
            
            if lazycreate and os.path.isfile(bamfile+".bam"):
                pass

                print >>sys.stderr, "Skipping "+samplename

            else:
                mapargs.append(compressargs(bowtiedb, sampledata.getfastq(samplename),bamfile,scriptdir, trnafile, expname = samplefilename, samplename = samplename, minnontrnasize = minnontrnasize))
                
                
                #mapresults[samplename] = mapreads(bowtiedb, sampledata.getfastq(samplename),bamfile,scriptdir, trnafile,  logfile=logfile, expname = samplefilename)
                mapsamples.append(samplename)
        #results = mappool.map(mapreadspool, mapargs)
        starttime = time.time()
        for currresult in mappool.imap_unordered(mapreadspool, mapargs):
            #print >>sys.stderr, "time "+currresult.samplename+": "+str(time.time() - starttime)
            if currresult.failedrun == True:
                print >>sys.stderr, "Failure to Bowtie2 map"
                #print >>sys.stderr, output[1]
                currresult.printbowtie(logfile)
                sys.exit(1)
            mapresults[currresult.samplename] = currresult
            currresult.printbowtie(logfile)
                
    else:
        for samplename in samples:
            bamfile = workingdir+samplename
            
            if lazycreate and os.path.isfile(bamfile+".bam"):
                pass
                    
                print >>sys.stderr, "Skipping "+samplename
                
            else:
        
                mapresults[samplename] = mapreads(bowtiedb, sampledata.getfastq(samplename),bamfile,scriptdir, trnafile,  logfile=logfile, expname = samplefilename, minnontrnasize = minnontrnasize)

    if lazycreate:
        #here is where I might add stuff to read old files in lazy mode
        pass
    if mapfile is not None and not lazycreate:
        mapinfo = open(mapfile,'w')                
        print >>mapinfo, "\t".join(samples)
        print >>mapinfo, "unmap\t"+"\t".join(str(mapresults[currsample].unmaps) for currsample in samples)
        print >>mapinfo, "single\t"+"\t".join(str(mapresults[currsample].singlemaps) for currsample in samples)
        print >>mapinfo, "multi\t"+"\t".join(str(mapresults[currsample].multimaps) for currsample in samples)
        mapinfo.close()
        
    if trnamapfile is not None and not lazycreate:
        trnamapinfo = open(trnamapfile,'w')      
        
        print >>trnamapinfo, "\t".join(samples)
        print >>trnamapinfo, "multi_nontRNA\t"+"\t".join(str(mapresults[currsample].trnamapinfo.multiplenon) for currsample in samples)
        print >>trnamapinfo, "unique_nontRNA\t"+"\t".join(str(mapresults[currsample].trnamapinfo.singlenon) for currsample in samples)
        print >>trnamapinfo, "multi_amino\t"+"\t".join(str(mapresults[currsample].trnamapinfo.multamino) for currsample in samples)
        print >>trnamapinfo, "unique_amino\t"+"\t".join(str(mapresults[currsample].trnamapinfo.multac) for currsample in samples)
        print >>trnamapinfo, "unique_anticodon\t"+"\t".join(str(mapresults[currsample].trnamapinfo.multtrans) for currsample in samples)
        print >>trnamapinfo, "unique_tRNA\t"+"\t".join(str(mapresults[currsample].trnamapinfo.singletrna) for currsample in samples)
        


        #print >>mapinfo, "total\t"+"\t".join(totalreads[currsample] for currsample in samples)
        trnamapinfo.close()
        
        
        #print >>logfile, "Processing "+samplename +" mappings"
    logfile.close()
Пример #51
0
def _maybe_convert_sets(target_dir, extracted_data):
    extracted_dir = os.path.join(target_dir, extracted_data)
    # override existing CSV with normalized one
    target_csv_template = os.path.join(target_dir, ARCHIVE_DIR_NAME,
                                       ARCHIVE_NAME.replace(".tgz", "_{}.csv"))
    if os.path.isfile(target_csv_template):
        return

    wav_root_dir = os.path.join(extracted_dir)

    # Get audiofile path and transcript for each sentence in tsv
    samples = []
    glob_dir = os.path.join(wav_root_dir, "**/metadata.csv")
    for record in glob(glob_dir, recursive=True):
        if any(map(lambda sk: sk in record, SKIP_LIST)):  # pylint: disable=cell-var-from-loop
            continue
        with open(record, "r") as rec:
            for re in rec.readlines():
                re = re.strip().split("|")
                audio = os.path.join(os.path.dirname(record), "wavs",
                                     re[0] + ".wav")
                transcript = re[2]
                samples.append((audio, transcript))

    counter = get_counter()
    num_samples = len(samples)
    rows = []

    print("Importing WAV files...")
    pool = Pool()
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    for i, processed in enumerate(pool.imap_unordered(one_sample, samples),
                                  start=1):
        counter += processed[0]
        rows += processed[1]
        bar.update(i)
    bar.update(num_samples)
    pool.close()
    pool.join()

    with open(target_csv_template.format("train"),
              "w",
              encoding="utf-8",
              newline="") as train_csv_file:  # 80%
        with open(target_csv_template.format("dev"),
                  "w",
                  encoding="utf-8",
                  newline="") as dev_csv_file:  # 10%
            with open(target_csv_template.format("test"),
                      "w",
                      encoding="utf-8",
                      newline="") as test_csv_file:  # 10%
                train_writer = csv.DictWriter(train_csv_file,
                                              fieldnames=FIELDNAMES)
                train_writer.writeheader()
                dev_writer = csv.DictWriter(dev_csv_file,
                                            fieldnames=FIELDNAMES)
                dev_writer.writeheader()
                test_writer = csv.DictWriter(test_csv_file,
                                             fieldnames=FIELDNAMES)
                test_writer.writeheader()

                for i, item in enumerate(rows):
                    transcript = validate_label(item[2])
                    if not transcript:
                        continue
                    wav_filename = item[0]
                    i_mod = i % 10
                    if i_mod == 0:
                        writer = test_writer
                    elif i_mod == 1:
                        writer = dev_writer
                    else:
                        writer = train_writer
                    writer.writerow(
                        dict(
                            wav_filename=os.path.relpath(
                                wav_filename, extracted_dir),
                            wav_filesize=os.path.getsize(wav_filename),
                            transcript=transcript,
                        ))

    imported_samples = get_imported_samples(counter)
    assert counter["all"] == num_samples
    assert len(rows) == imported_samples

    print_import_report(counter, SAMPLE_RATE, MAX_SECS)
Пример #52
0
def parallel(r, c):
    # map f over range(r) with chunksize c, consuming results as they finish
    pool2 = Pool(processes=8)
    for i in pool2.imap_unordered(f, range(r), chunksize=c):
        print(i)
    pool2.close()
    pool2.join()
Пример #53
0
def filter_by_metadata(ab):
    venues = []
    with gzip.open(ab) as f:
        for i, l in enumerate(f):
            metadata = json.loads(l.strip())
            if article_allowed(metadata):
                venues.append((ab, l))
    return venues


if __name__ == "__main__":
    # Get all of the statistics for venues, also time how long it takes to iterate through all the data
    start = time.time()
    (data_loc / "filtered_metadata").mkdir(exist_ok=True)

    article_bundles = []
    for article_bundle in data_loc.glob(f"metadata/*.gz"):
        article_bundles.append(article_bundle)

    pool = Pool(8)
    venue_frequencies = defaultdict(int)
    for vf in tqdm(pool.imap_unordered(filter_by_metadata, article_bundles),
                   total=len(article_bundles)):
        if not vf:
            continue
        with gzip.open(f"{data_loc}/filtered_metadata/{vf[0][0].name}",
                       'w') as f:
            for l in vf:
                f.write(l[1])
    pool.close()
    pool.join()
Пример #54
0
    def download_license(threads=os.cpu_count(), force=False):
        """
    Downloads license data from spdx.org.

    Lists data from https://spdx.org/licenses/licenses.json, https://spdx.org/licenses/exceptions.json and check if
    the version is already loaded. If the data already exists, simply skip
    else create a new CSV. CSV file names are created as
    <releaseDate>_<version>.csv. For each license, shortname, fullname, text,
    url, deprecated, osi_approved are collected.

    :param threads: Number of CPU to be used for downloading. This is done to speed up the process
    :param force: Bool value if licenses needs to be downloaded forcefully
    :return: File path if success, None otherwise.
    """
        jsonData = request.urlopen(
            'https://spdx.org/licenses/licenses.json').read()
        jsonData = json.loads(jsonData.decode('utf-8'))
        licenses = jsonData.get('licenses')

        jsonData_exceptions = request.urlopen(
            'https://spdx.org/licenses/exceptions.json').read()
        jsonData_exceptions = json.loads(jsonData_exceptions.decode('utf-8'))
        license_exceptions = jsonData_exceptions.get('exceptions')

        version = jsonData.get('licenseListVersion').replace(".", "_")
        releaseDate = jsonData.get('releaseDate')
        if licenses is not None:
            fileName = releaseDate + '_' + version + '.csv'
            dir = os.path.dirname(os.path.abspath(__file__))
            dir = os.path.abspath(dir + "/../../licenses")
            Path(dir).mkdir(exist_ok=True)
            filePath = Path(os.path.abspath(dir + "/" + fileName))
            if filePath.is_file():
                if (force):
                    filePath.unlink()
                else:
                    return str(filePath)
            licenseDataFrame = pd.DataFrame(columns=csvColumns)
            cpuCount = os.cpu_count()
            threads = cpuCount * 2 if threads > cpuCount * 2 else threads
            pool = ThreadPool(threads)
            for row in tqdm(pool.imap_unordered(
                    LicenseDownloader.fetch_exceptional_license,
                    license_exceptions),
                            desc="Exceptions processed",
                            total=len(license_exceptions),
                            unit="exception"):
                licenseDataFrame = pd.concat([licenseDataFrame, row],
                                             sort=False,
                                             ignore_index=True)
            for row in tqdm(pool.imap_unordered(
                    LicenseDownloader.fetch_license, licenses),
                            desc="Licenses processed",
                            total=len(licenses),
                            unit="license"):
                licenseDataFrame = pd.concat([licenseDataFrame, row],
                                             sort=False,
                                             ignore_index=True)

            licenseDataFrame = licenseDataFrame.drop_duplicates(
                subset='shortname')
            licenseDataFrame = licenseDataFrame.sort_values(
                'deprecated').drop_duplicates(subset='fullname', keep='first')
            licenseDataFrame = licenseDataFrame.sort_values(
                'shortname').reset_index(drop=True)
            licenseDataFrame.to_csv(str(filePath),
                                    index=False,
                                    encoding='utf-8')
            return str(filePath)
        else:
            return None
Пример #55
0
def run(parser=None):
    """Access to the "run" interface of an operations module.

    Executing this function within a module will start a command line interface,
    that can be used to execute operations defined within the same module.
    All **top-level unary functions** will be interpreted as executable operation functions.

    For example, if we have a module as such:

    .. code-block:: python

        # operations.py

        def hello(job):
            print('hello', job)

        if __name__ == '__main__':
            import flow
            flow.run()

    Then we can execute the ``hello`` operation for all jobs from the command line like this:

    .. code-block:: bash

        $ python operations.py hello

    .. note::

        You can control the degree of parallelization with the ``--np`` argument.


    For more information, see:

    .. code-block:: bash

        $ python operations.py --help
    """
    if parser is None:
        parser = argparse.ArgumentParser()

    parser.add_argument('operation',
                        type=str,
                        choices=list(_get_operations()),
                        help="The operation to execute.")
    parser.add_argument(
        'jobid',
        type=str,
        nargs='*',
        help="The job ids, as registered in the signac project. "
        "Omit to default to all statepoints.")
    parser.add_argument(
        '--np',
        type=int,
        default=1,
        help="Specify the number of cores to parallelize to (default=1) or 0 "
        "to parallelize on as many cores as there are available.")
    parser.add_argument(
        '-t',
        '--timeout',
        type=int,
        help="A timeout in seconds after which the parallel execution "
        "of operations is canceled.")
    parser.add_argument('--progress',
                        action='store_true',
                        help="Display a progress bar during execution.")
    args = parser.parse_args()

    project = get_project()

    def _open_job_by_id(_id):
        try:
            return project.open_job(id=_id)
        except KeyError:
            msg = "Did not find job corresponding to id '{}'.".format(_id)
            raise KeyError(msg)
        except LookupError:
            raise LookupError("Multiple matches for id '{}'.".format(_id))

    if len(args.jobid):
        try:
            jobs = [_open_job_by_id(jid) for jid in args.jobid]
        except (KeyError, LookupError) as e:
            print(e, file=sys.stderr)
            sys.exit(1)
    else:
        jobs = project

    module = inspect.getmodule(inspect.currentframe().f_back)
    try:
        operation_func = getattr(module, args.operation)
    except AttributeError:
        raise KeyError("Unknown operation '{}'.".format(args.operation))

    if getattr(operation_func, '_flow_cmd', False):

        def operation(job):
            cmd = operation_func(job).format(job=job)
            fork(cmd=cmd, timeout=args.timeout)
    else:
        operation = operation_func

    # Serial execution
    if args.np == 1 or len(jobs) < 2:
        if args.timeout is not None:
            logger.warning("A timeout has no effect in serial execution!")
        for job in tqdm(jobs) if args.progress else jobs:
            operation(job)

    # Parallel execution
    elif six.PY2:
        # Due to Python 2.7 issue #8296 (http://bugs.python.org/issue8296) we
        # always need to provide a timeout to avoid issues with "hanging"
        # processing pools.
        timeout = sys.maxint if args.timeout is None else args.timeout
        pool = Pool(args.np)
        result = pool.imap_unordered(operation, jobs)
        for _ in tqdm(jobs) if args.progress else jobs:
            result.next(timeout)
    else:
        with Pool(args.np) as pool:
            result = pool.imap_unordered(operation, jobs)
            for _ in tqdm(jobs) if args.progress else jobs:
                result.next(args.timeout)
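
The parallel branch above bounds each operation with IMapUnorderedIterator.next(timeout). A self-contained sketch of that timeout pattern:

# Hedged sketch of the imap_unordered + next(timeout) pattern used above.
import time
from multiprocessing import Pool, TimeoutError

def slow(x):
    time.sleep(x)
    return x

if __name__ == "__main__":
    with Pool(2) as pool:
        results = pool.imap_unordered(slow, [0.1, 0.2, 5.0])
        try:
            for _ in range(3):
                print(results.next(timeout=1))
        except TimeoutError:
            print("an operation exceeded the timeout")
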
Пример #56
0
        return port, True
    except (socket.timeout, socket.error):
        return port, False


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: scanner.py <target> <maxport>")
        sys.exit(1)
    if len(sys.argv) == 3:
        maxport = int(sys.argv[2])
    else:
        maxport = 1025

    target = sys.argv[1]

    # Resolve Host to IP, if necessary.
    if not target.replace(".", "").isdigit():
        target = host_to_ip(target)
    print("[+] Scanning", target)

    ports = range(1, maxport + 1)
    scanlist = [(target, port) for port in ports]

    # Use 512 workers. Not sure how insane that is but it seems to work fine.
    pool = Pool(512)

    for port, status in pool.imap_unordered(scan, scanlist):
        if status:
            print("[!]", port, "is open")
    print("[+] Finished scanning", target)
Example #57
        text_dict = ljspeech(path)
    if args.dataset == 'databaker':
        text_dict = databaker(path)

    n_workers = max(1, args.num_workers)

    simple_table([('Sample Rate', hp.sample_rate), ('Bit Depth', hp.bits),
                  ('Mu Law', hp.mu_law), ('Hop Length', hp.hop_length),
                  ('CPU Usage', f'{n_workers}/{cpu_count()}'),
                  ('Num Validation', hp.n_val)])

    pool = Pool(processes=n_workers)
    dataset = []
    cleaned_texts = []
    for i, (item_id, length, cleaned_text) in enumerate(
            pool.imap_unordered(process_wav, wav_files), 1):
        if item_id in text_dict:
            dataset += [(item_id, length)]
            cleaned_texts += [(item_id, cleaned_text)]
        bar = progbar(i, len(wav_files))
        message = f'{bar} {i}/{len(wav_files)} '
        stream(message)

    random = Random(hp.seed)
    random.shuffle(dataset)
    train_dataset = dataset[hp.n_val:]
    val_dataset = dataset[:hp.n_val]
    # sort val dataset longest to shortest
    val_dataset.sort(key=lambda d: -d[1])

    for id, text in cleaned_texts:
Example #58
from multiprocessing import Pool, TimeoutError
import time
import os

def f(x):
    return x*x

if __name__ == '__main__':
    pool = Pool(processes=4)              # start 4 worker processes

    # print "[0, 1, 4,..., 81]"
    print pool.map(f, range(10))

    # print same numbers in arbitrary order
    for i in pool.imap_unordered(f, range(10)):
        print i,
    print

    # evaluate "f(20)" asynchronously
    res = pool.apply_async(f, (20,))      # runs in *only* one process
    print res.get(timeout=1)              # prints "400"

    # evaluate "os.getpid()" asynchronously
    res = pool.apply_async(os.getpid, ()) # runs in *only* one process
    print res.get(timeout=1)              # prints the PID of that process

    # launching multiple evaluations asynchronously *may* use more processes
    multiple_results = [pool.apply_async(os.getpid, ()) for i in range(4)]
    print [res.get(timeout=1) for res in multiple_results]

    # make a single worker sleep for 10 secs
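    # This snippet mirrors the standard-library multiprocessing docs example,
    # so a likely continuation of the dangling comment above is the timeout
    # demonstration, which is also what the otherwise-unused `time` and
    # `TimeoutError` imports are for.
    res = pool.apply_async(time.sleep, (10,))
    try:
        print res.get(timeout=1)          # the sleep takes 10s, so this times out
    except TimeoutError:
        print "We lacked patience and got a multiprocessing.TimeoutError"
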
Example #59
    background.save(f'{savedir}/{i}.png')


pool = Pool(args.thread_count)
count = len(font_list) * args.images

font_names = []
font_vers = []
for key, value in font_list.items():
    font_names.extend([key]*args.images)
    key_len = len(value)
    font_vers.extend(value * (args.images // key_len))
    font_vers.extend(value[:args.images % key_len])

for _ in tqdm(
    pool.imap_unordered(
        generate_from_tuple,
        zip(
            [ i for i in range(count) ],
            [ args.font_folder ] * count,
            font_names,
            font_vers,
            [ random_string_from_dict() for _ in range(count) ],
            [ args.output_dir ] * count
        )
    ),
    total=count
): pass

# All results have been consumed, so shut the pool down cleanly.
pool.close()
pool.join()
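
The generation snippet above is likewise truncated: generate_from_tuple is only referenced, never shown. Given the 6-tuples built by the zip(...) call (index, font folder, font name, font version, random text, output directory) and the dangling background.save(...) line at the top, a hypothetical PIL-based worker could look roughly like this; every concrete choice (image size, font size, colours) is an assumption:

# Hypothetical sketch of the worker consumed by imap_unordered above.
import os

from PIL import Image, ImageDraw, ImageFont


def generate_from_tuple(task):
    i, font_folder, font_name, font_ver, text, output_dir = task
    font_path = os.path.join(font_folder, font_name, font_ver)
    font = ImageFont.truetype(font_path, size=32)

    # Render the random string onto a plain background and save it, mirroring
    # the truncated background.save(...) line at the top of the snippet.
    background = Image.new('RGB', (512, 64), 'white')
    draw = ImageDraw.Draw(background)
    draw.text((8, 8), text, font=font, fill='black')
    background.save(f'{output_dir}/{i}.png')
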
Example #60
def _extract_features_parallel_per_sample(kind_to_df_map,
                                          column_id, column_value,
                                          default_fc_parameters,
                                          kind_to_fc_parameters=None,
                                          chunksize=defaults.CHUNKSIZE,
                                          n_processes=defaults.N_PROCESSES, show_warnings=defaults.SHOW_WARNINGS,
                                          disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                                          impute_function=defaults.IMPUTE_FUNCTION):
    """
    Parallelize the feature extraction per kind and per sample.

    As splitting the dataframes per kind along column_id is quite costly, we settled on an async map in this
    function. The result objects are temporarily stored in a FIFO queue from which they can be retrieved in order
    of submission.

    :param kind_to_df_map: The time series to compute the features for in our internal format
    :type kind_to_df_map: dict of pandas.DataFrame

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See :class:`ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value) will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param chunksize: The size of one chunk for the parallelisation
    :type chunksize: None or int

    :param n_processes: The number of processes to use for parallelisation.
    :type n_processes: int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param impute_function: None if no imputing should happen, or the function to call for imputing.
    :type impute_function: None or function

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    partial_extract_features_for_one_time_series = partial(_extract_features_for_one_time_series,
                                                           column_id=column_id,
                                                           column_value=column_value,
                                                           default_fc_parameters=default_fc_parameters,
                                                           kind_to_fc_parameters=kind_to_fc_parameters,
                                                           show_warnings=show_warnings)
    pool = Pool(n_processes)
    total_number_of_expected_results = 0

    # Submit map jobs per kind per sample
    results_fifo = Queue()

    for kind, df_kind in kind_to_df_map.items():
        df_grouped_by_id = df_kind.groupby(column_id)

        total_number_of_expected_results += len(df_grouped_by_id)

        if not chunksize:
            chunksize = _calculate_best_chunksize(df_grouped_by_id, n_processes)

        results_fifo.put(
            pool.imap_unordered(
                partial_extract_features_for_one_time_series,
                [(kind, df_group) for _, df_group in df_grouped_by_id],
                chunksize=chunksize
            )
        )

    pool.close()

    # Wait for the jobs to complete and concatenate the partial results
    dfs_per_kind = []

    # Do this all with a progress bar
    with tqdm(total=total_number_of_expected_results, desc="Feature Extraction",
              disable=disable_progressbar) as progress_bar:
        # We need to know when a new result arrives, so we wrap the map results
        # in another iterable that updates the progress bar each time a result
        # is yielded.
        def iterable_with_tqdm_update(queue, progress_bar):
            for element in queue:
                progress_bar.update(1)
                yield element

        result = pd.DataFrame()
        while not results_fifo.empty():
            map_result = results_fifo.get()
            dfs_kind = iterable_with_tqdm_update(map_result, progress_bar)
            df_tmp = pd.concat(dfs_kind, axis=0).astype(np.float64)

            # Impute the result if requested
            if impute_function is not None:
                impute_function(df_tmp)

            result = pd.concat([result, df_tmp], axis=1).astype(np.float64)

    pool.join()
    return result
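
The pattern used above — one imap_unordered call per kind, with the resulting iterators parked in a FIFO queue and drained under a single progress bar — also works in isolation. A stripped-down sketch of just that pattern, with a trivial worker standing in for the per-sample feature extraction:

# Stand-alone sketch of the queue-of-imap-iterators pattern used above; the
# worker and the data are placeholders, not the real extraction internals.
from multiprocessing import Pool
from queue import Queue

from tqdm import tqdm


def worker(chunk):
    # Placeholder for the real per-sample feature extraction.
    return sum(chunk)


if __name__ == '__main__':
    groups = {'kind_a': [[1, 2], [3, 4]], 'kind_b': [[5, 6], [7, 8], [9]]}

    pool = Pool(2)
    results_fifo = Queue()
    total_expected = 0

    # Submit one async map per kind and keep the iterators in submission order.
    for kind, chunks in groups.items():
        total_expected += len(chunks)
        results_fifo.put(pool.imap_unordered(worker, chunks, chunksize=1))
    pool.close()

    # Drain the queue, driving a single progress bar across all kinds.
    collected = []
    with tqdm(total=total_expected, desc="Toy extraction") as progress_bar:
        while not results_fifo.empty():
            for value in results_fifo.get():
                collected.append(value)
                progress_bar.update(1)

    pool.join()
    print(collected)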