import itertools
import multiprocessing


def _array_parallel(fn, cls, genelist, chunksize=250, processes=1, **kwargs):
    """
    Returns an array of genes in `genelist`, using `bins` bins.

    `genelist` is a list of pybedtools.Interval objects.

    Splits `genelist` into pieces of size `chunksize`, creating an array
    for each chunk and merging the results.

    A chunksize of 25-100 seems to work well on 8 cores.
    """
    pool = multiprocessing.Pool(processes)
    chunks = list(chunker(genelist, chunksize))

    # pool.map can only pass a single argument to the mapped function, so you
    # need this trick for passing multiple arguments; idea from
    # http://stackoverflow.com/questions/5442910/
    # python-multiprocessing-pool-map-for-multiple-arguments
    results = pool.map(
        func=_array_star,
        iterable=itertools.izip(
            itertools.repeat(fn),
            itertools.repeat(cls),
            chunks,
            itertools.repeat(kwargs)))
    pool.close()
    pool.join()
    return results
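# The functions in this section all depend on a `chunker(seq, size)` helper
# that is not shown here. A minimal sketch of such a helper (an assumption
# about its behavior, not the actual implementation used above) could be:
def chunker(seq, size):
    # Yield successive slices of `seq`, each at most `size` items long.
    for start in range(0, len(seq), size):
        yield seq[start:start + size]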
def _count_array_parallel(fn, cls, genelist, chunksize=250, processes=1,
                          **kwargs):
    pool = multiprocessing.Pool(processes)
    chunks = list(chunker(genelist, chunksize))
    results = pool.map(
        func=_count_array_star,
        iterable=itertools.izip(
            itertools.repeat(fn),
            itertools.repeat(cls),
            chunks,
            itertools.repeat(kwargs)))
    pool.close()
    pool.join()
    return results
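# `_array_star` and `_count_array_star` above are the star-unpacking wrappers
# described in the pool.map comment: pool.map hands the worker a single tuple,
# and the wrapper expands it into separate arguments. They are not defined in
# this section; a plausible sketch, assuming hypothetical single-chunk workers
# named `_array` and `_count_array`, is:
def _array_star(args):
    # Unpack the (fn, cls, genelist, kwargs) tuple built by izip/repeat.
    fn, cls, genelist, kwargs = args
    return _array(fn, cls, genelist, **kwargs)


def _count_array_star(args):
    # Same unpacking pattern for the count-array variant.
    fn, cls, genelist, kwargs = args
    return _count_array(fn, cls, genelist, **kwargs)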
def retrieve_pages(self, pagetitles, data, chunk_size=50, delay=0.5):
    for i, chunk in enumerate(chunker(pagetitles, chunk_size)):
        show_progress(
            i * chunk_size + len(chunk),
            len(pagetitles),
            "Retrieving chunk '{}'-'{}'".format(chunk[0], chunk[-1]))
        data["titles"] = "|".join(chunk)
        response = self.post(self.api_location, data=data)
        yield response
        sleep(delay)
    show_progress(len(pagetitles), len(pagetitles), "Retrieved chunks.", True)
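# `show_progress(done, total, message, finished=False)` is a console progress
# helper used above but not defined in this section. A minimal, self-contained
# sketch consistent with how it is called (an assumption, not the project's
# own implementation):
import sys


def show_progress(done, total, message, finished=False):
    # Rewrite the current line in place; emit a final newline when finished.
    sys.stdout.write("\r[{}/{}] {}".format(done, total, message))
    if finished:
        sys.stdout.write("\n")
    sys.stdout.flush()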
def getWikidataItems(self, allItems):
    self.fetchWikipediaLanguages()
    self.wdSite = wikipedia('www', 'wikidata')
    filename = config['filename-api']
    if os.path.isfile(filename):
        data = eval(open(filename, 'r', encoding='utf-8').read())
        self.oneBatch(data, False)
    else:
        for group in chunker(allItems, 50):
            self.oneBatch(group)
        saveToFile(filename,
                   json.dumps(self.itemData_FULL, ensure_ascii=False))
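# `saveToFile(filename, text)` is assumed to be a small convenience wrapper;
# a minimal sketch consistent with the call above (hypothetical, not the
# project's own helper):
import io


def saveToFile(filename, text):
    # Write `text` to `filename` as UTF-8, replacing any existing content.
    with io.open(filename, 'w', encoding='utf-8') as handle:
        handle.write(text)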
def get_item_data(self, wd_items, raw=False,
                  attributes=['sitelinks', 'claims'], claim_props=[]):
    retMap = {}
    for batch in chunker(wd_items, 49):
        res = self.site('wbgetentities', ids=batch,
                        props='|'.join(attributes))
        for entity in res.get('entities'):
            data = res.get('entities').get(entity)
            tmp_data = {}
            for attr in attributes:
                if attr == 'sitelinks':
                    sitelinks = {f: data.get(attr).get(f).get('title')
                                 for f in data.get(attr)}
                    data.update({'sitelinks': sitelinks})
                if attr == 'claims':
                    claims = clean_api(data.get(attr))
                    data.update({'claims': claims})
            # print(data.get('type'))
            # parsed_data = clean_api(data) if raw and 'claims' else data
            retMap.update({entity: data})
    return retMap