def send_notification_batch(self, notifications):
    total = []
    # encode notifications in chunks of 10
    for entries_chunk in chunk_list(notifications, 10):
        total.append(encode_notification_for_sqs(entries_chunk))
    # SQS send_message_batch accepts at most 10 entries per call
    for chunk_of_total in chunk_list(total, 10):
        self.client.send_message_batch(
            QueueUrl=self.queue_url,
            Entries=[
                self.create_notification_message(e) for e in chunk_of_total
            ])
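# None of the snippets in this collection define chunk_list themselves.
# Below is a minimal sketch of what such a helper presumably looks like
# (an assumption for illustration, not the actual implementation from any
# of these projects): split a sequence into consecutive chunks of at most
# n items. A generator like this works with every usage shown here, since
# the callers only iterate over the result or wrap it in list().
def chunk_list(seq, n):
    """Yield successive chunks of at most n items from seq."""
    for i in range(0, len(seq), n):
        yield seq[i:i + n]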
def start_scraper(url, q_name, parsers, downloaders):
    task_q = qhandler.get_task_q()
    for sections in utils.chunk_list(utils.SECTIONS, parsers):
        webm_q = qhandler.create_channel(queue_name=q_name)
        parser.start_thread(task_q, webm_q, url, sections)
    for _ in range(downloaders):
        channel = qhandler.create_channel(queue_name=q_name)
        downloader.start_thread(channel, q_name)
def ncbigene_make():
    IDS_FILE = 'gene-subset-ids.txt'
    with open(IDS_FILE, 'rt') as f:  # this came from neuroNER
        ids = [l.split(':')[1].strip() for l in f.readlines()]

    #url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&retmax=5000&db=gene&id='
    #for id_ in ids:
        #data = requests.get(url + id_).json()['result'][id_]

    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    data = {
        'db': 'gene',
        'retmode': 'json',
        'retmax': 5000,
        'id': None,
    }
    chunks = []
    for i, idset in enumerate(chunk_list(ids, 100)):
        print(i, len(idset))
        data['id'] = ','.join(idset)
        resp = requests.post(url, data=data).json()
        chunks.append(resp)

    base = chunks[0]['result']
    uids = base['uids']
    for more in chunks[1:]:
        data = more['result']
        uids.extend(data['uids'])
        base.update(data)
    #base['uids'] = uids  # i mean... its just the keys
    base.pop('uids')

    prefixes = {
        'ilx': 'http://uri.interlex.org/base/',
        'OBOANN': 'http://ontology.neuinfo.org/NIF/Backend/OBO_annotation_properties.owl#',  # FIXME needs to die a swift death
        'NCBIGene': 'http://www.ncbi.nlm.nih.gov/gene/',
        'NCBITaxon': 'http://purl.obolibrary.org/obo/NCBITaxon_',
    }
    ng = makeGraph('ncbigeneslim', prefixes)
    for k, v in base.items():
        #if k != 'uids':
        ncbi(v, ng)

    ontid = 'http://ontology.neuinfo.org/NIF/ttl/generated/ncbigeneslim.ttl'
    ng.add_node(ontid, rdflib.RDF.type, rdflib.OWL.Ontology)
    ng.add_node(ontid, rdflib.RDFS.label, 'NIF NCBI Gene subset')
    ng.add_node(ontid, rdflib.RDFS.comment,
                'This subset is automatically generated from the NCBI Gene database on a subset of terms listed in %s.' % IDS_FILE)
    ng.add_node(ontid, rdflib.OWL.versionInfo, date.isoformat(date.today()))
    ng.write()
def ncbigene_make():
    IDS_FILE = 'resources/gene-subset-ids.txt'
    with open(IDS_FILE, 'rt') as f:  # this came from neuroNER
        ids = [l.split(':')[1].strip() for l in f.readlines()]

    #url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&retmax=5000&db=gene&id='
    #for id_ in ids:
        #data = requests.get(url + id_).json()['result'][id_]

    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    data = {
        'db': 'gene',
        'retmode': 'json',
        'retmax': 5000,
        'id': None,
    }
    chunks = []
    for i, idset in enumerate(chunk_list(ids, 100)):
        print(i, len(idset))
        data['id'] = ','.join(idset)
        resp = requests.post(url, data=data).json()
        chunks.append(resp)

    base = chunks[0]['result']
    uids = base['uids']
    for more in chunks[1:]:
        data = more['result']
        uids.extend(data['uids'])
        base.update(data)
    #base['uids'] = uids  # i mean... its just the keys
    base.pop('uids')

    ng = createOntology(
        'ncbigeneslim',
        'NIF NCBI Gene subset',
        makePrefixes('ILXREPLACE', 'ilx', 'OBOANN', 'NCBIGene', 'NCBITaxon', 'skos', 'owl'),
        'ncbigeneslim',
        'This subset is automatically generated from the NCBI Gene database on a subset of terms listed in %s.' % IDS_FILE,
        remote_base='http://ontology.neuinfo.org/NIF/')
    for k, v in base.items():
        #if k != 'uids':
        ncbi(v, ng)
    ng.write()
def send_runcommand(ssm_client, **kwargs):  # noqa
    """Take a boto3 SSM client and some kwargs, split the list of instances
    into groups of 50, send RunCommand, and return a list of the responses.
    """
    doc = 'AWS-RunShellScript'
    response = []
    chunks = chunk_list(kwargs['instances'], 50)  # max 50 instances per call
    for chunk in chunks:  # iterate over chunks of 50 instances
        response.append(ssm_client.send_command(
            DocumentName=doc,
            InstanceIds=chunk,
            Parameters={  # value must be a list
                'commands': [
                    "#!/bin/bash",
                    'bucket={bucket}'.format(**kwargs),
                    'instance_id=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)',
                    'echo $instance_id $bucket',
                    'if type -t rpm >/dev/null 2>&1;then',
                    (''' pkg_list=$(rpm -qa --queryformat '"%-30{NAME}": '''
                     '''"%10{VERSION}-%20{RELEASE}",' | sed -e 's~,$~~' | tr -d ' ')'''),
                    ' echo "{${pkg_list}}" | \\',
                    (''' python -c 'import json, sys; print(json.dumps('''
                     '''json.loads(sys.stdin.read()), indent=4))' > pkg_list.json'''),
                    ' echo Retrieved package list from rpm',
                    'fi',
                    'if type -t dpkg >/dev/null 2>&1;then',
                    ' echo "Found debian"',
                    'fi',
                    'test -e pkg_list.json || echo unable to find pkg_list.json',
                    'aws s3 cp pkg_list.json s3://$bucket/patching-state/%s/${instance_id}.json' % (
                        kwargs['delta_date']),
                    'echo Completed Export',
                ],
            },
            OutputS3BucketName=kwargs['bucket'],
            OutputS3KeyPrefix='command-output',
            TimeoutSeconds=kwargs['timeout'],
            MaxErrors='10',
        )['Command'])  # append the Command portion of each response
    return response
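# Hedged usage sketch for send_runcommand above: the instance id, bucket name,
# delta_date, and timeout are hypothetical placeholders that only illustrate
# which kwargs the function reads (instances, bucket, delta_date, timeout).
import boto3

ssm = boto3.client('ssm')
commands = send_runcommand(
    ssm,
    instances=['i-0123456789abcdef0'],  # hypothetical instance id
    bucket='my-patching-bucket',        # hypothetical S3 bucket for output
    delta_date='2021-01-01',            # used in the S3 key for pkg_list.json
    timeout=600,                        # TimeoutSeconds for SendCommand
)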
def forward(self, inp_spectrum, inp_wvlens, part_covered=True, tol=0.5,
            pad=False, ng4=False, invert=True, snr=True, dc=True, smear=True,
            run_with_binned=True, return_binned=False, run_specs=None,
            run_specs_inner=None, *args, **kwargs):
    """
    :param inp_spectrum: (batches, channels, spectrum); computations are
        threaded along batches, and all spectra in a channel are computed in
        vectorized form. This means they should not differ in their wavelength
        support, but they may differ in intensity. If only one batch is
        supplied, it is broadcast to all supplied inp_wvlens.
    :param inp_wvlens: wavelength support for each batch. If only one
        inp_wvlens is supplied, it is broadcast to all batches.
    :param part_covered:
    :param tol:
    :param args:
    :param kwargs:
    :return:
    """
    binned = run_with_binned
    inp_wvlens = np.atleast_2d(inp_wvlens)

    # reshape input spectrum
    if len(inp_spectrum.shape) == 2:
        # we assume (batch, wvl)
        inp_spectrum = inp_spectrum[:, None, None, ...]
    elif len(inp_spectrum.shape) == 3:
        # we assume (batch, channel, wvl)
        inp_spectrum = inp_spectrum[:, :, None, ...]
    elif len(inp_spectrum.shape) == 4:
        # we assume (batch, channel, pix, wvl)
        pass
    elif len(inp_spectrum.shape) == 5:
        # we assume (batch, channel, band, xtrack, wvl)
        inp_spectrum = inp_spectrum.reshape(inp_spectrum.shape[0],
                                            inp_spectrum.shape[1], -1,
                                            inp_spectrum.shape[-1])
    else:
        raise Exception('Input spectrum has wrong shape.')

    if self.get('res', binned) is not None:
        warnings.warn(
            'WARNING: calculates convolution at different resolutions.')

    if not return_binned and 'unbinned' not in self.params:
        raise Exception('Unbinned params are not available.')

    assert self.check_srfs_initialized(binned=binned)
    assert self.check_inp_spectrum_consistency(inp_spectrum, inp_wvlens,
                                               binned=binned)

    # Determine how many batches per job and prepare run_specs
    if run_specs is None:
        run_specs = {}
    if 'batches_per_job' not in run_specs:
        batches_per_job = 1000
    else:
        batches_per_job = run_specs['batches_per_job']
        run_specs = {k: v for k, v in run_specs.items()
                     if k != 'batches_per_job'}

    if run_specs_inner is None:
        run_specs_inner = dict(joblib=False)

    # broadcast: if only one inp_wvlens is supplied, assume it is the same
    # for all inp_spectra in the batch
    if len(inp_wvlens) == 1 and len(inp_spectrum) > 1:
        inp_wvlens = [inp_wvlens[0]] * len(inp_spectrum)

    # broadcast: if only one inp_spectrum is supplied, assume it is the same
    # for all inp_wvlens
    if len(inp_wvlens) > 1 and len(inp_spectrum) == 1:
        inp_spectrum = [inp_spectrum[0]] * len(inp_wvlens)

    # define jobs
    job_inp_spectra = chunk_list(inp_spectrum, batches_per_job)
    job_inp_wvls = chunk_list(inp_wvlens, batches_per_job)
    jobs = [
        partial(self._forward, inp_spectrum=inp_s, inp_wvlens=inp_w,
                binned=binned, part_covered=part_covered, tol=tol, pad=pad,
                ng4=ng4, invert=invert, snr=snr, dc=dc, smear=smear,
                return_binned=return_binned, run_specs=run_specs_inner,
                *args, **kwargs)
        for inp_s, inp_w in zip(job_inp_spectra, job_inp_wvls)
    ]

    # flatten out the job dimension so that we have (batch, channel, band, xdir)
    res, illu_bands = zip(*run_jobs(jobs, **run_specs))
    res = list(itertools.chain(*res))
    illu_bands = list(itertools.chain(*illu_bands))

    return res, illu_bands
print(properties)

def furl(url):
    url = url.replace('[', '-5B')
    url = url.replace(']', '-5D')
    url = url.replace('?', '-3F')
    url = url.replace('=', '%3D')
    return url

url_prefix = 'http://neurolex.org/wiki/Special:Ask/[[Category:Entity]]/'
url_suffix = '/mainlabel=Categories/format=csv/sep=,/offset={}/limit={}'
results = []
result_step = 2500  # see https://www.semantic-mediawiki.org/wiki/Help:Configuration#Query_settings
for props in chunk_list(properties, 10):  # 20 is too long :/ may be fixable via $smwgQMaxSize, which defaults to 12
    all_rows = []
    for start in range(0, 30001, result_step):  # the offset limit is fixed via $smwgQMaxLimit in SMW_Settings.php
        url = url_prefix + '/?'.join(props) + url_suffix.format(start, result_step)  # crazy stuff happens when you leave out the ?
        try:
            data = requests.get(furl(url))
        except Exception:
            print('FAILED on URL =', furl(url))
            #embed()
            # data is still bound from the previous iteration, so this will just duplicate the previous block
        reader = csv.reader(data.text.splitlines())
        rows = [r for r in reader]
        all_rows.extend(rows)
    results.append(all_rows)
def furl(url):
    url = url.replace('[', '-5B')
    url = url.replace(']', '-5D')
    url = url.replace('?', '-3F')
    url = url.replace('=', '%3D')
    return url

url_prefix = 'http://neurolex.org/wiki/Special:Ask/[[Category:Entity]]/'
url_suffix = '/mainlabel=Categories/format=csv/sep=,/offset={}/limit={}'
results = []
result_step = 2500  # see https://www.semantic-mediawiki.org/wiki/Help:Configuration#Query_settings
for props in chunk_list(
        properties, 10
):  # 20 is too long :/ may be fixable via $smwgQMaxSize, which defaults to 12
    all_rows = []
    for start in range(
            0, 30001, result_step
    ):  # the offset limit is fixed via $smwgQMaxLimit in SMW_Settings.php
        url = url_prefix + '/?'.join(props) + url_suffix.format(
            start, result_step)  # crazy stuff happens when you leave out the ?
        try:
            data = requests.get(furl(url))
        except Exception:
            print('FAILED on URL =', furl(url))
            #embed()
            # data is still bound from the previous iteration, so this will just duplicate the previous block
        reader = csv.reader(data.text.splitlines())
        rows = [r for r in reader]
def run_detection(self, input_path, generate_bbox_images=True, recursive=True,
                  n_cores=0, results=None, checkpoint_path=None,
                  checkpoint_frequency=-1, electron=False):
    image_file_names = find_images(input_path, recursive=recursive)
    print(len(image_file_names))
    #flash(len(image_file_names))

    if results is None:
        results = []

    already_processed = set([i['file'] for i in results])

    gpu_available = True if tf.config.list_physical_devices('GPU') else False

    if n_cores > 1 and gpu_available:
        logging.warning('Multiple cores requested, but a GPU is available; '
                        'parallelization across GPUs is not currently '
                        'supported, defaulting to one GPU')

    # If we're not using multiprocessing...
    if n_cores <= 1 or gpu_available:
        count = 0  # does not count those already processed
        # Note: stylising the bar with custom characters breaks in Electron; need to investigate
        print("we're in")
        #flash('innnnn')
        with click.progressbar(length=len(image_file_names),
                               label='Processing Images', show_pos=True,
                               show_eta=True, show_percent=True,
                               info_sep='|') as bar:
            for im_file in image_file_names:
                # Will not add additional entries not in the starter checkpoint
                if im_file in already_processed:
                    logging.info(f'Bypassing already processed image: {im_file}')
                    continue
                count += 1

                result = self.__process_image(im_file, generate_bbox_images)
                results.append(result)
                bar.update(1)

                # this is for megadetector-gui usage
                if electron:
                    print(bar.format_progress_line(), flush=True)

                # checkpoint
                if checkpoint_frequency != -1 and count % checkpoint_frequency == 0:
                    logging.info(f'Writing a new checkpoint after having '
                                 f'processed {count} images since last restart')
                    with open(checkpoint_path, 'w') as f:
                        json.dump({'images': results}, f)
    else:
        # when using multiprocessing, let the workers load the model
        logging.info(f'Creating pool with {n_cores} cores')

        if len(already_processed) > 0:
            logging.warning('When using multiprocessing, all images are reprocessed')

        pool = workerpool(n_cores)

        image_batches = list(chunk_list(image_file_names, n_cores))
        results = pool.map(partial(self.__process_images, image_batches),
                           image_batches, generate_bbox_images)
        results = list(itertools.chain.from_iterable(results))

    self.save(results)
    return results
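# Hedged usage sketch for run_detection above: `Detector` is a hypothetical
# name for whatever class defines the method, and the paths and checkpoint
# settings are placeholders; only the parameter names come from the snippet.
detector = Detector()  # hypothetical constructor
detections = detector.run_detection(
    'images/',                          # input_path scanned by find_images
    recursive=True,
    n_cores=1,                          # <=1 (or a GPU present) keeps the single-process path
    checkpoint_path='checkpoint.json',  # where intermediate results are dumped
    checkpoint_frequency=100,           # write a checkpoint every 100 new images
)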