def chunks(n, join=None, *, seq):
    if join is None:
        yield from fy.chunks(n, seq)
    else:
        if isinstance(join, str):
            join = join.join
        for item in fy.chunks(n, seq):
            yield join(item)
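
# A minimal usage sketch of the wrapper above, not part of the original snippet.
# It assumes `fy` is `import funcy as fy`; the sample sequences are invented.
import funcy as fy

list(chunks(3, seq=[0, 1, 2, 3, 4, 5, 6]))   # -> [[0, 1, 2], [3, 4, 5], [6]]
list(chunks(2, "-", seq=["a", "b", "c"]))     # -> ['a-b', 'c']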
def test_input_batch_generator(self):
    # Throw in a few empty sentences
    tokenized_sentences = [["abab", "aaaa", "b", "ab", "."], [], []]
    batch_size = 2
    # We're expecting batches of size batch_size, in the order in which sentences are provided
    sentence_index_batches = funcy.chunks(batch_size, range(len(tokenized_sentences)))
    expected_generated_inputs = []
    # Compute what the generated input should be
    for sentence_index_batch in sentence_index_batches:
        tokenized_sentence_batch = [
            tokenized_sentences[index] for index in sentence_index_batch
        ]
        inputs_batch = self.input_encoder.prepare_inputs_from_pretokenized(
            tokenized_sentence_batch)
        expected_generated_inputs.append(inputs_batch)
    predict_generator = self.input_encoder.input_batch_generator(
        tokenized_sentences, batch_size)
    # Verify the output of the generator
    for expected_inputs, inputs in zip(expected_generated_inputs, predict_generator):
        for key in expected_inputs:
            np.testing.assert_equal(expected_inputs[key], inputs[key])

    # Now test with only empty sentences
    tokenized_sentences = [[]]
    batch_size = 4
    # We're expecting batches of size batch_size, in the order in which sentences are provided
    sentence_index_batches = funcy.chunks(batch_size, range(len(tokenized_sentences)))
    expected_generated_inputs = []
    # Compute what the generated input should be
    for sentence_index_batch in sentence_index_batches:
        tokenized_sentence_batch = [
            tokenized_sentences[index] for index in sentence_index_batch
        ]
        inputs_batch = self.input_encoder.prepare_inputs_from_pretokenized(
            tokenized_sentence_batch)
        expected_generated_inputs.append(inputs_batch)
    predict_generator = self.input_encoder.input_batch_generator(
        tokenized_sentences, batch_size)
    # Verify the output of the generator
    for expected_inputs, inputs in zip(expected_generated_inputs, predict_generator):
        for key in expected_inputs:
            np.testing.assert_equal(expected_inputs[key], inputs[key])
def generate_random_images(
        tmp_directory, video_file, size=None, aspect_ratio=(16, 9), **kwargs):
    """Generate uniformly distributed random snapshots from a video."""
    from moviepy import editor

    clip = editor.VideoFileClip(video_file)
    if int(clip.duration) < 1:
        return 0
    # Hard-coded steps of snapshot smoothness
    if clip.duration < 30:
        num_of_chunks = 3
    elif clip.duration < 60:
        num_of_chunks = 5
    elif clip.duration < 60 * 5:
        num_of_chunks = 20
    elif clip.duration < 60 * 10:
        num_of_chunks = 30
    else:
        num_of_chunks = 50
    chunk_size = int(clip.duration // num_of_chunks)
    for i, video_chunk in enumerate(chunks(chunk_size, range(0, int(clip.duration)))):
        random_frame_time = random.uniform(first(video_chunk), last(video_chunk))
        img = Image.fromarray(clip.get_frame(random_frame_time))
        if size:
            img = img_resize(img, size=size, aspect_ratio=aspect_ratio)
        file_name = f'{i}.{kwargs.get("output_ext", "png")}'
        img_save(img, str(tmp_directory / 'random' / file_name))
    return num_of_chunks
def list_pipelines(selection: List[str]) -> List[DataPipeline]:
    """
    Return list of pipelines related to this project (which must have the tag for our project set).

    The :selection should be a list of glob patterns to select specific pipelines by their ID.
    If the selection is an empty list, then all pipelines are used.
    """
    client = boto3.client("datapipeline")
    paginator = client.get_paginator("list_pipelines")
    response_iterator = paginator.paginate()
    all_pipeline_ids = response_iterator.search("pipelineIdList[].id")
    if selection:
        selected_pipeline_ids = [
            pipeline_id
            for pipeline_id in all_pipeline_ids
            for glob in selection
            if fnmatch.fnmatch(pipeline_id, glob)
        ]
    else:
        selected_pipeline_ids = list(all_pipeline_ids)
    dw_pipelines = []
    chunk_size = 25  # Per AWS documentation, need to go in pages of 25 pipelines
    for ids_chunk in funcy.chunks(chunk_size, selected_pipeline_ids):
        resp = client.describe_pipelines(pipelineIds=ids_chunk)
        for description in resp["pipelineDescriptionList"]:
            for tag in description["tags"]:
                if tag["key"] == "user:project" and tag["value"] == "data-warehouse":
                    dw_pipelines.append(DataPipeline(description))
    return sorted(dw_pipelines, key=attrgetter("name"))
def decryptFile(self, fileIn, chunksize):
    try:
        fileOut = fileIn + ".tmp"
        print(bcolors.WARNING + "[!] " + bcolors.ENDC + "Decrypting file: " + fileIn)
        with open(fileIn, "rb") as encryptedFile:
            encrypted = base64.b64decode(encryptedFile.read(64))
            setup = encrypted[:48]  # Read KEY[32] and IV[16] = 32 + 16 = 48 | Get key and IV
            if self.key == setup[:32]:
                print(bcolors.OKGREEN + "[+] Password correct!" + bcolors.ENDC)
            else:
                print(bcolors.FAIL + "[!] WRONG PASSWORD" + bcolors.ENDC)
                sys.exit(0)
            iv = setup[32:]
            cipher = AES.new(self.key, AES.MODE_CBC, iv)
            with open(fileOut, "wb") as decryptedFile:
                encrypted = base64.b64decode(encryptedFile.read())
                chunks = list(funcy.chunks(chunksize, encrypted))
                for chunk in chunks:
                    decrypted_chunk = self._unpad(cipher.decrypt(chunk))
                    decryptedFile.write(decrypted_chunk)
        os.rename(fileOut, fileIn)
        print(bcolors.OKGREEN + "[+] " + bcolors.ENDC + "Decrypting done!")
    except Exception as e:
        raise e
def get_batch_dataframe(df, batch_size=100):
    """
    Split a DataFrame into sub-DataFrames, each with at most batch_size rows.

    Parameters
    ----------
    df : Pandas DataFrame
    batch_size : number of records in each sub-DataFrame (default: 100)

    Returns
    -------
    DataFrame generator

    Examples
    --------
    >>> import pandas as pd
    >>> from tidyframe import get_batch_dataframe
    >>> df = pd.DataFrame()
    >>> df['col_1'] = list("abcde")
    >>> df['col_2'] = [1, 2, 3, 4, 5]
    >>> dfs = [x for x in get_batch_dataframe(df, 2)]
    >>> dfs[-1]
      col_1  col_2
    4     e      5
    >>> [x.shape[0] for x in dfs]
    [2, 2, 1]
    """
    for min_batch in chunks(batch_size, range(df.shape[0])):
        yield df.iloc[min_batch, :]
def _series_chunks(s, n_jobs):
    if n_jobs < 0:
        # Have n chunks if we are using all n cores/CPUs: cpu_count() + 1 + n_jobs
        n_jobs = cpu_count() + 1 + n_jobs
    n = len(s)
    n_chunks = int(n / n_jobs)
    return (s.iloc[ilocs] for ilocs in fp.chunks(n_chunks, range(n)))
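
# Rough illustration of the chunking above, not from the original snippet.
# `fp` is assumed to be funcy; pandas and the sample Series are stand-ins.
import funcy as fp
import pandas as pd

s = pd.Series(range(10))
n_chunks = int(len(s) / 4)  # e.g. 4 workers -> chunk size 2
parts = [s.iloc[ilocs] for ilocs in fp.chunks(n_chunks, range(len(s)))]
[len(p) for p in parts]  # -> [2, 2, 2, 2, 2]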
def decrypt_file(self):
    with open(self.input_file, "rb") as encryptedFile:
        chunk_size = 24 * 1024
        encrypted = base64.b64decode(encryptedFile.read(64))
        setup = encrypted[:48]
        # key_confirm = input("Please enter the key used to encrypt the file:- ")
        salt = b'\x9aX\x10\xa6^\x1fUVu\xc0\xa2\xc8\xff\xceOV'
        key_check = Crypto.Protocol.KDF.PBKDF2(password=self.secret_key,
                                               salt=salt,
                                               dkLen=32,
                                               count=10000)

        def unpad(s):
            return s[:-ord(s[len(s) - 1:])]

        if key_check == setup[:32]:
            print("Password Correct!")
        else:
            print("Wrong Password!")
            sys.exit(0)
        iv = setup[32:]
        cipher = AES.new(key_check, AES.MODE_CBC, iv)
        with open(self.output_file, "wb") as decryptedFile:
            encrypted = base64.b64decode(encryptedFile.read())
            chunks = list(funcy.chunks(chunk_size, encrypted))
            for chunk in chunks:
                decrypted_chunk = unpad(cipher.decrypt(chunk))
                decryptedFile.write(decrypted_chunk)
def tqdm_chunks(collection, chunk_size, enum=False):
    """Call funcy.chunks and return the resulting generator wrapped in a progress bar."""
    tqdm_nice = tqdm_notebook if in_notebook() else tqdm
    chunks = funcy.chunks(chunk_size, collection)
    if enum:
        chunks = enumerate(chunks)
    return tqdm_nice(chunks, total=int(np.ceil(len(collection) / chunk_size)))
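
# Hypothetical usage of the helper above; `items` and `process` are placeholders
# invented for illustration, not part of the original code.
items = list(range(1000))
for i, batch in tqdm_chunks(items, 100, enum=True):
    process(batch)  # e.g. a batched API call or model inference, shown with a progress bar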
def split_into_groups(randomized_primary_team, randomized_guest_team):
    # TODO: this is f****d, make two lists and concat them together instead
    total_attendees = len(randomized_primary_team) + len(randomized_guest_team)
    if total_attendees % 5 == 0:
        group_size = 5
    elif total_attendees % 6 == 0:
        group_size = 6
    elif total_attendees % 7 == 0:
        group_size = 7
    elif total_attendees % 8 == 0:
        group_size = 8
    elif total_attendees % 9 == 0:
        group_size = 9
    else:
        group_size = 6
    # Materialize the chunks so they can be indexed and mutated below
    # (funcy.chunks returns a lazy iterator)
    chunked_list = list(fn.chunks(group_size, randomized_primary_team))
    for i in range(len(randomized_guest_team)):
        chunked_list[i % len(chunked_list)].append(randomized_guest_team.pop())
    chunked_list_last_child = chunked_list[-1]
    if len(chunked_list_last_child) < 5:
        for i in range(len(chunked_list_last_child)):
            chunked_list[i % len(chunked_list)].append(chunked_list_last_child.pop())
        del chunked_list[-1]
    return chunked_list
def sendSNIPayload(cmd, argument):
    """
    We know which server command will be executed at this point,
    so we call 'execute cmd' and encode the command output.
    """
    randy = randomString()
    print("(*) Executing: %s command" % cmd)
    # Note: the original check `if not ("CD" or "EX") in cmd` only ever tested for "CD";
    # test both substrings explicitly.
    if "CD" not in cmd and "EX" not in cmd:
        payload = executeCmd(cmd, 0)
    else:
        payload = argument.encode('utf-8')
    encoded_payload = str(base64.b32encode(payload), "utf-8")
    if log_enabled:
        print(encoded_payload)
    encoded_payload = encoded_payload.replace("=", '')
    chunks = list(funcy.chunks(240, encoded_payload))
    finito = ("finito-%s" % randy)
    chunks.append(finito)
    if log_enabled:
        print(encoded_payload)
        print(chunks)
    sendSNIChunks(chunks)
def input_batches_from_raw_text(self, raw_text, batch_size=32):
    """Construct ContextualizedEmbedding inputs from a raw text string.

    Args:
        raw_text: str, the entire raw document
        batch_size: number of sentences per batch; limited by GPU memory

    Returns:
        generator: yields dicts of inputs to the ContextualizedEmbedding layer
            'forward_input': padded array of character codes corresponding to each sentence
            'backward_input': padded array of character codes in reverse order
            'forward_index_input': padded array of locations of token outputs in forward_input
            'backward_index_input': padded array of locations of token outputs in backward_input
            'forward_mask_input': mask of same shape as forward_index_input, with 0's where
                padded and 1's where real tokens
            'backward_mask_input': mask of same shape as backward_index_input, with 0's where
                padded and 1's where real tokens
        num_batches: int, necessary because after enforcing sentence limits we may end up
            with more sentences than we expect
        document_index_batches: list of lists of (start, end) tuples indicating spans where
            each token came from in raw_text. Outer list over batches, inner list over sentences
    """
    parsed_text = split_sentences_and_tokenize_raw_text(
        raw_text, self.max_token_sequence_len,
        self.max_char_sequence_len - 2)  # -2 for special start and end characters
    generator = self.input_batch_generator(parsed_text["tokens"], batch_size)
    num_batches = int(np.ceil(len(parsed_text["tokens"]) / batch_size))
    document_indices = create_document_indices_from_sentence_indices(
        parsed_text["spans"], parsed_text["tokens"], raw_text)
    # Now group document_indices into batches
    document_index_batches = list(chunks(batch_size, document_indices))
    return generator, num_batches, document_index_batches
def splitscrape(text):
    sent_corpus = [s for s in nltk.sent_tokenize(text)]
    sent_corpus = [
        "".join(chunk)
        for chunk in chunks(int(len(sent_corpus) / 100), sent_corpus)
    ]
    plotlist = [wordshifter(x) for x in sent_corpus]
    return plotlist
def simulate(self, seed: int = None, copolicy=False):
    """
    Generates tuples of (state, action, next_state) and the probability
    of transitioning from state to next_state.
    """
    bits = self._simulate(seed, copolicy)
    chunks = fn.chunks(self.spec.order.total_bits, bits)
    return [self.spec.unflatten(c)[0] for c in chunks]
def decode_plain(self, text):
    if len(text) % 2:
        raise ValueError("Text is not playfair-encoded: Length is odd")
    for a, b in chunks(2, text):
        if a == b:
            raise ValueError("Text is not playfair-encoded:"
                             " Invalid bigram {}".format(a + b))
        yield self._code_pair(a, b, -1)
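
# How funcy.chunks drives the bigram split above; the sample ciphertext is made up.
from funcy import chunks

list(chunks(2, "BMODZBXDNABEKUDMUIXMMOUVIF"))
# -> ['BM', 'OD', 'ZB', 'XD', 'NA', 'BE', 'KU', 'DM', 'UI', 'XM', 'MO', 'UV', 'IF']
# Unpacking each two-character chunk as `for a, b in ...` yields the Playfair digrams.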
def _mygene_fetch(queries, scopes, specie):
    # To retry or ignore only one chunk on error
    @ignore(requests.HTTPError, default=[])
    @log_errors(lambda msg: cprint(msg, 'red'), stack=False)
    @retry(10, errors=requests.HTTPError, timeout=lambda n: 5 * 1.4**n)
    @log_errors(lambda msg: cprint(msg, 'yellow'), stack=False)
    def querymany(qs):
        try:
            return mg.querymany(qs, scopes=scopes, fields=['entrezgene', 'symbol'],
                                species=specie, email='*****@*****.**', verbose=False)
        except requests.HTTPError as e:
            # Do not retry on Bad Request
            if e.response.status_code == 400:
                return []
            raise

    cprint('> Going to query %d genes in %s...' % (len(queries), scopes), 'cyan')
    cprint('> sample queries: %s' % ', '.join(take(8, queries)), 'cyan')
    # Read cache
    prefix = '%s-%s:' % (SPECIE_PREFIXES[specie], PREFIXES[scopes])
    keys = [prefix + q for q in queries]
    res = {
        k: pickle.loads(v) if v else ''
        for k, v in zip(queries, mget(keys))
        if v is not None
    }
    if res:
        queries = set(queries) - set(res)
        print('Got %d from cache, %d queries left' % (len(res), len(queries)))
    if queries:
        mg = mygene.MyGeneInfo()
        # Looks like sorting groups bad queries
        data = cat(querymany(qs)
                   for qs in chunks(500, tqdm(sorted(queries), leave=False)))
        new = {
            str(item['query']): (item['entrezgene'], item['symbol'])
            for item in data
            if not item.get('notfound') and 'entrezgene' in item and 'symbol' in item
        }
        res.update(new)
        # Cache results and fails
        pipe = redis_client.pipeline(transaction=False)
        for k, v in new.items():
            pipe.setex(prefix + k, CACHE_TIMEOUT, pickle.dumps(v, -1))
        for k in queries - set(new):
            pipe.setex(prefix + k, CACHE_TIMEOUT, '')
        pipe.execute()
    res = {k: v for k, v in res.items() if v != ''}
    cprint('-> Got %d matches' % len(res), 'yellow')
    return res
def iter_by_ids(self, path, ids, page_size, **params):
    for ids_chunk in chunks(page_size, ids):
        data = self.get(
            path,
            ids=','.join(map(str, ids_chunk)),
            page_size=page_size,
            **params,
        )
        yield from data['results']
def mget(self, keys: List[K]) -> List[Optional[V]]:
    """Get values for keys."""
    results = []
    for key_chunk in funcy.chunks(self._key_chunk_size, keys):
        encoded_keys = [self._key_encoder(key) for key in key_chunk]
        results.extend(
            self._value_decoder(encoded_data) if encoded_data is not None else None
            for encoded_data in self._redis.mget(encoded_keys))
    return results
def sync_data(self):
    table = self.TABLE
    url = self.get_url()

    for handle in self.config.get('handles'):
        params = {
            'user_id': handle,
            'count': 200,
            'include_rts': 1,
        }
        has_more = True
        max_id = None

        while has_more:
            last_max_id = max_id
            result = self.client.make_request(url, self.API_METHOD, params=params)
            data = self.get_stream_data(result)

            with singer.metrics.record_counter(endpoint=table) as counter:
                for index, obj in enumerate(data):
                    LOGGER.debug("On {} of {}".format(index, len(data)))
                    processed = self.filter_keys(obj)
                    singer.write_records(table, [processed])
                    counter.increment()

                    if max_id is None:
                        max_id = obj.get('id')
                    else:
                        max_id = min(max_id, obj.get('id'))
                    params['max_id'] = max_id

                    self.state = incorporate(
                        self.state,
                        "tweet_engagements.{}".format(obj.get('id')),
                        'date',
                        processed.get('created_at'))

            for substream in self.substreams:
                substream.state = self.state
                for tweets in funcy.chunks(25, data):
                    substream.sync_data(parent_ids=[
                        tweet.get('id_str') for tweet in tweets
                    ])

            if last_max_id == max_id:
                has_more = False
def map_backtranslate():
    """"""
    raise NotImplementedError()
    from multiprocessing import Pool
    import funcy

    path = untar_data(URLs.IMDB)
    txt_files = glob.glob(f'{path}/train/*/*.txt')
    pool = Pool(8)
    chunks = funcy.chunks(1000, txt_files)
    pool.map(save_backtranslations, list(chunks))
def _unflatten(self, bits: Bits):
    size = self.order.total_bits
    for i, chunk in enumerate(fn.chunks(self.order.total_bits, bits)):
        mapping = {}
        for j, bit in enumerate(chunk):
            lvl = i * size + j
            var = self.bexpr.bdd.var_at_level(lvl)
            name, _, idx = TIMED_INPUT_MATCHER.match(var).groups()
            mapping[f'{name}[{idx}]'] = bit
        yield self.dyn.imap.unblast(mapping)
def objects(self, sphere, evaluate_expressions=False):
    chunk_size = 25  # Per AWS documentation, need to go in pages of 25 objects
    object_ids = self.instance_ids(sphere)
    paginator = self.client.get_paginator("describe_objects")
    # Evaluation fails for components so block the flag here.
    evaluate_expressions = evaluate_expressions and sphere in ("ATTEMPT", "INSTANCE")
    for ids_chunk in funcy.chunks(chunk_size, object_ids):
        response_iterator = paginator.paginate(
            pipelineId=self.pipeline_id,
            objectIds=ids_chunk,
            evaluateExpressions=evaluate_expressions
        )
        for pipeline_object in response_iterator.search("pipelineObjects[]"):
            yield DataPipelineObject(pipeline_object)
def splitscrape(text):
    sent_corpus = []
    for s in nltk.sent_tokenize(text):
        sent_corpus.append(s)
    sent_corpus = [
        "".join(chunk)
        for chunk in chunks(int(len(sent_corpus) / 100), sent_corpus)
    ]
    plotlist = []
    for x in sent_corpus:
        plotlist.append(wordshifter(x))
    return plotlist
def _get_features(self, texts, max_min_features=False, chunk_size=10, verbose=False):
    text_features = []
    _iter = funcy.chunks(chunk_size, texts)
    if verbose:
        _iter = tqdm.tqdm(_iter, total=int(np.ceil(len(texts) / chunk_size)))
    for t in _iter:
        features = np.array(self.feature_pipeline.transform(t))
        feature_types = [features.mean(axis=1)]
        if max_min_features:
            feature_types = feature_types + [features.max(axis=1), features.min(axis=1)]
        concat_features = np.hstack(feature_types)
        text_features.append(concat_features)
    return np.vstack(text_features)
def word_tokenize(self, text):
    """Get list of string tokens from input string.

    Args:
        text: input string for tokenization
    Yields:
        token: str, non-whitespace tokens
    """
    for token in split_possessive_markers(split_contractions(_html_tokenize(text))):
        if self._max_characters_per_token is not None:
            for token_chunk in funcy.chunks(self._max_characters_per_token, token):
                yield token_chunk
        else:
            yield token
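
# What the per-token length cap above does, shown directly with funcy;
# the word and the limit of 5 characters are arbitrary choices for the sketch.
import funcy

list(funcy.chunks(5, "antidisestablishmentarianism"))
# -> ['antid', 'isest', 'ablis', 'hment', 'arian', 'ism']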
def get_artist_info(df_master, username, token, sp):
    """
    Get artist info for every song in df_master

    Args:
        df_master (DataFrame): DF returned from get_album_info()
        username (str): string of spotify username
        token (str): Spotify api session token
        sp (Spotify object): Spotify API session object

    Returns:
        artists_df (Pandas DataFrame): df with all artist metadata/features
        df_master (Pandas DataFrame): df_master merged with artists_df/features
    """
    if token:
        all_artists = dict()
        # Iterate in batches
        for artist_id_batch in chunks(20, df_master["artist_id"].unique()):
            try:
                batch_artists = sp.artists(artist_id_batch)
                batch_artists = batch_artists["artists"]
                for i, artist_id in enumerate(artist_id_batch):
                    # Get only attributes that are needed
                    all_artists[artist_id] = {
                        "artist_followers": batch_artists[i]["followers"]["total"],
                        "artist_genres": batch_artists[i]["genres"],
                        "artist_popularity": batch_artists[i]["popularity"],
                    }
            except AttributeError:
                logger.info("ERROR AT {}".format(artist_id_batch))

        # Create df of artists data
        artists_df = pd.DataFrame(all_artists).T
        artists_df["artist_genres"] = artists_df.artist_genres.apply(
            lambda x: [i.replace(" ", "_") for i in x]
        )

        # Merge master df with artists df
        df_master = df_master.join(artists_df, on="artist_id")
        return artists_df, df_master
    else:
        logging.info(f"Can't get token for {username}")
def upgrade():
    ### commands auto generated by Alembic - please adjust! ###
    table = op.create_table(
        'free_email_providers',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('domain', sa.Unicode(length=255), nullable=False),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_index(op.f('ix_free_email_providers_domain'), 'free_email_providers',
                    ['domain'], unique=False)
    ### end Alembic commands ###

    with open(os.path.join(PROJECT_ROOT, 'free.emails'), 'r') as f:
        FREE_EMAILS_SET = set(r.strip() for r in f.readlines())

    # Fill the data
    for domains in chunks(1000, FREE_EMAILS_SET):
        op.bulk_insert(table, [{'domain': d} for d in domains])
def score_pat(sub, patient):
    sub_row = sub.loc[patient]
    truth_rows = st1_test_df[st1_test_df.patientId == patient].dropna()
    truth_arr = truth_rows[['x', 'y', 'width', 'height']].values
    if isinstance(sub_row, np.float):
        if truth_rows.empty:
            return np.nan
        else:
            return 0
    elif truth_rows.empty:
        return 0
    else:
        bboxes = list(funcy.chunks(5, lmap(float, sub_row.strip().split(' '))))
        scores = np.array(bboxes)[:, 0]
        arr = np.array(bboxes)[:, 1:]
        return map_iou(truth_arr, arr, scores)
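
# How the flat prediction string above is regrouped into boxes; the sample string
# is invented for illustration.
import funcy

pred = "0.9 10 20 30 40 0.8 50 60 70 80"
list(funcy.chunks(5, [float(v) for v in pred.split(' ')]))
# -> [[0.9, 10.0, 20.0, 30.0, 40.0], [0.8, 50.0, 60.0, 70.0, 80.0]]
# i.e. one [confidence, x, y, width, height] group per predicted box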
def get_1_or_2(x, cut1, cut2):
    """Unused"""
    if len(x) == 0:
        return np.nan
    new_dets = []
    x = lmap(float, x)
    chunks = list(funcy.chunks(5, x))
    for i, c in enumerate(chunks):
        if c[0] > cut1:
            new_dets.append(c)
        elif (i == 1) and c[0] > cut2:
            assert len(new_dets) == 1
            new_dets.append(c)
        else:
            break
    # Convert the flattened floats to strings before joining
    return ' '.join(str(v) for v in funcy.flatten(new_dets))
def binary_write(differences, features_number, output_file):
    chunking_time_logger = TimeLogger(task_name='Chunking')
    differences = differences.flatten('F')
    differences = np.append(differences, features_number)
    differences = struct.pack('=%df' % differences.size, *differences)
    chunk_size = 10000000
    difference_chunks = funcy.chunks(chunk_size, differences)
    chunking_time_logger.finish()

    chunk_counter = 1
    for difference_chunk in difference_chunks:
        with open(output_file, 'ab') as f:
            difference_chunk_time_logger = TimeLogger(
                task_name='Write difference %d-th chunk' % chunk_counter)
            f.write(difference_chunk)
            difference_chunk_time_logger.finish()
        chunk_counter += 1
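
# funcy.chunks also slices bytes, which is what lets the packed struct above be
# written in pieces; the payload and chunk size here are toy values for the sketch.
import struct
import funcy

packed = struct.pack('=4f', 1.0, 2.0, 3.0, 4.0)   # 16 bytes
[len(c) for c in funcy.chunks(6, packed)]          # -> [6, 6, 4]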
def parse_page(text):
    tree = html.fromstring(text)
    for item in tree.xpath('//*[@itemtype="https://schema.org/Product"]'):
        cls = classer(item)
        link = item.cssselect('.eventlink a')[0].get('href')
        kwargs = {
            'vendor': Vendor.STRANAVETROV,
            'level': LEVELS_MAP[cls('stangeevent').lower()],
            'url': 'https://stranavetrov.ru' + link,
            'title': cls('eventname'),
            'length': int_or_none(cls('kmevent')),
            'price': cls('eventnewprice') or cls('eventprice', None),
        }
        dates = chunks(2, map(parse_date, find_dates(cls('dataevent'))))
        for start, end in dates:
            yield Item(start=start, end=end, **kwargs)
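
# Pairing a flat run of parsed dates into (start, end) spans, as done above;
# the dates are made-up stand-ins for whatever parse_date returns.
from datetime import date
from funcy import chunks

parsed = [date(2024, 6, 1), date(2024, 6, 7), date(2024, 8, 10), date(2024, 8, 17)]
[(start, end) for start, end in chunks(2, parsed)]
# -> [(date(2024, 6, 1), date(2024, 6, 7)), (date(2024, 8, 10), date(2024, 8, 17))]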
def url_info(self, domains, *categories):
    """Wrapper over `AwisApi.url_info` for parallel processing."""
    if self.closed:
        raise AttributeError('Session is closed')

    CHUNK_SIZE = 5  # AWIS's limit

    def request(domains, categories):
        logger.info('AWIS request for %s', domains)
        return self.api.url_info(domains, *categories)

    with ThreadPoolExecutor(self.workers_count) as executor:
        tasks = [
            executor.submit(request, chunk, categories)
            for chunk in chunks(CHUNK_SIZE, self.handle_cache(domains))
        ]
        for future in as_completed(tasks):
            self.session_list_of_raw_result.append(future.result())
def update_users(self, users_data, prefix=None):
    """Uses Intercom's bulk update."""
    CHUNK_SIZE = 50  # Intercom's limitation

    @requests_retry
    def request(chunk_of_users_data):
        url = '{0}/bulk/users'.format(self.base_url)
        response = session.post(
            url,
            json={'items': [
                {
                    'method': 'post',
                    'data_type': 'user',
                    'data': apply_prefix_for_user_data(ch, prefix)
                } for ch in chunk_of_users_data]
            },
            auth=self.auth,
            headers=self.get_headers(),
            timeout=TIMEOUT)
        # TODO: re-raise custom exception for 429 HTTP error
        # for further handling (e.g. retry celery task)
        response.raise_for_status()
        result = response.json()
        try:
            status_url = result['links']['self']
            logger.debug('Bulk update status: %s', status_url)
        except KeyError:
            logger.error('Weird response from Intercom: %r', result)
        return result

    with self.get_executor() as executor:
        for _ in executor.map(request, chunks(CHUNK_SIZE, users_data)):
            pass
def path_chunks(tree):
    """Return an iterable of worker-sized iterables of paths."""
    return chunks(500, unignored(tree.source_folder, tree.ignore_paths, tree.ignore_filenames))