def _load_proteins(subset_genes: Set[GeneLabel], output_csv=False):
    # Sort genes
    subset_genes = sorted(frozenset(subset_genes))

    if len(subset_genes) > 0:
        batches = grouper(subset_genes)
    else:
        batches = [None]

    first_chunk = True
    for batch in batches:
        try:
            # Special case for 'all'
            if batch is None:
                chunked_data = pd.read_hdf(_PROTEIN_META_HDF, 'protein_meta',
                                           chunksize=HDF5_CHUNKSIZE)
            else:
                batch = frozenset(batch)
                chunked_data = pd.read_hdf(_PROTEIN_META_HDF, 'protein_meta',
                                           local_variables=dict(batch=batch),
                                           where=f'protein in batch',
                                           chunksize=HDF5_CHUNKSIZE)

            for chunk in chunked_data:
                if not output_csv:
                    for __, row in chunk.iterrows():
                        yield ProteinInfo.from_row(row)
                else:
                    yield chunk.to_csv(index=False, header=first_chunk)
                    first_chunk = False
        except NotImplementedError:
            raise Exception(sorted(batch))

def import_file(database, path, module, location=None, chunk_size=256):
    reader = csv_file(path, fields=module.FIELD_NAMES)
    for chunk in progressbar.progressbar(
            grouper(reader, chunk_size),
            max_value=line_count(path) // chunk_size):
        import_chunk(database, chunk, module, location=location)

def test_grouper(self):
    data = [1, 2, 3, 4, 5]
    grouped = grouper(data, 2)
    self.assertEqual([1, 2], list(next(grouped)))
    self.assertEqual([3, 4], list(next(grouped)))
    self.assertEqual([5, None], list(next(grouped)))
    with self.assertRaises(StopIteration):
        next(grouped)

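# All of the snippets in this listing assume a grouper() helper is already in
# scope. Its exact signature differs between the code bases excerpted here
# (grouper(iterable), grouper(iterable, n), grouper(iterable, n, fillvalue),
# and even grouper(n, iterable)), so the definition below is only a sketch
# based on the itertools "grouper" recipe; the default batch size of 1000 is
# an assumption. It matches the behaviour pinned down by test_grouper above:
# fixed-size groups, with the last group padded with None.
from itertools import zip_longest


def grouper(iterable, n=1000, fillvalue=None):
    # Collect items into fixed-length chunks; the final chunk is padded with
    # fillvalue when the iterable does not divide evenly into groups of n.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
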
def _audio_file_spectrogram(self, _file, nfft, duration, overlap):
    # first get the spec_params and let the client set up the canvas
    fs = _file.sample_rate
    spec_params = get_audio_spectrogram_params(_file, fs, duration, nfft, overlap)
    self.send_spectrogram_new(spec_params)

    # now let's compute the spectrogram and send it over
    data = _file[:spec_params.nsamples].sum(axis=1)
    for chunk in grouper(data, spec_params.chunksize, spec_params.nfft):
        chunk = np.array(chunk)
        spec = spectrogram(chunk, spec_params)
        self.send_spectrogram_update(spec)

def load(self, iterable: Collection, batch_size=10000) -> None:
    written_rows_count = 0
    influxdb_records = []
    for index, batch in enumerate(grouper(iterable, batch_size)):
        for record in batch:
            if record:
                influxdb_records.append(record)
        self.influxdb_client.write_points(influxdb_records, batch_size=1000)
        written_rows_count += len(influxdb_records)
        influxdb_records = []
        logger.info('InfluxDB load: {}/{} records written'.format(
            written_rows_count, len(iterable)))

def get_buttons_routes(self, user_routes):
    # TODO: too many buttons
    routes_list = sorted(list(self.cds.bus_routes.keys()), key=natural_sort_key)
    routes_groups = list(grouper(8, routes_list))
    route_btns = [
        [InlineKeyboardButton('Hide', callback_data='hide')],
        [InlineKeyboardButton('All', callback_data='all'),
         InlineKeyboardButton('None', callback_data='none')]
    ] + [
        [InlineKeyboardButton(f"{x}{'+' if x in user_routes else ''}",
                              callback_data=x)
         for x in group if x]
        for group in routes_groups
    ]
    keyboard = route_btns + []
    return keyboard

def insert_hashes(self, record_id, hashes):
    values = []
    for hash, offset in hashes:
        values.append({
            'hash': hash,
            'record_id': record_id,
            'offset': offset
        })

    rows = []
    for split_values in grouper(values, 1000):
        for row in split_values:
            rows.append(Fingerprints(**row))
        self.session.add_all(rows)
        self.session.commit()

def return_matches(self, hashes):
    mapper = {}
    for hash, offset in hashes:
        mapper[hash.upper()] = offset
    values = mapper.keys()

    for split_values in grouper(values, 1000):
        records = self.session.query(
            Fingerprints.hash,
            Fingerprints.record_id,
            Fingerprints.offset
        ).filter(Fingerprints.hash.in_(list(split_values)))

        for row in records.all():
            key = bytes("{0}".format(row[0]), encoding="ascii")
            if key in mapper:
                yield (row[1], row[2] - mapper[key])

def _load_domains(subset_genes: Set[GeneLabel], columns=None):
    if not subset_genes:
        return []

    for batch in grouper(subset_genes):
        try:
            chunked_data = pd.read_hdf(_ANNOTATION_FILE, 'domains',
                                       local_variables=dict(batch=batch),
                                       where=f'protein in batch',
                                       columns=columns,
                                       chunksize=HDF5_CHUNKSIZE)

            for chunk in chunked_data:
                for __, row in chunk.iterrows():
                    yield row.to_dict()
        except NotImplementedError:
            raise Exception(sorted(batch))

def _load_edges(subset_genes: Set[GeneLabel], columns=None):
    if not subset_genes:
        return []

    seen_edges = set()

    # Since tables crashes with large "in" queries
    # we will fetch data in batches
    for batch in grouper(subset_genes, n=BATCH_SIZE_EDGES):
        # Note that to fetch the batched data correctly we need to fetch
        # all interactions for the proteins and then post-filter them
        # to subset
        try:
            chunked_data = pd.read_hdf(
                _NETWORK_FILE, 'edges',
                local_variables=dict(batch=batch),
                where=f'protein_a in batch or protein_b in batch',
                columns=columns,
                chunksize=HDF5_CHUNKSIZE)

            for chunk in chunked_data:
                chunk = chunk.query(
                    'protein_a in @subset_genes and protein_b in @subset_genes'
                )
                for __, row in chunk.iterrows():
                    edge = row['protein_a'], row['protein_b']
                    if edge not in seen_edges:
                        yield row.to_dict()
                        seen_edges.add(edge)
        except NotImplementedError:
            raise Exception(sorted(batch))

def _load_subset(subset_genes: Set[GeneLabel], subset_keys=None):
    if not subset_genes:
        return []

    for batch in grouper(subset_genes):
        try:
            chunked_data = pd.read_hdf(_PTM_RESPONSE_FILE, 'ptm_matrix',
                                       local_variables=dict(batch=batch),
                                       where=f'protein in batch',
                                       chunksize=HDF5_CHUNKSIZE)
        except NotImplementedError:
            raise Exception(sorted(batch))

        for chunk in chunked_data:
            for __, row in chunk.iterrows():
                # NaNs will be undefined
                d = {}
                row = row.dropna()

                for col, value in row.iteritems():
                    if col == 'protein':
                        d[col] = value
                        continue

                    ptm, __, key = col.partition('-')

                    if subset_keys is not None and key not in subset_keys:
                        continue

                    try:
                        d[ptm][key] = value
                    except KeyError:
                        d[ptm] = {key: value}

                yield d

def _load_nodes(subset_genes: Set[GeneLabel], columns=None):
    if not subset_genes:
        return []

    for batch in grouper(subset_genes):
        # Note that to fetch the batched data correctly we need to fetch
        # all interactions for the proteins and then post-filter them
        # to subset
        try:
            data = pd.read_hdf(_NETWORK_FILE, 'nodes',
                               local_variables=dict(batch=batch),
                               where=f'protein in batch',
                               columns=columns)
        except NotImplementedError:
            raise Exception(sorted(batch))

        for __, row in data.iterrows():
            # NaNs will be undefined
            row = row.dropna()
            yield row.to_dict()

def _load_data(subset_genes: Set[GeneLabel], columns=None):
    if len(subset_genes) == 0:
        # Return empty dataframe if subset is specified but empty
        return []

    # Pytables fails somehow when searching for a large subset.
    for batch in grouper(subset_genes):
        batch = frozenset(batch)
        try:
            chunked_data = pd.read_hdf(_MATRIX_FILE, 'enrichment_data_minimal',
                                       local_variables=dict(batch=batch),
                                       where=f'{GENE_LABEL_COLUMN} in batch',
                                       columns=columns,
                                       chunksize=HDF5_CHUNKSIZE)

            for chunk in chunked_data:
                yield from chunk.to_dict(orient='records')
                # for __, row in chunk.iterrows():
                #     yield row.to_dict()
        except NotImplementedError:
            raise Exception(sorted(batch))

from itertools import chain


def altsplit(triangles):
    return chain.from_iterable(zip(*group) for group in grouper(triangles, 3))

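# Hypothetical illustration of altsplit above (the data values are made up),
# assuming grouper follows the (iterable, n) recipe sketched earlier and that
# len(triangles) is a multiple of 3, so no None padding reaches zip(). Each
# batch of three triangles is regrouped vertex-by-vertex before flattening.
triangles = [('a1', 'a2', 'a3'), ('b1', 'b2', 'b3'), ('c1', 'c2', 'c3')]
print(list(altsplit(triangles)))
# -> [('a1', 'b1', 'c1'), ('a2', 'b2', 'c2'), ('a3', 'b3', 'c3')]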