from more_itertools import chunked

from .models import Asset  # assumed import path for the Asset model


def calculate_difficulty_values(asset_qs=None):
    """
    Calculate the difficulty scores for the provided AssetQuerySet and update
    the Asset records for changed difficulty values
    """

    if asset_qs is None:
        asset_qs = Asset.objects.published()

    asset_qs = asset_qs.add_contribution_counts()

    updated_count = 0

    # We'll process assets in chunks using an iterator to avoid holding objects
    # in memory which will never be used again. We will find assets whose
    # difficulty value is not the same as the value stored in the database and
    # pass them to bulk_update() to be saved in a single query.
    for asset_chunk in chunked(asset_qs.iterator(), 500):
        changed_assets = []

        for asset in asset_chunk:
            difficulty = asset.transcription_count * (
                asset.transcriber_count + asset.reviewer_count
            )
            if difficulty != asset.difficulty:
                asset.difficulty = difficulty
                changed_assets.append(asset)

        if changed_assets:
            # We will only save the new difficulty score, both for performance
            # and to avoid any possibility of race conditions causing stale
            # data to be saved:
            Asset.objects.bulk_update(changed_assets, ["difficulty"])
            updated_count += len(changed_assets)

    return updated_count
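A minimal usage sketch. The no-argument call follows directly from the function above; the filtered variant is an assumption for illustration and relies only on the `difficulty` field the function itself writes:

# Hypothetical driver for calculate_difficulty_values() above.
# Rescore every published asset:
updated = calculate_difficulty_values()
print(f"updated difficulty for {updated} assets")

# Or restrict the recalculation to a narrower queryset (illustrative filter):
updated = calculate_difficulty_values(
    Asset.objects.published().filter(difficulty=0)
)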
def populate_asset_years():
    """
    Pull out date info from raw Item metadata and populate it for each Asset
    """

    asset_qs = Asset.objects.prefetch_related("item")

    updated_count = 0

    for asset_chunk in chunked(asset_qs, 500):
        changed_assets = []

        for asset in asset_chunk:
            metadata = asset.item.metadata

            year = None
            for date_outer in metadata["item"]["dates"]:
                for date_inner in date_outer.keys():
                    year = date_inner
                    break  # We don't support multiple values

            if asset.year != year:
                asset.year = year
                changed_assets.append(asset)

        if changed_assets:
            Asset.objects.bulk_update(changed_assets, ["year"])
            updated_count += len(changed_assets)

    return updated_count
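The nested loop above keeps at most one key per date dict (the break is what "we don't support multiple values" refers to). A sketch of the metadata shape it expects; the sample dict is inferred from the loop, not taken from real Item metadata:

# Illustrative metadata shape, inferred from the loop above: "dates" is a
# list of single-key dicts whose keys are year strings.
metadata = {
    "item": {
        "dates": [
            {"1863": ["..."]},  # the dict's value is never read by the loop
        ]
    }
}

year = None
for date_outer in metadata["item"]["dates"]:
    for date_inner in date_outer.keys():
        year = date_inner
        break  # We don't support multiple values

assert year == "1863"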
# Assumed imports for this method's module:
#   from typing import List, Optional
#   import numpy as np
#   from pathlib import Path
#   from more_itertools import chunked
#   from tqdm import tqdm
def embed_sentences(
    self,
    sentences: List[str],
    max_chunk_size: int = 10000,
    partial_save_path: Optional[str] = None,
) -> np.ndarray:
    """
    Embeds a list of sentences

    :param sentences: list of sentences to be embedded
    :param max_chunk_size: maximum number of sentences embedded per chunk
    :param partial_save_path: file with partially processed sentences
    :return: tensor with embeddings of all input sentences
    """
    logger.debug(f"# of all sentences: {len(sentences)}")
    embeddings = np.empty((0, 512))
    if partial_save_path is not None:
        partial_save_path = Path(partial_save_path).with_suffix(".npy")
        if partial_save_path.exists():
            # Resume from a previous run: load the saved embeddings and skip
            # the sentences that were already processed.
            embeddings = np.load(partial_save_path.as_posix())
            sentences = sentences[embeddings.shape[0]:]
            logger.debug(f"# of already embedded sentences: {embeddings.shape[0]}.")
    n_chunks = int(np.ceil(len(sentences) / max_chunk_size))
    logger.debug(f"# of all chunks: {n_chunks} embedding iterations.")
    for sentences_chunk in tqdm(
        chunked(sentences, max_chunk_size),
        desc="Embed sentences in chunks",
        total=n_chunks,
    ):
        values, indices, dense_shape = process_to_ids_in_sparse_format(
            self.sentence_piece_processor, sentences_chunk
        )
        embeddings = np.append(
            embeddings,
            self.session.run(
                self.computation,
                feed_dict={
                    self.input_placeholder.values: values,
                    self.input_placeholder.indices: indices,
                    self.input_placeholder.dense_shape: dense_shape,
                },
            ),
            axis=0,
        )
        if partial_save_path is not None:
            # Checkpoint after every chunk so an interrupted run can resume.
            np.save(partial_save_path.as_posix(), embeddings)
    return embeddings
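A hedged usage sketch: the `UniversalSentenceEncoder` class name and its constructor are assumptions, not part of the code above; only `embed_sentences` and its parameters are real. Because the method checkpoints after every chunk and reloads from `partial_save_path` on entry, rerunning the same call after an interruption resumes where it left off:

# Hypothetical driver -- the encoder class and its constructor are assumed.
encoder = UniversalSentenceEncoder(model_dir="use-lite")  # assumption

sentences = ["first sentence", "second sentence", "third sentence"]
embeddings = encoder.embed_sentences(
    sentences,
    max_chunk_size=2,                        # two chunks for three sentences
    partial_save_path="embeddings_partial",  # stored as embeddings_partial.npy
)
assert embeddings.shape == (len(sentences), 512)

Note that np.append copies the whole array on every chunk; that cost is tolerable here because the checkpoint needs the complete array after each iteration anyway.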
import six
from more_itertools import more  # assumed import providing more.chunked


def grouper_nofill_str(n, iterable):
    """
    Take a sequence and break it up into chunks of the specified size.
    The last chunk may be smaller than size.

    This works very similarly to grouper_nofill, except it works with
    strings as well.

    >>> tuple(grouper_nofill_str(3, 'foobarbaz'))
    ('foo', 'bar', 'baz')

    You can still use it on non-strings too if you like.

    >>> tuple(grouper_nofill_str(42, []))
    ()

    >>> tuple(grouper_nofill_str(3, list(range(10))))
    ([0, 1, 2], [3, 4, 5], [6, 7, 8], [9])
    """
    res = more.chunked(iterable, n)
    if isinstance(iterable, six.string_types):
        res = (''.join(item) for item in res)
    return res