Example #1
def calculate_difficulty_values(asset_qs=None):
    """
    Calculate the difficulty scores for the provided AssetQuerySet and update
    the Asset records for changed difficulty values
    """

    if asset_qs is None:
        asset_qs = Asset.objects.published()

    asset_qs = asset_qs.add_contribution_counts()

    updated_count = 0

    # We'll process assets in chunks using an iterator so we don't keep objects
    # in memory which will never be used again. We will find assets whose
    # calculated difficulty value differs from the one stored in the database
    # and pass them to bulk_update() so they are all saved in a single query.
    for asset_chunk in chunked(asset_qs.iterator(), 500):
        changed_assets = []

        for asset in asset_chunk:
            difficulty = asset.transcription_count * (asset.transcriber_count +
                                                      asset.reviewer_count)
            if difficulty != asset.difficulty:
                asset.difficulty = difficulty
                changed_assets.append(asset)

        if changed_assets:
            # We will only save the new difficulty score both for performance
            # and to avoid any possibility of race conditions causing stale data
            # to be saved:
            Asset.objects.bulk_update(changed_assets, ["difficulty"])
            updated_count += len(changed_assets)

    return updated_count
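The pattern above relies on more_itertools.chunked to turn a lazy queryset iterator into fixed-size batches for bulk_update(). Below is a minimal, self-contained sketch of that batching behaviour, using plain integers in place of the Asset queryset (which is assumed to come from a Django model not shown here):

from more_itertools import chunked

# chunked() yields lists of at most 500 items, so each bulk_update() call only
# ever sees one batch; the final batch may be shorter than 500.
for batch in chunked(iter(range(1200)), 500):
    print(len(batch))  # prints 500, 500, 200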
Example #2
def populate_asset_years():
    """
    Pull out date info from raw Item metadata and populate it for each Asset
    """

    asset_qs = Asset.objects.prefetch_related("item")

    updated_count = 0

    for asset_chunk in chunked(asset_qs, 500):
        changed_assets = []

        for asset in asset_chunk:
            metadata = asset.item.metadata

            year = None
            for date_outer in metadata["item"]["dates"]:
                for date_inner in date_outer.keys():
                    year = date_inner
                    break  # We don't support multiple values

            if asset.year != year:
                asset.year = year
                changed_assets.append(asset)

        if changed_assets:
            Asset.objects.bulk_update(changed_assets, ["year"])
            updated_count += len(changed_assets)

    return updated_count
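The nested loop above expects a specific metadata layout: the year is taken from the first key of an entry under metadata["item"]["dates"]. A hypothetical example of that structure (the real Item.metadata schema is an assumption here) and the extraction it drives:

# Hypothetical metadata shape assumed by populate_asset_years(); the real
# Item.metadata structure may differ.
metadata = {"item": {"dates": [{"1903": ["1903"]}]}}

year = None
for date_outer in metadata["item"]["dates"]:
    for date_inner in date_outer.keys():
        year = date_inner
        break  # only the first key is used
print(year)  # "1903"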
Example #3
    def embed_sentences(
        self,
        sentences: List[str],
        max_chunk_size: int = 10000,
        partial_save_path: Optional[str] = None,
    ) -> np.ndarray:
        """
        Embeds a list of sentences

        :param sentences: list of sentences to be embedded
        :param max_chunk_size: size of max size of chunk of sentence to be embedded
        :return: tensor with embeddings of all input sentences
        :param partial_save_path: file with partially processed sentences
        """
        logger.debug(f"# of all sentences: {len(sentences)}")
        embeddings = np.empty((0, 512))

        if partial_save_path is not None:
            partial_save_path = Path(partial_save_path).with_suffix(".npy")
            if partial_save_path.exists():
                embeddings = np.load(partial_save_path.as_posix())
                sentences = sentences[embeddings.shape[0]:]
                logger.debug(
                    f"# of already embedded sentences: {embeddings.shape[0]}.")

        n_chunks = int(np.ceil(len(sentences) / max_chunk_size))
        logger.debug(f"# of chunks: {n_chunks} embedding iterations.")

        for sentences_chunk in tqdm(
                chunked(sentences, max_chunk_size),
                desc="Embed sentences in chunks",
                total=n_chunks,
        ):
            values, indices, dense_shape = process_to_ids_in_sparse_format(
                self.sentence_piece_processor, sentences_chunk)

            embeddings = np.append(
                embeddings,
                self.session.run(
                    self.computation,
                    feed_dict={
                        self.input_placeholder.values: values,
                        self.input_placeholder.indices: indices,
                        self.input_placeholder.dense_shape: dense_shape,
                    },
                ),
                axis=0,
            )

            if partial_save_path is not None:
                np.save(partial_save_path.as_posix(), embeddings)

        return embeddings
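The partial_save_path handling above lets a long embedding run resume where it left off: the array saved so far is reloaded, the already-processed sentences are skipped, and a new checkpoint is written after every chunk. A minimal sketch of that checkpoint/resume idea, with a stand-in embed() function in place of the TensorFlow session and computation graph (both are assumptions about the surrounding class, which is not shown here):

import numpy as np
from pathlib import Path
from more_itertools import chunked

def embed(chunk):
    # Stand-in for the real model: one 512-dimensional vector per sentence.
    return np.zeros((len(chunk), 512))

sentences = [f"sentence {i}" for i in range(25)]
save_path = Path("partial").with_suffix(".npy")

embeddings = np.empty((0, 512))
if save_path.exists():
    embeddings = np.load(save_path.as_posix())
    sentences = sentences[embeddings.shape[0]:]  # skip what is already embedded

for chunk in chunked(sentences, 10):
    embeddings = np.append(embeddings, embed(chunk), axis=0)
    np.save(save_path.as_posix(), embeddings)  # checkpoint after every chunk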
Example #4
def grouper_nofill_str(n, iterable):
	"""
	Take a sequence and break it up into chunks of the specified size.
	The last chunk may be smaller than size.

	This works very similar to grouper_nofill, except
	it works with strings as well.

	>>> tuple(grouper_nofill_str(3, 'foobarbaz'))
	('foo', 'bar', 'baz')

	You can still use it on non-strings too if you like.

	>>> tuple(grouper_nofill_str(42, []))
	()

	>>> tuple(grouper_nofill_str(3, list(range(10))))
	([0, 1, 2], [3, 4, 5], [6, 7, 8], [9])
	"""
	res = more.chunked(iterable, n)
	if isinstance(iterable, six.string_types):
		res = (''.join(item) for item in res)
	return res
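The doctests above show the intended behaviour; the function itself assumes its module imports more_itertools as more and still uses six.string_types for Python 2 compatibility. A minimal sketch of the same idea written directly against more_itertools.chunked, with the six check replaced by a plain str check (a Python 3-only assumption):

from more_itertools import chunked

def grouper_nofill_str_sketch(n, iterable):
    # Break the iterable into chunks of size n; the last chunk may be smaller.
    res = chunked(iterable, n)
    if isinstance(iterable, str):
        # Re-join character chunks so strings come back as strings.
        res = (''.join(item) for item in res)
    return res

print(tuple(grouper_nofill_str_sketch(3, 'foobarbaz')))  # ('foo', 'bar', 'baz')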