Пример #1
0
def run_word2vec_generate_snapshot_model(snapshots_id: int) -> None:
    """Train a word2vec model for one snapshot and hand it to the model store.

    The ID is decoded from bytes if needed and coerced to ``int``; a database
    handle is then obtained from ``connect_to_db()`` and shared between a
    ``SnapshotSentenceIterator`` and a ``SnapshotDatabaseModelStore``, both of
    which are passed to ``train_word2vec_model()``.

    :param snapshots_id: ID of the snapshot to generate the model for.
    :raises McWord2vecGenerateSnapshotModelException: if ``snapshots_id`` is None.
    """

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(snapshots_id, bytes):
        snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)

    if snapshots_id is None:
        raise McWord2vecGenerateSnapshotModelException(
            "'snapshots_id' is None.")

    snapshots_id = int(snapshots_id)
    db = connect_to_db()

    start_message = "Generating word2vec model for snapshot %d..." % snapshots_id
    log.info(start_message)

    # Both helpers are built from the same handle and snapshot ID; keyword
    # arguments are evaluated left to right, so the iterator is still
    # constructed before the model store.
    shared_kwargs = {'db': db, 'snapshots_id': snapshots_id}
    train_word2vec_model(
        sentence_iterator=SnapshotSentenceIterator(**shared_kwargs),
        model_store=SnapshotDatabaseModelStore(**shared_kwargs),
    )

    done_message = "Finished generating word2vec model for snapshot %d." % snapshots_id
    log.info(done_message)
    def test_train_word2vec_model(self):
        """Train a model from the test snapshot and verify that it round-trips.

        The trained model is read back from the model store as bytes, written
        to a temporary file and loaded with gensim's binary word2vec loader to
        make sure the stored payload is a usable model.
        """
        sentence_iterator = SnapshotSentenceIterator(
            db=self.db,
            topics_id=self.topics_id,
            snapshots_id=self.snapshots_id,
            stories_id_chunk_size=self.TEST_STORIES_ID_CHUNK_SIZE,
        )
        model_store = SnapshotDatabaseModelStore(
            db=self.db,
            topics_id=self.topics_id,
            snapshots_id=self.snapshots_id)

        models_id = train_word2vec_model(sentence_iterator=sentence_iterator,
                                         model_store=model_store)

        model_data = model_store.read_model(models_id=models_id)
        assert model_data is not None
        assert isinstance(model_data, bytes)

        # Save to file, make sure it loads. The context manager removes the
        # temporary directory even when loading or one of the assertions
        # fails, so a failing test no longer leaves files behind.
        with tempfile.TemporaryDirectory() as temp_directory:
            temp_model_path = os.path.join(temp_directory, 'word2vec.pickle')
            with open(temp_model_path, mode='wb') as temp_model_file:
                temp_model_file.write(model_data)

            word_vectors = gensim.models.KeyedVectors.load_word2vec_format(
                temp_model_path, binary=True)

            assert word_vectors is not None
            assert word_vectors['story'] is not None
            assert word_vectors['sentence'] is not None

            # A word that the test expects to be absent from the vocabulary
            assert 'badger' not in word_vectors
 def test_snapshot_sentence_iterator_nonexistent_snapshot(self):
     """Expect McWord2vecException when the iterator is built for snapshot 123456.

     NOTE(review): presumably no snapshot with this ID exists in the test
     database -- confirm against the test fixtures.
     """
     nonexistent_snapshots_id = 123456

     with pytest.raises(McWord2vecException):
         SnapshotSentenceIterator(
             db=self.db,
             snapshots_id=nonexistent_snapshots_id,
             stories_id_chunk_size=self.TEST_STORIES_ID_CHUNK_SIZE,
         )
def run_word2vec_generate_snapshot_model(snapshots_id: int) -> None:
    """Generate word2vec model for a given snapshot.

    The snapshot's ``topics_id`` is looked up in the ``snapshots`` table and
    passed, together with the snapshot ID, to both the sentence iterator and
    the model store used by ``train_word2vec_model()``.

    :param snapshots_id: ID of the snapshot to generate the model for.
    :raises McWord2vecGenerateSnapshotModelException: if ``snapshots_id`` is
        None, or if no row with that ID is found in ``snapshots``.
    """

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(snapshots_id, bytes):
        snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)

    if snapshots_id is None:
        raise McWord2vecGenerateSnapshotModelException(
            "'snapshots_id' is None.")

    snapshots_id = int(snapshots_id)

    db = connect_to_db()

    # FIXME might be more efficient to pass topics_id as a parameter
    topics_ids = db.query(
        """
        SELECT topics_id
        FROM snapshots
        WHERE snapshots_id = %(snapshots_id)s
    """, {
            'snapshots_id': snapshots_id
        }).flat()

    # An unknown snapshot yields no rows; fail with a descriptive error
    # instead of an opaque IndexError from indexing an empty result.
    if not topics_ids:
        raise McWord2vecGenerateSnapshotModelException(
            f"Snapshot {snapshots_id} was not found.")

    topics_id = topics_ids[0]

    log.info(
        f"Generating word2vec model for topic {topics_id}, snapshot {snapshots_id}..."
    )

    sentence_iterator = SnapshotSentenceIterator(db=db,
                                                 topics_id=topics_id,
                                                 snapshots_id=snapshots_id)
    model_store = SnapshotDatabaseModelStore(db=db,
                                             topics_id=topics_id,
                                             snapshots_id=snapshots_id)
    train_word2vec_model(sentence_iterator=sentence_iterator,
                         model_store=model_store)

    log.info(
        f"Finished generating word2vec model for topic {topics_id}, snapshot {snapshots_id}."
    )
    def test_snapshot_sentence_iterator(self):
        """Check that the iterator yields every expected sentence exactly once.

        Each yielded item must be non-empty, no joined sentence may repeat,
        and the total must equal TEST_STORY_COUNT * TEST_SENTENCE_PER_STORY_COUNT.
        """

        iterator = SnapshotSentenceIterator(
            db=self.db,
            snapshots_id=self.snapshots_id,
            stories_id_chunk_size=self.TEST_STORIES_ID_CHUNK_SIZE,
        )

        unique_sentences = set()
        # Stays at zero if the iterator yields nothing at all
        sentences_returned = 0

        for sentences_returned, words in enumerate(iterator, start=1):
            assert words, "Sentence words should be set."

            joined = ' '.join(words)
            assert joined not in unique_sentences, "Every sentence should be unique."
            unique_sentences.add(joined)

        expected_sentence_count = self.TEST_STORY_COUNT * self.TEST_SENTENCE_PER_STORY_COUNT
        assert sentences_returned == expected_sentence_count, \
            "All of the sentences should have been returned."