Пример #1
0
    def embeddings(dbfile, vectors, maxsize):
        """
        Creates an Embeddings instance with a fixed configuration and indexes
        the documents streamed from dbfile.

        Args:
            dbfile: input SQLite file
            vectors: vector path, used as the "path" configuration value
            maxsize: maximum number of documents to process

        Returns:
            embeddings index
        """

        # Fixed model settings, only the vector path comes from the caller
        config = {"path": vectors, "scoring": "bm25", "pca": 3, "quantize": True}
        model = Embeddings(config)

        # The scoring pass only runs when a scoring method is configured
        scoring = model.config.get("scoring")
        if scoring:
            model.score(Index.stream(dbfile, maxsize))

        # Indexing pass, a new stream is requested rather than reusing the scoring one
        model.index(Index.stream(dbfile, maxsize))

        return model
Пример #2
0
    def embeddings(dbfile):
        """
        Creates an Embeddings instance over the "stackexchange-300d.magnitude"
        vectors and indexes the documents streamed from dbfile.

        Args:
            dbfile: input SQLite file

        Returns:
            embeddings index
        """

        # Fixed model settings, the vector file location is resolved through Models
        config = {
            "path": Models.vectorPath("stackexchange-300d.magnitude"),
            "storevectors": True,
            "scoring": "bm25",
            "pca": 3,
            "quantize": True
        }
        model = Embeddings(config)

        # The scoring pass only runs when a scoring method is configured
        scoring = model.config.get("scoring")
        if scoring:
            model.score(Index.stream(dbfile))

        # Indexing pass, a new stream is requested rather than reusing the scoring one
        model.index(Index.stream(dbfile))

        return model
Пример #3
0
    def testWords(self):
        """
        Test embeddings backed by word vectors

        Builds word vectors from self.data, scores and indexes the same data
        with an Embeddings model, then asserts that a search returns a
        non-None result both before and after a save/load round trip.
        """

        # Initialize model path
        path = os.path.join(tempfile.gettempdir(), "model")
        os.makedirs(path, exist_ok=True)

        # Build tokens file, one entry of self.data per line. delete=False keeps
        # the file on disk once the with block closes it, so it can be passed
        # by name to WordVectors.build.
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as output:
            tokens = output.name
            for x in self.data:
                output.write(x + "\n")

        # Word vectors path
        vectors = os.path.join(path, "test-300d")

        try:
            # Build word vectors from the tokens file
            WordVectors.build(tokens, 300, 1, vectors)
        finally:
            # The tokens file is not referenced again in this test. Remove it
            # so repeated runs do not accumulate files in the temp directory.
            if os.path.exists(tokens):
                os.remove(tokens)

        # Create dataset
        data = [(x, row, None) for x, row in enumerate(self.data)]

        # Create embeddings model, backed by word vectors
        embeddings = Embeddings({
            "path": vectors + ".magnitude",
            "storevectors": True,
            "scoring": "bm25",
            "pca": 3,
            "quantize": True
        })

        # Call scoring and index methods
        embeddings.score(data)
        embeddings.index(data)

        # Test search
        self.assertIsNotNone(embeddings.search("win", 1))

        # Generate temp file path
        index = os.path.join(tempfile.gettempdir(), "wembeddings")

        # Test save/load
        embeddings.save(index)
        embeddings.load(index)

        # Test search
        self.assertIsNotNone(embeddings.search("win", 1))
0
    def train(vector, score):
        """
        Builds an Embeddings model and indexes it with the STS dev and train rows.

        Args:
            vector: word vector model path
            score: scoring method (bm25, sif, tfidf or None for averaging)

        Returns:
            trained Embeddings model
        """

        print("Building model")
        config = {"path": Models.vectorPath(vector), "scoring": score, "pca": 3}
        model = Embeddings(config)

        # Dev rows are read first and placed ahead of the train rows
        devrows = STS.read(Models.testPath("stsbenchmark", "sts-dev.csv"))
        trainrows = STS.read(Models.testPath("stsbenchmark", "sts-train.csv"))

        documents = []
        for row in devrows + trainrows:
            # Columns 2 and 3 are joined with a space and tokenized as one text
            # (presumably the two sentences of the STS pair, confirm against STS.read)
            tokens = Tokenizer.tokenize(row[2] + " " + row[3])

            # Rows that produce no tokens are reported and left out
            if not tokens:
                print("Skipping all stop word string: ", row)
                continue

            documents.append((row[0], tokens, None))

        # The scoring pass only runs when a scoring method is configured
        scoring = model.config.get("scoring")
        if scoring:
            model.score(documents)

        # Indexing pass
        model.index(documents)

        return model
Пример #5
0
    def embeddings(dbfile, vectors, maxsize):
        """
        Creates an Embeddings instance from the configuration returned by
        Index.config and indexes the documents streamed from dbfile.

        Args:
            dbfile: input SQLite file
            vectors: path to vectors file or configuration
            maxsize: maximum number of documents to process

        Returns:
            embeddings index
        """

        # Resolve the configuration first, then create the model from it
        config = Index.config(vectors)
        model = Embeddings(config)

        # The scoring pass only runs when a scoring method is configured
        scoring = model.config.get("scoring")
        if scoring:
            model.score(Index.stream(dbfile, maxsize))

        # Indexing pass, a new stream is requested rather than reusing the scoring one
        model.index(Index.stream(dbfile, maxsize))

        return model