Example #1
    def add_word_to_stringstore(self, word, path):
        try:
            # Reuse the store previously saved at `path`, if any.
            self.stringstore = StringStore().from_disk(path)
            self.stringstore.add(word)
        except Exception:
            # No readable store on disk yet: start a fresh one seeded with the word.
            # Note: StringStore expects an iterable of strings, so wrap the word in a list.
            self.stringstore = StringStore([word])
        self.stringstore.to_disk(path)
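For context, a minimal standalone sketch of the same load-or-create pattern, using only spaCy's StringStore API (the helper name and path handling are illustrative, not from the original):

from pathlib import Path
from spacy.strings import StringStore

def add_word_to_store(word, path):
    # Hypothetical helper: reuse the store saved at `path` if it exists,
    # otherwise start a fresh one, then persist the updated store.
    path = Path(path)
    store = StringStore().from_disk(path) if path.exists() else StringStore()
    store.add(word)
    store.to_disk(path)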
Example #2
    def __init__(
            self,
            shape: tuple = (1000, 128),
            strings: StringStore = None,
            senses: List[str] = [],
            vectors_name: str = "sense2vec",
            overrides: Dict[str, str] = SimpleFrozenDict(),
    ):
        """Initialize the Sense2Vec object.

        shape (tuple): The vector shape.
        strings (StringStore): Optional string store. Will be created if it
            doesn't exist.
        senses (list): Optional list of all available senses. Used in methods
            that generate the best sense or other senses.
        vectors_name (unicode): Optional name to assign to the Vectors object.
        overrides (dict): Optional custom functions to use, mapped to names
            registered via the registry, e.g. {"make_key": "custom_make_key"}.
        RETURNS (Sense2Vec): The newly constructed object.
        """
        self.vectors = Vectors(shape=shape, name=vectors_name)
        self._row2key = None
        self.strings = StringStore() if strings is None else strings
        self.freqs: Dict[int, int] = {}
        self.cache = None
        self.cfg: Dict[str, Any] = {
            "senses": senses,
            "make_key": "default",
            "split_key": "default",
        }
        self.cfg.update(overrides)
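A brief construction sketch based on the signature above (the shape and sense labels are invented for illustration):

from spacy.strings import StringStore

# Share a pre-existing StringStore and declare the available senses up front.
shared_strings = StringStore()
s2v = Sense2Vec(shape=(100, 128), strings=shared_strings, senses=["NOUN", "VERB"])
assert s2v.cfg["senses"] == ["NOUN", "VERB"]
assert s2v.strings is shared_strings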
Example #3
def test_dump_load(sstore):
    id_ = sstore[u'qqqqq']
    with tempfile.TemporaryFile('w+t') as file_:
        sstore.dump(file_)
        file_.seek(0)
        new_store = StringStore()
        new_store.load(file_)
    assert new_store[id_] == u'qqqqq'
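Note that dump() and load() here are the old spaCy 0.x file-based API. With the current API, a roughly equivalent roundtrip goes through to_bytes()/from_bytes(), as in this sketch:

from spacy.strings import StringStore

store = StringStore()
id_ = store.add("qqqqq")
new_store = StringStore().from_bytes(store.to_bytes())
assert new_store[id_] == "qqqqq"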
Example #4
def test_dump_load(sstore):
    id_ = sstore[u'qqqqq']
    with tempfile.TemporaryFile('w+t') as file_:
        sstore.dump(file_)
        file_.seek(0)
        new_store = StringStore()
        new_store.load(file_)
    assert new_store[id_] == u'qqqqq'
Example #5
    def add_stringstore_to_vocab_temporarely(self, file):
        try:
            self.stringstore = StringStore().from_disk(file)
            for word in self.stringstore:
                # Looking up the word creates the lexeme in the vocab; mark it as known.
                lex = self.nlp.vocab[word]
                lex.is_oov = False
        except Exception:
            print("cannot read stringstore in file " + file)
Example #6
def test_dump_load(sstore):
    id_ = sstore[u'qqqqq']
    loc = '/tmp/sstore.json'
    with io.open(loc, 'w', encoding='utf8') as file_:
        sstore.dump(file_)
    new_store = StringStore()
    with io.open(loc, 'r', encoding='utf8') as file_:
        new_store.load(file_)
    assert new_store[id_] == u'qqqqq'
Example #7
    def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
        """Load a Sense2Vec object from a bytestring.

        bytes_data (bytes): The data to load.
        exclude (list): Names of serialization fields to exclude.
        RETURNS (Sense2Vec): The loaded object.
        """
        data = srsly.msgpack_loads(bytes_data)
        self.vectors = Vectors().from_bytes(data["vectors"])
        self.freqs = dict(data.get("freqs", []))
        self.cfg.update(data.get("cfg", {}))
        if "strings" not in exclude and "strings" in data:
            self.strings = StringStore().from_bytes(data["strings"])
        return self
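A minimal roundtrip sketch, assuming a populated Sense2Vec object `s2v` and the to_bytes() counterpart shown in Example #28:

# Serialize everything except the cache, then restore into a fresh object.
data = s2v.to_bytes(exclude=["cache"])
s2v_restored = Sense2Vec().from_bytes(data)
assert len(s2v_restored) == len(s2v)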
Example #8
def process(batch_id, inputs, output_dir, lang, n_threads, batch_size,
            min_ngram, max_ngram):
    logging.info('Processing batch_id: {}'.format(batch_id))
    subtrees = PreshCounter()
    subtrees_string_map = StringStore()
    noun_chunks = PreshCounter()
    noun_chunks_string_map = StringStore()

    if lang.lower() == "en":
        from spacy.en import English
        NLU = English()
        NLU.matcher = None
    elif lang.lower() == "id":
        from spacy.id import Indonesian
        NLU = Indonesian()
        NLU.matcher = None
    else:
        raise ValueError('unsupported language: {}'.format(lang))

    for i, doc in enumerate(
            NLU.pipe(inputs, batch_size=batch_size, n_threads=n_threads)):
        phrases = set()
        for tok in doc:
            st_len = len(list(tok.subtree))
            if min_ngram <= st_len <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws)
                              for t in tok.subtree]).strip()
                orth = subtrees_string_map[st]
                subtrees.inc(orth, 1)
        for np in doc.noun_chunks:
            if min_ngram <= len(np) <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws) for t in np]).strip()
                orth = noun_chunks_string_map[st]
                noun_chunks.inc(orth, 1)

        if i % batch_size == 0:
            logging.info('Processing batch_id: {}, doc: {}'.format(
                batch_id, i))

    output_fname = path.join(output_dir, 'batch{}.st.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in subtrees:
            st = subtrees_string_map[orth]
            if count >= 5 and '!LONGWORD!' not in st:
                out.write('{}\t{}\n'.format(count, st))

    output_fname = path.join(output_dir, 'batch{}.np.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in noun_chunks:
            if count >= 5:
                st = noun_chunks_string_map[orth]
                out.write('{}\t{}\n'.format(count, st))
Example #9
def build_doc_term_matrix(terms_lists, weighted=True):
    '''
    Construct a sparse document-term matrix from a sequence of term lists, optionally
    weighting each term by its position within its document (i.e. its rank in a SERP).

    :param terms_lists: sequence of documents, each given as a sequence of (str) terms
        (e.g. the URLs returned for a query)
    :param weighted: if True, weight each term by its rank in its list
    :return: sparse matrix of shape (# docs, # unique terms)
    '''
    stringstore = StringStore()

    data = []
    rows = []
    cols = []
    for row_idx, terms_list in enumerate(terms_lists):
        bow = tuple((stringstore[term] - 1, 1. / (i**2 + 2) if weighted else 1)
                    for i, term in enumerate(terms_list) if term)

        data.extend(count for _, count in bow)
        cols.extend(term_id for term_id, _ in bow)
        rows.extend(itertools.repeat(row_idx, times=len(bow)))

    doc_term_matrix = sp.coo_matrix((data, (rows, cols)),
                                    dtype=float if weighted else int).tocsr()

    return doc_term_matrix
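A hypothetical call; note that this helper assumes the old spaCy 1.x StringStore, which hands out small sequential integer IDs (hence the `stringstore[term] - 1` offset):

serp_terms = [["news", "weather", "news"], ["weather", "sports", "scores"]]
dtm = build_doc_term_matrix(serp_terms, weighted=True)
print(dtm.shape)  # (2, number of unique terms)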
Example #10
def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
    sstore1 = StringStore(strings=strings1)
    sstore2 = StringStore(strings=strings2)
    with make_tempdir() as d:
        file_path1 = d / "strings1"
        file_path2 = d / "strings2"
        sstore1.to_disk(file_path1)
        sstore2.to_disk(file_path2)
        sstore1_d = StringStore().from_disk(file_path1)
        sstore2_d = StringStore().from_disk(file_path2)
        assert list(sstore1_d) == list(sstore1)
        assert list(sstore2_d) == list(sstore2)
        if strings1 == strings2:
            assert list(sstore1_d) == list(sstore2_d)
        else:
            assert list(sstore1_d) != list(sstore2_d)
Example #11
def docs_to_gensim(spacy_docs,
                   spacy_vocab,
                   lemmatize=True,
                   filter_stops=True,
                   filter_punct=True,
                   filter_nums=False):
    """
    Convert multiple ``spacy.Doc`` s into a gensim dictionary and bag-of-words corpus.

    Args:
        spacy_docs (list(``spacy.Doc``))
        spacy_vocab (``spacy.Vocab``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list(list((int, int))): list of bag-of-words documents, where each doc is
            a list of (integer word ID, word count) 2-tuples
    """
    gdict = Dictionary()
    gcorpus = []
    stringstore = StringStore()
    doc_freqs = Counter()

    for spacy_doc in spacy_docs:
        if lemmatize is True:
            bow = (
                (spacy_vocab[tok_id], count)
                for tok_id, count in spacy_doc.count_by(attrs.LEMMA).items())
        else:
            bow = ((spacy_vocab[tok_id], count)
                   for tok_id, count in spacy_doc.count_by(attrs.ORTH).items())

        if filter_stops is True:
            bow = ((lex, count) for lex, count in bow if not lex.is_stop)
        if filter_punct is True:
            bow = ((lex, count) for lex, count in bow if not lex.is_punct)
        if filter_nums is True:
            bow = ((lex, count) for lex, count in bow if not lex.like_num)

        bow = sorted(((stringstore[lex.orth_], count) for lex, count in bow),
                     key=itemgetter(0))

        doc_freqs.update(tok_id for tok_id, _ in bow)
        gdict.num_docs += 1
        gdict.num_pos += sum(count for _, count in bow)
        gdict.num_nnz += len(bow)

        gcorpus.append(bow)

    gdict.token2id = {s: i for i, s in enumerate(stringstore)}
    gdict.dfs = dict(doc_freqs)

    return (gdict, gcorpus)
Example #12
def sort(path: Path):
    """Sort the strings from the vocabulary of a spaCy model.

    For the original code of StringStore.to_disk(), see https://github.com/explosion/spaCy/blob/53a3b967ac704ff0a67a7102ede6d916e2a4545a/spacy/strings.pyx#L219-L227.
    """
    st = StringStore().from_disk(path)
    strings = sorted(st)
    srsly.write_json(path, strings)
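An illustrative invocation; the path is hypothetical and only needs to point at a strings.json previously written by StringStore.to_disk(). The call rewrites the file in place with its strings sorted:

from pathlib import Path

sort(Path("my_model/vocab/strings.json"))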
Example #13
    def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
        """Load a Sense2Vec object from a directory.

        path (unicode / Path): The path to load from.
        exclude (list): Names of serialization fields to exclude.
        RETURNS (Sense2Vec): The loaded object.
        """
        path = Path(path)
        strings_path = path / "strings.json"
        freqs_path = path / "freqs.json"
        self.vectors = Vectors().from_disk(path)
        self.cfg.update(srsly.read_json(path / "cfg"))
        if freqs_path.exists():
            self.freqs = dict(srsly.read_json(freqs_path))
        if "strings" not in exclude and strings_path.exists():
            self.strings = StringStore().from_disk(strings_path)
        return self
Example #14
def test_pickle_string_store(text1, text2):
    stringstore = StringStore()
    store1 = stringstore[text1]
    store2 = stringstore[text2]
    data = srsly.pickle_dumps(stringstore, protocol=-1)
    unpickled = srsly.pickle_loads(data)
    assert unpickled[text1] == store1
    assert unpickled[text2] == store2
    assert len(stringstore) == len(unpickled)
Example #15
def docs_to_gensim(spacy_docs,
                   spacy_vocab,
                   lemmatize=True,
                   lowercase=False,
                   filter_stops=True,
                   filter_punct=True,
                   filter_nums=False):
    """
    Convert a sequence of ``spacy.Doc`` s into a gensim-friendly corpus and a
    string that can be loaded into a :class:`gensim.corpora.Dictionary`.

    Args:
        spacy_docs (Iterable[``spacy.Doc``])
        spacy_vocab (``spacy.Vocab``)
        lemmatize (bool): if True, use lemmatized strings for words
        lowercase (bool): if True (and ``lemmatize`` is False), use lowercased
            strings for words
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        str: words, their integer ids, and their document frequencies in
            ``spacy_docs``, as a string formatted like `id[TAB]word[TAB]df[NEWLINE]`;
            when written to file, can be converted into a gensim ``Dictionary``
            via :meth:`gensim.corpora.Dictionary.load_from_text()`
        List[List[Tuple[int, int]]]: list of documents as bags-of-words, where
            each doc is a list of (integer word ID, word count) 2-tuples
    """
    count_by = (attrs.LEMMA if lemmatize is True else
                attrs.LOWER if lowercase is True else attrs.ORTH)
    gcorpus = []
    stringstore = StringStore()
    doc_freqs = Counter()

    for spacy_doc in spacy_docs:
        bow = ((spacy_vocab[tok_id], count)
               for tok_id, count in spacy_doc.count_by(count_by).items())
        bow = ((lex, count) for lex, count in bow if not lex.is_space)
        if filter_stops is True:
            bow = ((lex, count) for lex, count in bow if not lex.is_stop)
        if filter_punct is True:
            bow = ((lex, count) for lex, count in bow if not lex.is_punct)
        if filter_nums is True:
            bow = ((lex, count) for lex, count in bow if not lex.like_num)
        bow = sorted(((stringstore[lex.orth_], count) for lex, count in bow),
                     key=itemgetter(0))

        doc_freqs.update(tok_id for tok_id, _ in bow)
        gcorpus.append(bow)

    gdict_str = '\n'.join(
        '{}\t{}\t{}'.format(i, s, doc_freqs[i])
        for i, s in sorted(enumerate(stringstore), key=itemgetter(1)))

    return (gdict_str, gcorpus)
Example #16
def test_unapplicable_trees():
    strings = StringStore()
    trees = EditTrees(strings)
    tree3 = trees.add("deelt", "delen")

    # Replacement fails.
    assert trees.apply(tree3, "deeld") is None

    # Suffix + prefix are too large.
    assert trees.apply(tree3, "de") is None
Example #17
def test_dutch():
    strings = StringStore()
    trees = EditTrees(strings)
    tree = trees.add("deelt", "delen")
    assert trees.tree_to_str(
        tree) == "(m 0 3 () (m 0 2 (s '' 'l') (s 'lt' 'n')))"

    tree = trees.add("gedeeld", "delen")
    assert (trees.tree_to_str(tree) ==
            "(m 2 3 (s 'ge' '') (m 0 2 (s '' 'l') (s 'ld' 'n')))")
Example #18
    def __init__(self):
        import os
        from sagas.conf.conf import cf
        from pyltp import Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
        from spacy.strings import StringStore

        self.stringstore = StringStore()

        MODELDIR = f'{cf.conf_dir}/ai/ltp/ltp_data_v3.4.0'
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))
        par_model_path = os.path.join(MODELDIR, 'parser.model')
        self.parser = Parser()
        self.parser.load(par_model_path)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(MODELDIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))

        self.conf = AnalConf('zh')
        self.conf.setup(self)
Example #19
    def test_encode_decode(self):
        strings = StringStore()
        hello_id = strings[u'Hello']
        world_id = strings[u'World']

        self.assertNotEqual(hello_id, world_id)

        self.assertEqual(strings[hello_id], u'Hello')
        self.assertEqual(strings[world_id], u'World')

        self.assertEqual(strings[u'Hello'], hello_id)
        self.assertEqual(strings[u'World'], world_id)
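The same roundtrip outside the unittest harness, as a quick sketch; in spaCy v2 and later the returned IDs are 64-bit hash values rather than small sequential integers:

from spacy.strings import StringStore

strings = StringStore()
hello_id = strings.add("Hello")
world_id = strings.add("World")
assert hello_id != world_id
assert strings[hello_id] == "Hello"
assert strings["World"] == world_id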
Example #20
def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
    sstore1 = StringStore(strings=strings1)
    sstore2 = StringStore(strings=strings2)
    sstore1_b = sstore1.to_bytes()
    sstore2_b = sstore2.to_bytes()
    if strings1 == strings2:
        assert sstore1_b == sstore2_b
    else:
        assert sstore1_b != sstore2_b
    sstore1 = sstore1.from_bytes(sstore1_b)
    assert sstore1.to_bytes() == sstore1_b
    new_sstore1 = StringStore().from_bytes(sstore1_b)
    assert new_sstore1.to_bytes() == sstore1_b
    assert list(new_sstore1) == strings1
Example #21
def word_movers(doc1, doc2, metric='cosine'):
    """
    Measure the semantic similarity between two documents using Word Movers
    Distance.

    Args:
        doc1 (``textacy.Doc`` or ``spacy.Doc``)
        doc2 (``textacy.Doc`` or ``spacy.Doc``)
        metric ({'cosine', 'euclidean', 'l1', 'l2', 'manhattan'})

    Returns:
        float: similarity between `doc1` and `doc2` in the interval [0.0, 1.0],
            where larger values correspond to more similar documents

    References:
        Ofir Pele and Michael Werman, "A linear time histogram metric for improved
            SIFT matching," in Computer Vision - ECCV 2008, Marseille, France, 2008.
        Ofir Pele and Michael Werman, "Fast and robust earth mover's distances,"
            in Proc. 2009 IEEE 12th Int. Conf. on Computer Vision, Kyoto, Japan, 2009.
        Kusner, Matt J., et al. "From word embeddings to document distances."
            Proceedings of the 32nd International Conference on Machine Learning
            (ICML 2015). 2015. http://jmlr.org/proceedings/papers/v37/kusnerb15.pdf
    """
    stringstore = StringStore()

    n = 0
    word_vecs = []
    for word in itertoolz.concatv(extract.words(doc1), extract.words(doc2)):
        if word.has_vector:
            # stringstore[0] is always the empty string, so ids start at 1
            if stringstore[word.text] - 1 == n:
                word_vecs.append(word.vector)
                n += 1
    distance_mat = pairwise_distances(np.array(word_vecs),
                                      metric=metric).astype(np.double)
    distance_mat /= distance_mat.max()

    vec1 = collections.Counter(stringstore[word.text] - 1
                               for word in extract.words(doc1)
                               if word.has_vector)
    vec1 = np.array([vec1[word_idx] for word_idx in range(len(stringstore))
                     ]).astype(np.double)
    vec1 /= vec1.sum()  # normalize word counts

    vec2 = collections.Counter(stringstore[word.text] - 1
                               for word in extract.words(doc2)
                               if word.has_vector)
    vec2 = np.array([vec2[word_idx] for word_idx in range(len(stringstore))
                     ]).astype(np.double)
    vec2 /= vec2.sum()  # normalize word counts

    return 1.0 - emd(vec1, vec2, distance_mat)
Example #22
def merge_counts(locs, out_loc):
    string_map = StringStore()
    counts = PreshCounter()
    for loc in locs:
        with io.open(loc, 'r', encoding='utf8') as file_:
            for line in file_:
                freq, word = line.strip().split('\t', 1)
                orth = string_map[word]
                counts.inc(orth, int(freq))
    with io.open(out_loc, 'w', encoding='utf8') as file_:
        for orth, count in counts:
            string = string_map[orth]
            file_.write('%d\t%s\n' % (count, string))
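An example invocation (file names invented), assuming each input holds the tab-separated "count<TAB>word" lines produced by the batch jobs in Example #8:

# Writes one "total_count<TAB>word" line per distinct word across all inputs.
merge_counts(["batch0.st.freq", "batch1.st.freq"], "merged.st.freq")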
Example #23
def test_from_to_bytes():
    strings = StringStore()
    trees = EditTrees(strings)
    trees.add("deelt", "delen")
    trees.add("gedeeld", "delen")

    b = trees.to_bytes()

    trees2 = EditTrees(strings)
    trees2.from_bytes(b)

    # Verify that the nodes did not change.
    assert len(trees) == len(trees2)
    for i in range(len(trees)):
        assert trees.tree_to_str(i) == trees2.tree_to_str(i)

    # Reinserting the same trees should not add new nodes.
    trees2.add("deelt", "delen")
    trees2.add("gedeeld", "delen")
    assert len(trees) == len(trees2)
Example #24
def test_from_to_disk():
    strings = StringStore()
    trees = EditTrees(strings)
    trees.add("deelt", "delen")
    trees.add("gedeeld", "delen")

    trees2 = EditTrees(strings)
    with make_tempdir() as temp_dir:
        trees_file = temp_dir / "edit_trees.bin"
        trees.to_disk(trees_file)
        trees2 = trees2.from_disk(trees_file)

    # Verify that the nodes did not change.
    assert len(trees) == len(trees2)
    for i in range(len(trees)):
        assert trees.tree_to_str(i) == trees2.tree_to_str(i)

    # Reinserting the same trees should not add new nodes.
    trees2.add("deelt", "delen")
    trees2.add("gedeeld", "delen")
    assert len(trees) == len(trees2)
Example #25
def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
    sstore1 = StringStore(strings=strings1)
    sstore2 = StringStore(strings=strings2)
    with make_tempdir() as d:
        file_path1 = d / "strings1"
        file_path2 = d / "strings2"
        sstore1.to_disk(file_path1)
        sstore2.to_disk(file_path2)
        sstore1_d = StringStore().from_disk(file_path1)
        sstore2_d = StringStore().from_disk(file_path2)
        assert list(sstore1_d) == list(sstore1)
        assert list(sstore2_d) == list(sstore2)
        if strings1 == strings2:
            assert list(sstore1_d) == list(sstore2_d)
        else:
            assert list(sstore1_d) != list(sstore2_d)
Example #26
def morphology():
    morphology = Morphology(StringStore())
    morphology.add("Feat1=Val1|Feat2=Val2")
    morphology.add("Feat3=Val3|Feat4=Val4")
    return morphology
Example #27
    def test_create(self):
        strings = StringStore()
        lemmatizer = Lemmatizer({}, {}, {})
        morphology = Morphology(strings, {}, lemmatizer)
Example #28
class Sense2Vec(object):
    def __init__(
            self,
            shape: tuple = (1000, 128),
            strings: StringStore = None,
            senses: List[str] = [],
            vectors_name: str = "sense2vec",
            overrides: Dict[str, str] = SimpleFrozenDict(),
    ):
        """Initialize the Sense2Vec object.

        shape (tuple): The vector shape.
        strings (StringStore): Optional string store. Will be created if it
            doesn't exist.
        senses (list): Optional list of all available senses. Used in methods
            that generate the best sense or other senses.
        vectors_name (unicode): Optional name to assign to the Vectors object.
        overrides (dict): Optional custom functions to use, mapped to names
            registered via the registry, e.g. {"make_key": "custom_make_key"}.
        RETURNS (Sense2Vec): The newly constructed object.
        """
        self.vectors = Vectors(shape=shape, name=vectors_name)
        self._row2key = None
        self.strings = StringStore() if strings is None else strings
        self.freqs: Dict[int, int] = {}
        self.cache = None
        self.cfg: Dict[str, Any] = {
            "senses": senses,
            "make_key": "default",
            "split_key": "default",
        }
        self.cfg.update(overrides)

    @property
    def senses(self) -> Sequence[str]:
        """RETURNS (list): The available senses."""
        return self.cfg.get("senses", [])

    @property
    def frequencies(self) -> List[Tuple[str, int]]:
        """RETURNS (list): The (key, freq) tuples by frequency, descending."""
        freqs = [(self.strings[k], s) for k, s in self.freqs.items()
                 if s is not None]
        return sorted(freqs, key=lambda item: item[1], reverse=True)

    def __len__(self) -> int:
        """RETURNS (int): The number of rows in the vectors table."""
        return len(self.vectors)

    def __contains__(self, key: Union[str, int]) -> bool:
        """Check if a key is in the vectors table.

        key (unicode / int): The key to look up.
        RETURNS (bool): Whether the key is in the table.
        """
        key = self.ensure_int_key(key)
        return key in self.vectors

    def __getitem__(self, key: Union[str, int]) -> Union[numpy.ndarray, None]:
        """Retrieve a vector for a given key. Returns None if the key is not
        in the table.

        key (unicode / int): The key to look up.
        RETURNS (numpy.ndarray): The vector.
        """
        key = self.ensure_int_key(key)
        if key in self.vectors:
            return self.vectors[key]
        return None

    def __setitem__(self, key: Union[str, int], vector: numpy.ndarray):
        """Set a vector for a given key. Will raise an error if the key
        doesn't exist.

        key (unicode / int): The key.
        vector (numpy.ndarray): The vector to set.
        """
        key = self.ensure_int_key(key)
        if key not in self.vectors:
            raise ValueError(f"Can't find key {key} in table")
        self.vectors[key] = vector
        self._row2key = None

    def __iter__(self):
        """YIELDS (tuple): String key and vector pairs in the table."""
        yield from self.items()

    def items(self):
        """YIELDS (tuple): String key and vector pairs in the table."""
        for key, value in self.vectors.items():
            yield self.strings[key], value

    def keys(self):
        """YIELDS (unicode): The string keys in the table."""
        for key in self.vectors.keys():
            yield self.strings[key]

    def values(self):
        """YIELDS (numpy.ndarray): The vectors in the table."""
        yield from self.vectors.values()

    @property
    def row2key(self):
        if not self._row2key:
            self._row2key = {
                row: key
                for key, row in self.vectors.key2row.items()
            }
        return self._row2key

    @property
    def make_key(self) -> Callable:
        """Get the function to make keys."""
        return registry.make_key.get(self.cfg["make_key"])

    @property
    def split_key(self) -> Callable:
        """Get the function to split keys."""
        return registry.split_key.get(self.cfg["split_key"])

    def add(self,
            key: Union[str, int],
            vector: numpy.ndarray,
            freq: int = None):
        """Add a new vector to the table.

        key (unicode / int): The key to add.
        vector (numpy.ndarray): The vector to add.
        freq (int): Optional frequency count.
        """
        if not isinstance(key, int):
            key = self.strings.add(key)
        self.vectors.add(key, vector=vector)
        if freq is not None:
            self.set_freq(key, freq)
        self._row2key = None

    def get_freq(self, key: Union[str, int], default=None) -> Union[int, None]:
        """Get the frequency count for a given key.

        key (unicode / int): They key to look up.
        default: Default value to return if no frequency is found.
        RETURNS (int): The frequency count.
        """
        key = self.ensure_int_key(key)
        return self.freqs.get(key, default)

    def set_freq(self, key: Union[str, int], freq: int):
        """Set a frequency count for a given key.

        key (unicode / int): The key to set the count for.
        freq (int): The frequency count.
        """
        if not isinstance(freq, int):
            raise ValueError(
                f"Invalid frequency count: {repr(freq)} for '{key}'")
        key = self.ensure_int_key(key)
        self.freqs[key] = freq

    def ensure_int_key(self, key: Union[str, int]) -> int:
        """Ensure that a key is an int by looking it up in the string store.

        key (unicode / int): The key.
        RETURNS (int): The integer key.
        """
        return key if isinstance(key, int) else self.strings.add(key)

    def similarity(
        self,
        keys_a: Union[Sequence[Union[str, int]], str, int],
        keys_b: Union[Sequence[Union[str, int]], str, int],
    ) -> float:
        """Make a semantic similarity estimate of two keys or two sets of keys.
        The default estimate is cosine similarity using an average of vectors.

        keys_a (unicode / int / iterable): The string or integer key(s).
        keys_b (unicode / int / iterable): The other string or integer key(s).
        RETURNS (float): The similarity score.
        """
        if isinstance(keys_a, (str, int)):
            keys_a = [keys_a]
        if isinstance(keys_b, (str, int)):
            keys_b = [keys_b]
        average_a = numpy.vstack([self[key] for key in keys_a]).mean(axis=0)
        average_b = numpy.vstack([self[key] for key in keys_b]).mean(axis=0)
        return cosine_similarity(average_a, average_b)

    def most_similar(
        self,
        keys: Union[Sequence[Union[str, int]], str, int],
        n: int = 10,
        batch_size: int = 16,
    ) -> List[Tuple[str, float]]:
        """Get the most similar entries in the table. If more than one key is
        provided, the average of the vectors is used.

        keys (unicode / int / iterable): The string or integer key(s) to compare to.
        n (int): The number of similar keys to return.
        batch_size (int): The batch size to use.
        RETURNS (list): The (key, score) tuples of the most similar vectors.
        """
        if isinstance(keys, (str, int)):
            keys = [keys]
        for key in keys:
            if key not in self:
                raise ValueError(f"Can't find key {key} in table")
        if self.cache and self.cache["indices"].shape[1] >= n:
            n = min(len(self.vectors), n)
            key = self.ensure_int_key(key)
            key_row = self.vectors.find(key=key)
            if key_row < self.cache["indices"].shape[0]:
                rows = self.cache["indices"][key_row, :n]
                scores = self.cache["scores"][key_row, :n]
                entries = zip(rows, scores)
                entries = [(self.strings[self.row2key[r]], score)
                           for r, score in entries if r in self.row2key]
                return entries
        # Always ask for more because we'll always get the keys themselves
        n = min(len(self.vectors), n + len(keys))
        rows = numpy.asarray(self.vectors.find(keys=keys))
        vecs = self.vectors.data[rows]
        average = vecs.mean(axis=0, keepdims=True)
        result_keys, _, scores = self.vectors.most_similar(
            average, n=n, batch_size=batch_size)
        result = list(zip(result_keys.flatten(), scores.flatten()))
        result = [(self.strings[key], score) for key, score in result if key]
        result = [(key, score) for key, score in result if key not in keys]
        return result

    def get_other_senses(self,
                         key: Union[str, int],
                         ignore_case: bool = True) -> List[str]:
        """Find other entries for the same word with a different sense, e.g.
        "duck|VERB" for "duck|NOUN".

        key (unicode / int): The key to check.
        ignore_case (bool): Check for uppercase, lowercase and titlecase.
        RETURNS (list): The string keys of other entries with different senses.
        """
        result = []
        key = key if isinstance(key, str) else self.strings[key]
        word, orig_sense = self.split_key(key)
        versions = [word, word.upper(), word.title()
                    ] if ignore_case else [word]
        for text in versions:
            for sense in self.senses:
                new_key = self.make_key(text, sense)
                if sense != orig_sense and new_key in self:
                    result.append(new_key)
        return result

    def get_best_sense(self,
                       word: str,
                       senses: Sequence[str] = tuple(),
                       ignore_case: bool = True) -> Union[str, None]:
        """Find the best-matching sense for a given word based on the available
        senses and frequency counts. Returns None if no match is found.

        word (unicode): The word to check.
        senses (list): Optional list of senses to limit the search to. If not
            set / empty, all senses in the vectors are used.
        ignore_case (bool): Check for uppercase, lowercase and titlecase.
        RETURNS (unicode): The best-matching key or None.
        """
        sense_options = senses or self.senses
        if not sense_options:
            return None
        versions = [word, word.upper(), word.title()
                    ] if ignore_case else [word]
        freqs = []
        for text in versions:
            for sense in sense_options:
                key = self.make_key(text, sense)
                if key in self:
                    freq = self.get_freq(key, -1)
                    freqs.append((freq, key))
        return max(freqs)[1] if freqs else None

    def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes:
        """Serialize a Sense2Vec object to a bytestring.

        exclude (list): Names of serialization fields to exclude.
        RETURNS (bytes): The serialized Sense2Vec object.
        """
        vectors_bytes = self.vectors.to_bytes()
        freqs = list(self.freqs.items())
        data = {"vectors": vectors_bytes, "cfg": self.cfg, "freqs": freqs}
        if "strings" not in exclude:
            data["strings"] = self.strings.to_bytes()
        if "cache" not in exclude:
            data["cache"] = self.cache
        return srsly.msgpack_dumps(data)

    def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
        """Load a Sense2Vec object from a bytestring.

        bytes_data (bytes): The data to load.
        exclude (list): Names of serialization fields to exclude.
        RETURNS (Sense2Vec): The loaded object.
        """
        data = srsly.msgpack_loads(bytes_data)
        self.vectors = Vectors().from_bytes(data["vectors"])
        self.freqs = dict(data.get("freqs", []))
        self.cfg.update(data.get("cfg", {}))
        if "strings" not in exclude and "strings" in data:
            self.strings = StringStore().from_bytes(data["strings"])
        if "cache" not in exclude and "cache" in data:
            self.cache = data.get("cache", {})
        self._row2key = None
        return self

    def to_disk(self, path: Union[Path, str],
                exclude: Sequence[str] = tuple()):
        """Serialize a Sense2Vec object to a directory.

        path (unicode / Path): The path.
        exclude (list): Names of serialization fields to exclude.
        """
        path = Path(path)
        self.vectors.to_disk(path)
        srsly.write_json(path / "cfg", self.cfg)
        srsly.write_json(path / "freqs.json", list(self.freqs.items()))
        if "strings" not in exclude:
            self.strings.to_disk(path / "strings.json")
        if "cache" not in exclude and self.cache:
            srsly.write_msgpack(path / "cache", self.cache)

    def from_disk(self,
                  path: Union[Path, str],
                  exclude: Sequence[str] = tuple()):
        """Load a Sense2Vec object from a directory.

        path (unicode / Path): The path to load from.
        exclude (list): Names of serialization fields to exclude.
        RETURNS (Sense2Vec): The loaded object.
        """
        path = Path(path)
        strings_path = path / "strings.json"
        freqs_path = path / "freqs.json"
        cache_path = path / "cache"
        self.vectors = Vectors().from_disk(path)
        self.cfg.update(srsly.read_json(path / "cfg"))
        if freqs_path.exists():
            self.freqs = dict(srsly.read_json(freqs_path))
        if "strings" not in exclude and strings_path.exists():
            self.strings = StringStore().from_disk(strings_path)
        if "cache" not in exclude and cache_path.exists():
            self.cache = srsly.read_msgpack(cache_path)
        self._row2key = None
        return self
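A usage sketch touching the main methods above; it assumes the registry's default make_key/split_key functions, which join and split keys on "|" (the keys, vectors and frequencies below are invented):

import numpy

s2v = Sense2Vec(shape=(2, 4), senses=["NOUN", "VERB"])
s2v.add("duck|NOUN", numpy.asarray([1, 0, 0, 0], dtype="float32"), freq=20)
s2v.add("duck|VERB", numpy.asarray([0, 1, 0, 0], dtype="float32"), freq=5)
assert "duck|NOUN" in s2v
print(s2v.get_other_senses("duck|NOUN"))  # expected: ["duck|VERB"]
print(s2v.get_best_sense("duck"))         # expected: "duck|NOUN" (higher frequency)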
Example #29
def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
    sstore1 = StringStore(strings=strings1)
    sstore2 = StringStore(strings=strings2)
    sstore1_b = sstore1.to_bytes()
    sstore2_b = sstore2.to_bytes()
    if strings1 == strings2:
        assert sstore1_b == sstore2_b
    else:
        assert sstore1_b != sstore2_b
    sstore1 = sstore1.from_bytes(sstore1_b)
    assert sstore1.to_bytes() == sstore1_b
    new_sstore1 = StringStore().from_bytes(sstore1_b)
    assert new_sstore1.to_bytes() == sstore1_b
    assert list(new_sstore1) == strings1
Example #30
def build_doc_term_matrix(terms_lists,
                          weighting='tf',
                          normalize=False, sublinear_tf=False, smooth_idf=True,
                          min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None):
    """
    Build a document-term matrix of shape (# docs, # unique terms) from a sequence
    of documents, each represented as a sequence of (str) terms, with a variety of
    weighting and normalization schemes for the matrix values.

    Args:
        terms_lists (iterable(iterable(str))): a sequence of documents, each as a
            sequence of (str) terms; note that the terms in each doc are to be
            counted, so these probably shouldn't be sets containing *unique* terms;
            example inputs::

                >>> ([tok.lemma_ for tok in spacy_doc]
                ...  for spacy_doc in spacy_docs)
                >>> ((ne.text for ne in doc.named_entities())
                ...  for doc in textcorpus)
                >>> (tuple(ng.text for ng in itertools.chain.from_iterable(doc.ngrams(i) for i in range(1, 3)))
                ...  for doc in docs)

        weighting (str {'tf', 'tfidf', 'binary'}, optional): if 'tf', matrix values
            (i, j) correspond to the number of occurrences of term j in doc i; if
            'tfidf', term frequencies (tf) are multiplied by their corresponding
            inverse document frequencies (idf); if 'binary', all non-zero values
            are set equal to 1
        normalize (bool, optional): if True, normalize term frequencies by the
            L2 norms of the vectors
        sublinear_tf (bool, optional): if True, apply sub-linear term-frequency
            scaling, i.e. tf => 1 + log(tf)
        smooth_idf (bool, optional): if True, add 1 to all document frequencies,
            equivalent to adding a single document to the corpus containing every
            unique term
        min_df (float or int, optional): if float, value is the fractional proportion
            of the total number of documents, which must be in [0.0, 1.0]; if int,
            value is the absolute number; filter terms whose document frequency
            is less than ``min_df``
        max_df (float or int, optional): if float, value is the fractional proportion
            of the total number of documents, which must be in [0.0, 1.0]; if int,
            value is the absolute number; filter terms whose document frequency
            is greater than ``max_df``
        min_ic (float, optional): filter terms whose information content is less
            than `min_ic`; value must be in [0.0, 1.0]
        max_n_terms (int, optional): only include terms whose document frequency
            is within the top ``max_n_terms``

    Returns:
        :class:`scipy.sparse.csr_matrix <scipy.sparse.csr_matrix>`: sparse matrix
            of shape (# docs, # unique terms), where value (i, j) is the weight
            of term j in doc i
        dict: id to term mapping, where keys are unique integers as term ids and
            values are corresponding strings
    """
    stringstore = StringStore()
    data = []; rows = []; cols = []
    for row_idx, terms_list in enumerate(terms_lists):

        # an empty string always occupies index 0 in the stringstore, which causes
        # an empty first col in the doc-term matrix that we don't want;
        # so, we subtract 1 from the stringstore's assigned id
        bow = tuple((stringstore[term] - 1, count)
                    for term, count in collections.Counter(terms_list).items()
                    if term)

        data.extend(count for _, count in bow)
        cols.extend(term_id for term_id, _ in bow)
        rows.extend(itertools.repeat(row_idx, times=len(bow)))

    doc_term_matrix = sp.coo_matrix((data, (rows, cols)), dtype=int).tocsr()
    # ignore the 0-index empty string in stringstore, as above
    id_to_term = {term_id - 1: term for term_id, term in enumerate(stringstore)
                  if term_id != 0}

    # filter terms by document frequency or information content?
    if max_df != 1.0 or min_df != 1 or max_n_terms is not None:
        doc_term_matrix, id_to_term = filter_terms_by_df(
            doc_term_matrix, id_to_term,
            max_df=max_df, min_df=min_df, max_n_terms=max_n_terms)
    if min_ic != 0.0:
        doc_term_matrix, id_to_term = filter_terms_by_ic(
            doc_term_matrix, id_to_term,
            min_ic=min_ic, max_n_terms=max_n_terms)

    if weighting == 'binary':
        doc_term_matrix = binarize_mat(doc_term_matrix, threshold=0.0, copy=False)
    else:
        if sublinear_tf is True:
            doc_term_matrix = doc_term_matrix.astype(np.float64)
            np.log(doc_term_matrix.data, doc_term_matrix.data)
            doc_term_matrix.data += 1
        if weighting == 'tfidf':
            doc_term_matrix = apply_idf_weighting(doc_term_matrix,
                                                  smooth_idf=smooth_idf)

    if normalize is True:
        doc_term_matrix = normalize_mat(doc_term_matrix,
                                        norm='l2', axis=1, copy=False)

    return (doc_term_matrix, id_to_term)
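A short usage sketch on invented term lists, showing the tf-idf variant; it relies on the module-level helpers called above and, like Example #9, on the old sequential-ID StringStore:

docs = [["news", "weather", "news"], ["weather", "sports"]]
dtm, id_to_term = build_doc_term_matrix(docs, weighting='tfidf', normalize=True)
print(dtm.shape)   # (2, number of unique terms)
print(id_to_term)  # e.g. {0: 'news', 1: 'weather', 2: 'sports'}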
Example #31
def morphology():
    lemmatizer = Lemmatizer(Lookups())
    return Morphology(StringStore(), {}, lemmatizer)
Example #32
def sstore():
    return StringStore()
Example #33
def test_stringstore_to_bytes(stringstore, text):
    store = stringstore.add(text)
    serialized = stringstore.to_bytes()
    new_stringstore = StringStore().from_bytes(serialized)
    assert new_stringstore[store] == text
Example #34
def stringstore():
    return StringStore()