def __init__(self, path, encoding='latin1'):
    self.dict = {
        hash_string(u"#re"): [],
        hash_string(u"#lvl2"): [],
        hash_string(u"#lvl3"): [],
    }
    self.index = []
    self.array = []
    self.add_dict(path, encoding)
def Eg(self, text, opt=None, label=None):
    eg = self._eg
    eg.reset()
    doc = self.nlp(text)
    features = []
    for i, token in enumerate(doc[:-1]):
        next_token = doc[i + 1]
        strings = (token.lower_, next_token.lower_)
        key = hash_string('%s_%s' % strings)
        feat_slot = 0
        feat_value = 1
        # Unigram feature: hash of the lower-cased token
        features.append((0, token.lower, 1))
        # Bigram feature: hash of "<token>_<next token>"
        features.append((feat_slot, key, feat_value))
    eg.features = features
    if opt is not None:
        eg.is_valid = [(clas in opt) for clas in range(self.nr_class)]
    if label is not None:
        eg.costs = [clas != label for clas in range(self.nr_class)]
    return eg
def bucketize(self):
    self.table = [set() for _ in range(self.N)]
    for i, words in enumerate(tqdm(self.transformer.inverse_transform(self.vectors))):
        for w in words:
            h = hash_string(str(w))
            self.table[h % self.N].add(i)
def get_word_vector(w):
    h = hash_string(w.lower())
    # Unknown keys fall back to row 0 of the vectors table.
    i = _vectors.key2row.get(h, 0)
    if len(_vectors.data) > i:
        return _vectors.data[i]
    return np.zeros(_vector_size)
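# A minimal usage sketch for get_word_vector above (an assumption, not part of
# the original snippet): the module globals _vectors and _vector_size are taken
# from a loaded spaCy pipeline that ships word vectors.
import numpy as np
import spacy
from spacy.strings import hash_string

nlp = spacy.load("en_core_web_md")
_vectors = nlp.vocab.vectors          # exposes .key2row and .data
_vector_size = _vectors.shape[1]

vec = get_word_vector("Apple")        # lower-cased, hashed, then looked up
miss = get_word_vector("qwertzuiop")  # unknown key falls back to row 0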
def test_get_vector(strings, data):
    v = Vectors(data=data)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(data[0])
    assert list(v[strings[0]]) != list(data[1])
    assert list(v[strings[1]]) != list(data[0])
def serialize_doc(doc):
    doc_byte_string = doc.to_bytes()
    value = {
        _DOC_BYTE_STRING: str(doc_byte_string),
        _USER_DATA: doc.user_data,
        _HASH: str(hash_string(doc.string)),
    }
    return pickle.dumps(value, pickle.HIGHEST_PROTOCOL).encode('base64')
def main(patterns_loc, text_loc, counts_loc, n=10000000):
    nlp = English(parser=False, tagger=False, entity=False)
    print("Make matcher")
    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
    counts = PreshCounter()
    t1 = time.time()
    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
        counts.inc(hash_string(mwe.text), 1)
    t2 = time.time()
    print("10m tokens in %d s" % (t2 - t1))
    with codecs.open(counts_loc, 'w', 'utf8') as file_:
        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
            text = phrase.string
            key = hash_string(text)
            count = counts[key]
            if count != 0:
                file_.write('%d\t%s\n' % (count, text))
def get_matches(matcher, pattern_ids, doc):
    matches = []
    for label, start, end in matcher(doc):
        candidate = doc[start : end]
        if pattern_ids[hash_string(candidate.text)]:
            start = candidate[0].idx
            end = candidate[-1].idx + len(candidate[-1])
            matches.append((start, end, candidate.root.tag_, candidate.text))
    return matches
def unserialize_doc(nlp, serialized_string):
    value = pickle.loads(serialized_string.decode('base64'))
    doc_byte_string = value[_DOC_BYTE_STRING]
    user_data = value[_USER_DATA]
    doc_hash = value[_HASH]
    doc = Doc(nlp.vocab).from_bytes(doc_byte_string)
    assert str(hash_string(doc.string)) == doc_hash, \
        "deserialized document hash doesn't match the stored hash"
    doc.user_data = user_data
    return doc
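# A hedged round-trip sketch for serialize_doc / unserialize_doc above,
# assuming the older (Python 2 era) spaCy API those snippets target, an
# already loaded `nlp` pipeline, and the module constants _DOC_BYTE_STRING,
# _USER_DATA and _HASH that they reference.
doc = nlp(u"Serialize me, then bring me back.")
doc.user_data["origin"] = "example"
blob = serialize_doc(doc)              # base64-encoded pickled dict
restored = unserialize_doc(nlp, blob)  # hash check guards against corruption
assert restored.user_data["origin"] == "example"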
def test_set_vector(strings, data):
    orig = data.copy()
    v = Vectors(data=data)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(orig[0])
    assert list(v[strings[0]]) != list(orig[1])
    v[strings[0]] = data[1]
    assert list(v[strings[0]]) == list(orig[1])
    assert list(v[strings[0]]) != list(orig[0])
def test_get_vector_resize(strings, data, resize_data):
    v = Vectors(data=data)
    v.resize(shape=resize_data.shape)
    strings = [hash_string(s) for s in strings]
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(resize_data[0])
    assert list(v[strings[0]]) != list(resize_data[1])
    assert list(v[strings[1]]) != list(resize_data[0])
    assert list(v[strings[1]]) == list(resize_data[1])
def get_entry(self, name):
    """Get dictionary entry. Returns entries mapped to name's lemma.
    If the word doesn't exist, returns an empty list."""
    name = name if name[0] == "#" else self.get_lemma(name)
    key = hash_string(name)
    try:
        return self.array[list_index(self.index, key)]
    except ValueError:
        try:
            return self.dict[key]
        except KeyError:
            self.dict[key] = []
            return self.dict[key]
def test_vectors_clear():
    data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
    v = Vectors(data=data, keys=["A", "B", "C"])
    assert v.is_full is True
    assert hash_string("A") in v
    v.clear()
    # no keys
    assert v.key2row == {}
    assert list(v) == []
    assert v.is_full is False
    assert "A" not in v
    with pytest.raises(KeyError):
        v["A"]
def count_doc(self, words):
    # Get counts for this document
    doc_counts = PreshCounter()
    doc_strings = {}
    for word in words:
        key = hash_string(word)
        doc_counts.inc(key, 1)
        doc_strings[key] = word
    n = 0
    for key, count in doc_counts:
        self.counts.inc(key, count)
        # TODO: Why doesn't inc return this? =/
        corpus_count = self.counts[key]
        # Remember the string when we exceed min count
        if corpus_count >= self.min_freq and (corpus_count - count) < self.min_freq:
            self.strings[key] = doc_strings[key]
        n += count
    return n
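# A hedged driver sketch for count_doc above; the container class and its
# attribute names here are illustrative, chosen to match what the method
# reads (counts: PreshCounter, strings: dict, min_freq: int).
from preshed.counter import PreshCounter

class WordCounter(object):
    def __init__(self, min_freq=2):
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    count_doc = count_doc  # reuse the function defined above as a method

wc = WordCounter(min_freq=2)
wc.count_doc(u"the cat sat on the mat".split())
wc.count_doc(u"the dog sat on the rug".split())
# wc.strings now maps hashes to words whose corpus count reached min_freq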
def test_get_vector_resize(strings, data):
    strings = [hash_string(s) for s in strings]
    # decrease vector dimension (truncate)
    v = Vectors(data=data)
    resized_dim = v.shape[1] - 1
    v.resize(shape=(v.shape[0], resized_dim))
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(data[0, :resized_dim])
    assert list(v[strings[1]]) == list(data[1, :resized_dim])
    # increase vector dimension (pad with zeros)
    v = Vectors(data=data)
    resized_dim = v.shape[1] + 1
    v.resize(shape=(v.shape[0], resized_dim))
    for i, string in enumerate(strings):
        v.add(string, row=i)
    assert list(v[strings[0]]) == list(data[0]) + [0]
    assert list(v[strings[1]]) == list(data[1]) + [0]
def main():
    nlp = English(parser=False, tagger=False, entity=False)
    gazetteer = [u'M.I.A.', 'Shiny Happy People', 'James E. Jones']
    example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.'
    pattern_ids = PreshMap()
    max_length = 0
    for pattern_str in gazetteer:
        pattern = nlp.tokenizer(pattern_str)
        bilou_tags = get_bilou(len(pattern))
        for word, tag in zip(pattern, bilou_tags):
            lexeme = nlp.vocab[word.orth]
            lexeme.set_flag(tag, True)
        pattern_ids[hash_string(pattern.text)] = True
        max_length = max(max_length, len(pattern))
    matcher = make_matcher(nlp.vocab, max_length)
    doc = nlp(example_text)
    matches = get_matches(matcher, pattern_ids, doc)
    merge_matches(doc, matches)
    for token in doc:
        print(token.text, token.ent_type_)
def add_sentences(source):
    # Create sentences from scraped items
    for doc in scrapdb[source].find():
        for key, value in doc.items():
            if key not in field2sent:
                continue
            for text in list(value):
                try:
                    traindb['sentence'].insert({
                        '_id': 'S{}'.format(str(hash_string(text.lower()))),
                        'language': doc['language'],
                        'source': doc['isPartOf'],
                        'subject': 'cookery',
                        'text': text.lower()
                    })
                except pymongo.errors.DuplicateKeyError:
                    continue
def contexts_by_entities(self, doc):
    """Returns a set of document ids that *might* be related to named
    entities in the pre-processed question"""
    ents = self.doc_entities(doc)
    buckets = [hash_string(word) % self.N for word in ents]
    return {doc_id for slot in buckets for doc_id in self.table[slot]}
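# A standalone illustration of the bucketing scheme behind bucketize() and
# contexts_by_entities() above (hypothetical data and sizes): words hash into
# one of N buckets, and a question can only match documents that share at
# least one bucket with its entities.
from spacy.strings import hash_string

N = 1024
table = [set() for _ in range(N)]
docs = [["paris", "france"], ["berlin", "germany"]]
for doc_id, words in enumerate(docs):
    for w in words:
        table[hash_string(w) % N].add(doc_id)

question_entities = ["paris"]
candidates = {i for w in question_entities for i in table[hash_string(w) % N]}
# candidates holds the id of the first document (plus any hash collisions)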
def hash(self):
    return hash_string(self.value)