def _feature_extractor(self, doc): features = np.asarray([self.word_vocab[w] if self.word_vocab[w] is not None else 1 for w in doc]) if self.char_vocab: sentence_chars = [] for w in doc: word_chars = [] for c in w: _cid = self.char_vocab[c] word_chars.append(_cid if _cid is not None else 1) sentence_chars.append(word_chars) sentence_chars = pad_sentences(sentence_chars, self.model.max_word_len) features = (features, sentence_chars) return features
def vectorize(doc, w_vocab, c_vocab):
    """Vectorize one tokenized document into word-id and char-id arrays.

    Tokens are lowercased before lookup in ``w_vocab``; characters are looked
    up in ``c_vocab`` as-is. Anything out of vocabulary maps to id 1. Char
    rows are padded to the global ``model.word_length``.

    Returns a ``(words, chars)`` pair: a (1, n_tokens) word-id array and a
    batch-of-one padded character-id array.
    """
    token_ids = []
    for token in doc:
        lowered = token.lower()
        token_ids.append(w_vocab[lowered] if lowered in w_vocab else 1)
    words = np.asarray(token_ids).reshape(1, -1)

    char_rows = [
        [c_vocab[ch] if ch in c_vocab else 1 for ch in token] for token in doc
    ]
    chars = np.expand_dims(pad_sentences(char_rows, model.word_length), axis=0)
    return words, chars
def vectorize(doc, vocab, char_vocab=None):
    """Vectorize one tokenized document.

    Tokens are lowercased before lookup in ``vocab``; out-of-vocabulary
    tokens (and characters) map to id 1.

    Returns a (1, n_tokens) word-id array when ``char_vocab`` is ``None``;
    otherwise a ``[words, chars]`` list where ``chars`` is a batch-of-one
    character-id array padded to the global ``model.word_length``.
    """
    word_ids = [vocab[t.lower()] if t.lower() in vocab else 1 for t in doc]
    words = np.asarray(word_ids).reshape(1, -1)

    if char_vocab is None:
        return words

    char_rows = []
    for token in doc:
        char_rows.append([char_vocab[ch] if ch in char_vocab else 1 for ch in token])
    chars = np.expand_dims(pad_sentences(char_rows, model.word_length), axis=0)
    return [words, chars]
def __call__(self, doc: Doc) -> Doc:
    """
    Annotate the document with noun phrase spans.

    Each sentence is featurized, the per-sentence vectors are padded into a
    single batch, and the chunker's predicted (start, end) index pairs are
    converted to ``Span`` objects, post-processed, and attached to ``doc``.
    """
    spans = []
    doc_vecs = []
    doc_lens = []
    # Guard against an empty document: there are no sentences to featurize,
    # and padding/inferring on an empty batch would fail downstream. (The
    # char-aware variant of this annotator applies the same guard.)
    if len(doc) < 1:
        return doc
    for sentence in doc.sents:
        doc_vec = self._feature_extractor([t.text for t in sentence])
        doc_vecs.append(doc_vec)
        # Record the true (unpadded) sentence length for the chunker.
        doc_lens.append(len(doc_vec))
    doc_vectors = pad_sentences(doc_vecs)
    np_indexes = self._infer_chunks(doc_vectors, doc_lens)
    for s, e in np_indexes:
        np_span = Span(doc, s, e)
        spans.append(np_span)
    spans = _NPPostprocessor.process(spans)
    set_noun_phrases(doc, spans)
    return doc
def vectorize(docs, w_vocab, c_vocab):
    """Vectorize a batch of tokenized documents.

    Tokens are lowercased and looked up in ``w_vocab``; a lookup that yields
    ``None`` maps to id 1 (OOV). Characters are handled the same way via
    ``c_vocab`` and padded to the global ``word_length``.

    Returns a list with one entry per document: a (1, n_tokens) word-id
    array, or a ``(words, chars)`` tuple when ``c_vocab`` is provided.
    """
    batch = []
    for doc in docs:
        ids = []
        for token in doc:
            wid = w_vocab[token.lower()]
            ids.append(1 if wid is None else wid)
        words = np.asarray(ids).reshape(1, -1)

        if c_vocab is None:
            batch.append(words)
            continue

        char_rows = []
        for token in doc:
            raw = [c_vocab[ch] for ch in token]
            char_rows.append([1 if cid is None else cid for cid in raw])
        chars = np.expand_dims(pad_sentences(char_rows, word_length), axis=0)
        batch.append((words, chars))
    return batch
def __call__(self, doc: Doc) -> Doc:
    """
    Annotate the document with noun phrase spans
    """
    spans = []
    doc_vecs = []
    doc_chars = []
    doc_lens = []
    # Empty document: nothing to chunk, and padding an empty batch would fail.
    if len(doc) < 1:
        return doc
    for sentence in doc.sents:
        # The extractor returns a (word_ids, char_ids) tuple when a char
        # vocabulary is configured, otherwise word ids alone.
        features = self._feature_extractor([t.text for t in sentence])
        if isinstance(features, tuple):
            doc_vec = features[0]
            doc_chars.append(features[1])
        else:
            doc_vec = features
        doc_vecs.append(doc_vec)
        # True (unpadded) sentence length, passed to the chunker alongside
        # the padded batch.
        doc_lens.append(len(doc_vec))
    doc_vectors = pad_sentences(np.asarray(doc_vecs))
    inputs = doc_vectors
    if self.char_vocab:
        # Assemble a zero-initialized (n_sents, max_sent_len, max_word_len)
        # char tensor, clipping each sentence's char matrix to the padded
        # sentence length.
        max_len = doc_vectors.shape[1]
        padded_chars = np.zeros(
            (len(doc_chars), max_len, self.model.max_word_len))
        for idx, d in enumerate(doc_chars):
            d = d[:max_len]
            # NOTE(review): rows are right-aligned into the buffer
            # (negative-index slice) — presumably to match the direction
            # pad_sentences pads word ids; confirm against pad_sentences.
            padded_chars[idx, -d.shape[0]:] = d
        inputs = [inputs, padded_chars]
    np_indexes = self._infer_chunks(inputs, doc_lens)
    for s, e in np_indexes:
        np_span = Span(doc, s, e)
        spans.append(np_span)
    spans = _NPPostprocessor.process(spans)
    set_noun_phrases(doc, spans)
    return doc