def transform(self, X, Y=None):
     """Turn each document into one vector pooled over sentence blocks.

     When ``self.from_cache(X)`` reports success its result is used as is.
     Otherwise every document is split with ``splitter``, consecutive
     sentences are grouped into blocks of ``self.block_size`` sentences,
     each block is joined with single spaces and encoded with
     ``self.model.encode``, and the encodings are reduced with
     ``pooling(..., method='avg')``. The freshly computed vectors are
     handed to ``self.to_cache``.

     Args:
         X: Iterable of documents accepted by ``splitter``.
         Y: Optional targets; returned untouched alongside the vectors.

     Returns:
         ``np.array`` of the document vectors, or a ``(vectors, Y)`` tuple
         when ``Y`` is provided.
     """
     success, _X = self.from_cache(X)
     if not success:
         _X = []
         for document in X:
             blocks, block = [], []
             # Plain iteration instead of repeatedly popping the head of
             # the sentence list, which shifts the whole list every time.
             for sent in splitter(document):
                 block.append(sent)
                 if len(block) == self.block_size:
                     blocks.append(' '.join(block))
                     block = []
             # Flush the trailing partial block only when it holds
             # sentences; an empty '' block would otherwise be encoded and
             # pooled together with the real ones. A document without any
             # sentence still yields a single '' block, as before.
             if block or not blocks:
                 blocks.append(' '.join(block))
             encoded_blocks = self.model.encode(blocks)
             # NOTE(review): assumes encode() returns an array with a
             # ``shape`` whose first axis indexes the blocks -- confirm.
             doc_vecs = pooling(
                 encoded_blocks,
                 (encoded_blocks.shape[0], 1),
                 method='avg'
             )
             _X.append(list(doc_vecs)[0])
         self.to_cache(X, _X)
     # Identity check rather than truthiness: an empty Y must still be
     # returned, and a multi-element NumPy array has no truth value.
     if Y is not None:
         return np.array(_X), Y
     return np.array(_X)
    def __call__(self, x):
        """Reduce a document to a space-joined string of filtered tokens.

        The document is split into sentences with ``splitter`` and each
        sentence into tokens with ``tokenizer``. Tokens are dropped or
        rewritten according to the instance flags, and runs of identical
        consecutive tokens (after optional lowercasing, and also across
        sentence boundaries) are collapsed to a single occurrence.

        Flags read from ``self``:
            remove_nonalpha: drop tokens for which ``str.isalpha()`` is false.
            remove_entities: drop tokens that are not first in their
                sentence and whose first character is not lowercase
                (presumably a proper-noun heuristic).
            remove_stopwords: drop tokens whose lowercase form is in
                ``STOPWORDS``.
            lowercase: emit the lowercase form of every kept token.

        Args:
            x: Document accepted by ``splitter``.

        Returns:
            The kept tokens joined by single spaces.
        """
        kept = []
        for sent in splitter(x):
            for i, token in enumerate(tokenizer(sent)):

                if self.remove_nonalpha and not token.isalpha():
                    continue

                # ``i`` is falsy for the first token of a sentence, which
                # is therefore never dropped by this rule. Slicing keeps
                # the comparison safe for an empty token.
                if (self.remove_entities and i
                        and token[:1] != token[:1].lower()):
                    continue

                if self.remove_stopwords and token.lower() in STOPWORDS:
                    continue

                if self.lowercase:
                    token = token.lower()

                # Collapse consecutive duplicates while collecting, instead
                # of draining a second list from the front with pop(0).
                if not kept or kept[-1] != token:
                    kept.append(token)

        return ' '.join(kept)
 def transform(self, X, Y=None):
     """Turn each document into one vector pooled over its sentences.

     When ``self.from_cache(X)`` reports success its result is used as is.
     Otherwise every document is split with ``splitter``, the sentences
     are encoded with ``self.model.encode`` and the encodings are reduced
     with ``pooling(..., method='mdn')``. The freshly computed vectors are
     handed to ``self.to_cache``.

     Args:
         X: Iterable of documents accepted by ``splitter``.
         Y: Optional targets; returned untouched alongside the vectors.

     Returns:
         ``np.array`` of the document vectors, or a ``(vectors, Y)`` tuple
         when ``Y`` is provided.
     """
     success, _X = self.from_cache(X)
     if not success:
         _X = []
         for document in X:
             sents = splitter(document)
             encoded_sents = self.model.encode(sents)
             # NOTE(review): assumes encode() returns an array with a
             # ``shape`` whose first axis indexes the sentences -- confirm.
             doc_vecs = pooling(
                 encoded_sents,
                 (encoded_sents.shape[0], 1),
                 method='mdn'
             )
             _X.append(list(doc_vecs)[0])
         self.to_cache(X, _X)
     # Identity check rather than truthiness: an empty Y must still be
     # returned, and a multi-element NumPy array has no truth value.
     if Y is not None:
         return np.array(_X), Y
     return np.array(_X)
# Example #4
0
def title(X, docid):
    """Return a short title for the document stored at ``X[docid]``.

    The title is the first sentence produced by ``splitter``. When the
    splitter yields no sentences, the first 90 characters of the document
    followed by ``...`` are returned instead of raising ``IndexError``.

    Args:
        X: Indexable collection of documents.
        docid: Index or key of the document within ``X``.

    Returns:
        The title string.
    """
    document = X[docid]
    sents = splitter(document)
    if sents:
        return sents[0]
    # This truncation was previously an unreachable second ``return``;
    # it now serves as the fallback for documents without sentences.
    return '%s...' % document[:90]