Example #1
import numpy as np
import torch
from keras.preprocessing.text import Tokenizer  # or tensorflow.keras.preprocessing.text
from nltk.tokenize import word_tokenize

# IFeaturizer, MAX_NUM_WORDS, word_to_vec and SIF_embedding are assumed to be
# provided by the surrounding project.


class SIFFeaturizer(IFeaturizer):
    def __init__(self, config=None):
        super(SIFFeaturizer, self).__init__()
        config = config or {}

        self.num_words = config.get('num_words', MAX_NUM_WORDS)
        self.tokenize_fn = word_tokenize
        self.use_tokenizer = config.get('use_tokenizer', False)

        self.tokenizer = Tokenizer(num_words=self.num_words)

    def get_output_shape(self):
        return (300, )

    def fit(self, data):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
        tokens = [self.tokenize_fn(sent) for sent in data]
        self.tokenizer.fit_on_texts(tokens)

    def transform(self, data):
        raw_tokens = [self.tokenize_fn(sent) for sent in data]
        tokens = self.tokenizer.texts_to_sequences(raw_tokens)
        tfidf_matrix = self.tokenizer.sequences_to_matrix(tokens, mode='tfidf')
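        # Keras' sequences_to_matrix(mode='tfidf') returns a (num_texts, num_words)
        # matrix of tf-idf scores indexed by word id; the loop below gathers a
        # per-token weight for every position of every raw sentence.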

        # the weight matrix must span the raw token sequences, since the
        # embeddings below are built from raw_tokens (OOV words included)
        maxlen = max(len(sent) for sent in raw_tokens)
        tfidf_weights = np.zeros((len(raw_tokens), maxlen))
        for i, seq in enumerate(raw_tokens):
            for j, raw_token in enumerate(seq):
                token = -1
                if raw_token in self.tokenizer.word_index:
                    token = self.tokenizer.word_index[raw_token]
                # else:
                #     similar_to_raw_token = most_similar(raw_token)
                #     for similar_word in similar_to_raw_token:
                #         print(similar_to_raw_token)
                #         if similar_word in self.tokenizer.word_index:
                #             token = self.tokenizer.word_index[similar_word]
                #             print('Word not found: %s but similar word found: %s' % (raw_token, similar_word))
                #             break
                # ids at or above num_words have no column in the tf-idf matrix
                if -1 < token < self.num_words:
                    tfidf_weights[i][j] = tfidf_matrix[i][token]
                else:
                    tfidf_weights[i][j] = 1  # default weight to 1

        # alternative: convert the token ids back to texts so that the tf-idf
        # matrix and the embeddings have the same length (with OOV words omitted)
        # embs = word_to_vec(self.tokenizer.sequences_to_texts(tokens))
        embs = word_to_vec(raw_tokens)

        if embs is None:
            return None

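        # SIF_embedding is assumed to implement Arora et al.'s smooth inverse
        # frequency scheme: a weighted average of the word vectors per sentence,
        # where rmpc=0 skips removal of the first principal component.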
        sif_emb = SIF_embedding(embs, tfidf_weights, rmpc=0)

        return torch.from_numpy(sif_emb).float()
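
# --- Usage sketch (not part of the original snippet) ---
# A minimal, illustrative run; it assumes word_to_vec and SIF_embedding are
# importable from the surrounding project and that the word vectors are
# 300-dimensional, as get_output_shape() suggests.
if __name__ == '__main__':
    corpus = ['the quick brown fox', 'jumps over the lazy dog']
    featurizer = SIFFeaturizer()
    featurizer.fit(corpus)
    sentence_embs = featurizer.transform(corpus)  # FloatTensor or None
    if sentence_embs is not None:
        print(sentence_embs.shape)  # expected: torch.Size([2, 300])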
Example #2
import numpy as np
import torch
from keras.preprocessing.text import Tokenizer  # or tensorflow.keras.preprocessing.text
from nltk.tokenize import wordpunct_tokenize
from sklearn.preprocessing import LabelEncoder

# IModel, OvrClassifier, MAX_NUM_WORDS, word_to_vec, SIF_embedding and
# infer_classification_output are assumed to be provided by the surrounding project.


class OvrClassifierWrapper(IModel):
    def __init__(self, config=None, *args, **kwargs):
        config = config or {}
        super(OvrClassifierWrapper, self).__init__(model_class=OvrClassifier,
                                                   config=config,
                                                   *args,
                                                   **kwargs)

        self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
        self.num_words = config.get('num_words', MAX_NUM_WORDS)
        self.n_classes = config.get('num_classes', 10)

        self.tokenize_fn = wordpunct_tokenize
        self.label_encoder = LabelEncoder()

    def get_state_dict(self):
        return {
            'tokenizer': self.tokenizer,
            'config': self.model.config,
            'label_encoder': self.label_encoder,
            'state_dict': self.model.get_params(),
        }

    def load_state_dict(self, state_dict):
        config = state_dict['config']

        # re-initialize model with loaded config
        self.model = self.init_model()
        self.model.set_params(state_dict['state_dict'])

        # load tokenizer
        self.tokenizer = state_dict['tokenizer']

        # load label encoder
        self.label_encoder = state_dict['label_encoder']

    def preprocess_input(self, X):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        tokens = [self.tokenize_fn(sent) for sent in X]
        tokens = self.tokenizer.texts_to_sequences(tokens)
        tfidf_matrix = self.tokenizer.sequences_to_matrix(tokens, mode='tfidf')

        maxlen = max([len(sent) for sent in tokens])
        tfidf_weights = np.zeros((len(tokens), maxlen))
        for i, seq in enumerate(tokens):
            for j, token in enumerate(seq):
                if token < self.tokenizer.num_words:
                    tfidf_weights[i][j] = tfidf_matrix[i][token]

        # convert from tokens back to texts
        # this guarantees that the tf-idf matrix and X have the same length (with OOV words omitted)
        embs = word_to_vec(self.tokenizer.sequences_to_texts(tokens))

        # guard against word_to_vec returning None
        if embs is None:
            return None

        sif_emb = SIF_embedding(embs, tfidf_weights, rmpc=0)

        return torch.from_numpy(sif_emb).float()

    def preprocess_output(self, y):
        # One-hot encode outputs
        # Can also use torch.eye() but leaving as numpy until torch achieves performance parity
        # lookup = np.eye(self.num_classes)
        # outputs = np.array([lookup[label] for label in y])
        # return torch.from_numpy(outputs).float()

        return torch.from_numpy(self.label_encoder.transform(y)).long()

    def infer_predict(self, logits, topk=None):
        return infer_classification_output(self, logits, topk)
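
# --- Usage sketch (not part of the original snippet) ---
# Illustrates the label side of the pipeline only; it assumes OvrClassifier and
# the IModel base class are importable from the surrounding project, and that the
# label encoder is normally fitted during training (e.g. in a fit() method).
if __name__ == '__main__':
    wrapper = OvrClassifierWrapper()
    y_train = ['greeting', 'goodbye', 'greeting']
    wrapper.label_encoder.fit(y_train)
    targets = wrapper.preprocess_output(y_train)
    print(targets)        # tensor([1, 0, 1]) -- 'goodbye' -> 0, 'greeting' -> 1
    print(targets.dtype)  # torch.int64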