# NOTE: imports reconstructed from usage. The project-internal names
# (IFeaturizer, IModel, OvrClassifier, MAX_NUM_WORDS, word_to_vec,
# SIF_embedding, infer_classification_output) are assumed to be provided
# by the surrounding package.
import numpy as np
import torch
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from sklearn.preprocessing import LabelEncoder


class SIFFeaturizer(IFeaturizer):

    def __init__(self, config=dict()):
        super(SIFFeaturizer, self).__init__()

        self.num_words = config.get('num_words', MAX_NUM_WORDS)
        self.tokenize_fn = word_tokenize
        self.use_tokenizer = config.get('use_tokenizer', False)
        self.tokenizer = Tokenizer(num_words=self.num_words)

    def get_output_shape(self):
        return (300,)

    def fit(self, data):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        tokens = [self.tokenize_fn(sent) for sent in data]
        self.tokenizer.fit_on_texts(tokens)

    def transform(self, data):
        raw_tokens = [self.tokenize_fn(sent) for sent in data]
        tokens = self.tokenizer.texts_to_sequences(raw_tokens)
        tfidf_matrix = self.tokenizer.sequences_to_matrix(tokens, mode='tfidf')

        # Size the weight matrix from the raw token lists: texts_to_sequences
        # drops OOV words, so sizing from `tokens` could leave the matrix too
        # narrow for the raw sequences indexed below.
        maxlen = max(len(sent) for sent in raw_tokens)

        tfidf_weights = np.zeros((len(raw_tokens), maxlen))
        for i, seq in enumerate(raw_tokens):
            for j, raw_token in enumerate(seq):
                # (A disabled experiment here remapped OOV words to their
                # nearest in-vocabulary neighbour via most_similar().)
                token = self.tokenizer.word_index.get(raw_token, -1)
                # Guard the upper bound too: word_index covers the full
                # vocabulary, but tfidf_matrix only has num_words columns.
                if 0 < token < self.tokenizer.num_words:
                    tfidf_weights[i][j] = tfidf_matrix[i][token]
                else:
                    tfidf_weights[i][j] = 1  # default weight for OOV words

        # Embed the raw tokens directly so the embeddings and the TF-IDF
        # weight matrix stay aligned position-for-position (OOV words keep
        # their default weight instead of being omitted).
        embs = word_to_vec(raw_tokens)

        if embs is None:
            return None

        sif_emb = SIF_embedding(embs, tfidf_weights, rmpc=0)

        return torch.from_numpy(sif_emb).float()
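# Usage sketch (illustrative, not from the original source). The corpus below
# is made up; it assumes word_to_vec resolves tokens against a store of
# 300-dimensional vectors, so the output matches get_output_shape().
#
#   featurizer = SIFFeaturizer({'num_words': 10000})
#   corpus = [
#       'the quick brown fox jumps over the lazy dog',
#       'a fast auburn fox leapt over a sleepy hound',
#   ]
#   featurizer.fit(corpus)
#   embs = featurizer.transform(corpus)  # FloatTensor of shape (2, 300)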
class OvrClassifierWrapper(IModel):

    def __init__(self, config={}, *args, **kwargs):
        super(OvrClassifierWrapper, self).__init__(
            model_class=OvrClassifier,
            config=config,
            *args, **kwargs
        )

        self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
        self.num_words = config.get('num_words', MAX_NUM_WORDS)
        self.n_classes = config.get('num_classes', 10)
        self.tokenize_fn = wordpunct_tokenize
        self.label_encoder = LabelEncoder()

    def get_state_dict(self):
        return {
            'tokenizer': self.tokenizer,
            'config': self.model.config,
            'label_encoder': self.label_encoder,
            'state_dict': self.model.get_params(),
        }

    def load_state_dict(self, state_dict):
        config = state_dict['config']  # NOTE: loaded but not passed to init_model()

        # Re-initialize the model, then restore its learned parameters
        self.model = self.init_model()
        self.model.set_params(state_dict['state_dict'])

        # Restore the fitted tokenizer and label encoder
        self.tokenizer = state_dict['tokenizer']
        self.label_encoder = state_dict['label_encoder']

    def preprocess_input(self, X):
        if self.tokenizer is None:
            self.tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)

        tokens = [self.tokenize_fn(sent) for sent in X]
        tokens = self.tokenizer.texts_to_sequences(tokens)
        tfidf_matrix = self.tokenizer.sequences_to_matrix(tokens, mode='tfidf')

        maxlen = max(len(sent) for sent in tokens)
        tfidf_weights = np.zeros((len(tokens), maxlen))
        for i, seq in enumerate(tokens):
            for j, token in enumerate(seq):
                if token < self.tokenizer.num_words:
                    tfidf_weights[i][j] = tfidf_matrix[i][token]

        # Convert the token ids back to texts before embedding. This
        # guarantees the TF-IDF weight matrix and the embeddings have the
        # same length (OOV words omitted by texts_to_sequences stay omitted).
        embs = word_to_vec(self.tokenizer.sequences_to_texts(tokens))

        sif_emb = SIF_embedding(embs, tfidf_weights, rmpc=0)

        return torch.from_numpy(sif_emb).float()

    def preprocess_output(self, y):
        # Integer-encode the labels. (One-hot encoding via np.eye / torch.eye
        # is a disabled alternative; class indices are what the loss expects.)
        return torch.from_numpy(self.label_encoder.transform(y)).long()

    def infer_predict(self, logits, topk=None):
        return infer_classification_output(self, logits, topk)
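# Persistence sketch (illustrative, not from the original source): the state
# dict bundles the fitted tokenizer, label encoder and model parameters, so a
# plain torch.save / torch.load round trip restores the wrapper. The labels
# and file name below are hypothetical.
if __name__ == '__main__':
    wrapper = OvrClassifierWrapper({'num_classes': 3})
    wrapper.label_encoder.fit(['negative', 'neutral', 'positive'])

    # Labels become class indices in alphabetical order of the classes
    targets = wrapper.preprocess_output(['positive', 'negative'])  # tensor([2, 0])

    torch.save(wrapper.get_state_dict(), 'ovr_wrapper.bin')

    restored = OvrClassifierWrapper({'num_classes': 3})
    restored.load_state_dict(torch.load('ovr_wrapper.bin'))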