class KerasTokenizer(Tokenizer):
    """Adapter that exposes a Keras text tokenizer through this project's
    ``Tokenizer`` interface (encode/decode/fit plus vocab inspection)."""

    def __init__(self, **kwargs):
        # Every keyword argument is forwarded verbatim to the wrapped
        # Keras tokenizer; this class adds no configuration of its own.
        self._keras_tokenizer = KTokenizer(**kwargs)

    def encode(self, text: str) -> List[int]:
        """Convert one string into its sequence of token ids."""
        sequences = self._keras_tokenizer.texts_to_sequences([text])
        return sequences[0]

    def decode(self, sequence: List[int]) -> str:
        """Convert one sequence of token ids back into a string."""
        texts = self._keras_tokenizer.sequences_to_texts([sequence])
        return texts[0]

    @property
    def vocab_size(self) -> int:
        """Number of distinct tokens recorded by the underlying tokenizer."""
        return len(self._keras_tokenizer.word_index)

    def fit(self, texts: Iterable[str]):
        """Build the vocabulary from an iterable of raw texts."""
        self._keras_tokenizer.fit_on_texts(texts)

    @property
    def token_index(self) -> Dict[str, int]:
        """Mapping from token string to integer id, as built by Keras."""
        return self._keras_tokenizer.word_index
def run(self):
    """Load the text column from every configured CSV file, fit a tokenizer
    on the combined texts, then either train a word2vec model or build an
    embedding matrix from the tokenizer's vocabulary.

    Reads the files in ``self.args.data_files`` with separator
    ``self.args.sep``; text is taken from column ``self.args.text_field``.
    """
    # Collect one array per file and concatenate once at the end.  The
    # original repeatedly called np.append, which copies the whole
    # accumulated array on every iteration (quadratic in total rows).
    columns = []
    for data_file in self.args.data_files:
        ds = pd.read_csv(data_file, sep=self.args.sep, keep_default_na=False)
        columns.append(ds[self.args.text_field].values)
    X = np.concatenate(columns, axis=0) if columns else None

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X)
    if self.args.w2v:
        # Round-trip through the tokenizer so the texts are normalized
        # (filtering/lowercasing per the tokenizer's rules) before w2v.
        X = tokenizer.texts_to_sequences(X)
        X = tokenizer.sequences_to_texts(X)
        self.build_fit_w2v(X)
    else:
        self.build_embedding(tokenizer.word_index)
class DataHandler:
    """Loads the im2latex-style dataset: raw formula text files and rendered
    formula images, with a Keras tokenizer persisted to ``config.vocab_path``.
    """

    def __init__(self):
        self.formula_path = os.path.join(config.dataset_path, 'formulas')
        self.images_path = os.path.join(config.dataset_path, 'images')
        self.beg_token = '<BOS>'
        self.end_token = '<EOS>'
        self.unk_token = '<UNK>'
        self.tokenizer = None
        self.__fit_tokenizer()

    def __fit_tokenizer(self):
        """Load the tokenizer from disk if a vocab file exists; otherwise fit
        a new one on the training formulas and persist it as JSON."""
        if os.path.isfile(config.vocab_path):
            with open(config.vocab_path, 'r') as f:
                json_content = f.read()
            self.tokenizer = tokenizer_from_json(json_content)
        else:
            # Repeat the BOS/EOS tokens so they are frequent enough to land
            # inside the num_words cutoff of the Keras tokenizer.
            tmp_doc = (self.beg_token + ' ' + self.end_token + ' ') * 100
            docs = [tmp_doc, self.__read_raw_formulas('train')]
            num_tokens = config.vocab_size - 3  # reserve ids for beg, end, unk tokens
            self.tokenizer = Tokenizer(num_words=num_tokens, filters='\t\n',
                                       lower=False, oov_token=self.unk_token)
            self.tokenizer.fit_on_texts(docs)
            with open(config.vocab_path, 'w+') as f:
                f.write(self.tokenizer.to_json())

    def get_path(self, mode):
        """Return ``(formulas_file, images_folder)`` for a dataset split
        such as 'train' or 'validation'."""
        formulas_path = os.path.join(self.formula_path, '{}_formulas.txt'.format(mode))
        images_folder = os.path.join(self.images_path, 'images_{}'.format(mode))
        return formulas_path, images_folder

    def __read_raw_formulas(self, mode, split=False):
        """Read the raw formulas file for *mode*.

        Returns the whole file as one string, or a list of lines when
        ``split`` is true (dropping a trailing empty line). On a missing or
        unreadable file returns '' / [] so callers can proceed best-effort.
        """
        path = self.get_path(mode)[0]
        try:
            with open(path, 'r') as f:
                content = f.read()
                if split:
                    lines = content.split('\n')
                    if not lines[-1]:
                        lines = lines[:-1]
                    return lines
                return content
        except OSError:
            # Narrowed from a bare except: only I/O failures are expected
            # here; anything else should surface instead of being swallowed.
            return [] if split else ''

    def pad_token(self):
        # NOTE(review): padding reuses the EOS token id — confirm this is
        # intentional (decoders often pad with EOS so trailing pads decode
        # as end-of-sequence).
        return self.tokenizer.word_index[self.end_token]

    def start_token(self):
        return self.tokenizer.word_index[self.beg_token]

    def read_formulas(self, mode):
        """Return the formulas of *mode* as id sequences, each wrapped in
        BOS/EOS markers."""
        lines = self.__read_raw_formulas(mode, split=True)
        lines = ['{} {} {}'.format(self.beg_token, line, self.end_token)
                 for line in lines]
        result = self.tokenizer.texts_to_sequences(lines)
        return result

    def read_images(self, mode, index):
        """Load the images named ``<i>.png`` for each i in *index* (silently
        skipping missing files) and return them inverted (255 - pixel)."""
        dir_path = self.get_path(mode)[1]
        images_data = []
        for i in index:
            file_path = os.path.join(dir_path, str(i) + '.png')
            if os.path.isfile(file_path):
                image = imageio.imread(file_path)
                images_data.append(image)
        data = np.array(images_data)
        data = 255 - data
        return data

    def decode_formula(self, sequences):
        """Decode id sequences back to formula strings, stripping the
        leading '<BOS> ' prefix and anything from '<EOS>' onwards."""
        def normalize(formula):
            start_idx, end_idx = 0, len(formula)
            if formula[:6] == '<BOS> ':
                start_idx = 6
            try:
                end_idx = formula.index(self.end_token)
            except ValueError:
                # No EOS marker in the decoded text: keep the full string.
                pass
            return formula[start_idx:end_idx]
        sequences_list = sequences.tolist()
        formulas = self.tokenizer.sequences_to_texts(sequences_list)
        formulas = [normalize(formula) for formula in formulas]
        return formulas

    def plot_sample_sizes(self):
        """Print split sizes and plot a histogram of formula lengths for the
        combined train + validation formulas."""
        lines = self.__read_raw_formulas('train', split=True)
        training_size = len(lines)
        lines += self.__read_raw_formulas('validation', split=True)
        validation_size = len(lines) - training_size
        print('Training set size: ', training_size)
        print('Validation set size: ', validation_size)
        sample_sizes = [len(l) for l in lines]
        # the histogram of the data
        n, bins, patches = plt.hist(sample_sizes, 20, facecolor='g', alpha=0.75)
        plt.xlabel('length of formula')
        plt.ylabel('sample size')
        plt.title('Histogram of Length of formulas')
        plt.grid(True)
        plt.show()
class UnsupervisedKmeansAvgBaseModel(UnsupervisedBaseModel):
    """Unsupervised baseline: clusters averaged-word2vec text features
    (optionally concatenated with extra features) into 4 KMeans clusters and
    decodes each cluster id into two binary labels."""

    def __init__(self, task):
        super(UnsupervisedKmeansAvgBaseModel, self).__init__(task)
        self.num_clusters = 4  # combinations of social and agency
        # Placeholder model; train() replaces it with a PCA-seeded KMeans.
        self.clf_model = KMeans(init='k-means++', n_clusters=self.num_clusters, n_init=10, random_state=self.args.random_state)

    def augment_features(self, X_text, X_all_feats):
        """Return the text features, optionally concatenated column-wise with
        the extra features (columns 2+ of X_all_feats) when use_allfeats is set."""
        if not self.args.use_allfeats:
            return X_text
        # assumes columns 0-1 of X_all_feats are non-feature columns (e.g. id/text)
        # — TODO confirm against the base class's data layout.
        X_all = np.concatenate([X_text, X_all_feats[:, 2:]], axis=1)
        return X_all

    def train(self, X, y=None):
        """Fit the w2v text representation and a KMeans clustering on X.

        y is accepted for interface compatibility but clustering is unsupervised.
        """
        X, y = self.augment_instances(X, y)
        #X_text = self.text_repr_model.fit_transform(X[:, self.args.TEXT_COL])
        X_text = X[:, self.args.TEXT_COL]
        self.max_features = 4000
        self.tokenizer = Tokenizer(num_words=self.max_features)
        self.tokenizer.fit_on_texts(X_text)
        # Round-trip through the tokenizer so the texts are normalized to the
        # tokenizer's vocabulary before fitting word2vec.
        X_text = self.tokenizer.texts_to_sequences(X_text)
        X_text = self.tokenizer.sequences_to_texts(X_text)
        self.text_rep_model = self.build_fit_w2v(X_text)
        X_text = self.transform_text_to_w2v(self.text_rep_model, X_text)
        X_all_feats = self.augment_features(X_text, X)
        # Seed KMeans with the top PCA components (one per cluster) for a
        # deterministic, data-driven initialization; n_init=1 since the
        # starting centroids are fixed.
        pca = PCA(n_components=self.num_clusters, random_state=self.args.random_state)
        pca.fit(X_all_feats)
        model = KMeans(init=pca.components_, n_clusters=self.num_clusters, n_init=1, random_state=self.args.random_state)
        model.fit(X_all_feats)
        self.clf_model = model

    def predict(self, X):
        """Predict cluster assignments for X and decode each cluster id
        (0..3) into a 2-column binary label array."""
        X_text = X[:, self.args.TEXT_COL]
        #X_text = self.text_rep_model.transform(X[:, self.args.TEXT_COL])
        X_text = self.transform_text_to_w2v(self.text_rep_model, X_text)
        X_all_feats = self.augment_features(X_text, X)
        y_pred = self.clf_model.predict(X_all_feats)
        # Unpack each uint8 cluster id into its 8 bits (most-significant bit
        # first), keep the two least-significant bits, then reverse them so
        # column 0 is the least-significant bit: id 2 -> [0, 1].
        y = y_pred.astype(np.uint8)
        y = np.unpackbits(y)
        y = y.reshape(y_pred.shape[0], 8)
        y = y[:, -2:]
        y = y[:, ::-1]
        return y
# NOTE(review): the next five statements are the tail of a function whose
# `def` line is outside this chunk (it returns a pair of tokenized
# sequences); indentation reconstructed — confirm against the full file.
    s1 = np.array(tokenizer.texts_to_sequences(s1))
    s2 = np.array(tokenizer.texts_to_sequences(s2))
    print(s1)
    print(s2)
    return [s1, s2]


# Load pre-trained alignment vectors (text word2vec format).
w2v = gensim.models.KeyedVectors.load_word2vec_format('./alignment_vec.txt', binary=False)
tokenizer = Tokenizer()
# presumably 'atcgx' is the DNA alphabet plus a gap/extra symbol — verify
# against get_vocab's definition.
tokenizer.fit_on_texts(get_vocab('atcgx'))

# Build a multi-output model exposing every layer activation from layer 2 on.
layer_outputs = [layer.output for layer in model.layers[2:]]
print(layer_outputs)
for layer in layer_outputs:
    print(layer)
activation_model = Model(inputs=model.input, outputs=layer_outputs)
activations = activation_model.predict(
    get_test_alignment(x_valid, 0, 1, tokenizer))
#print(activations)
# Last output is the model's prediction for the sample pair.
prediction = activations[-1]
print(prediction)
# Index 7 picks a specific intermediate layer's activation — TODO confirm
# which layer this corresponds to in the model architecture.
activation_1 = activations[7]
#print(activation_1)
for kernel in activation_1[0]:
    # Reshape each kernel into rows of word_length values and decode the
    # rounded values back to tokens to display the learned motifs.
    words = np.reshape(kernel, (-1, word_length))
    print('Learned alignment motifs:')
    print(np.array(tokenizer.sequences_to_texts(np.round(words))))
    #print(w2v.similar_by_vector(words[0], topn=1)[0])