def test_empty_vocab():
    """Nothing is present in an empty word list."""
    vocab = Vocab([])
    assert vocab.as_list() == []
    assert not vocab.has("sheep")
def token_emotion_mat(vocab: Vocab):
    """Build a 1-D emotion vector aligned with the vocab indices:
    +1 for tokens in the positive group, -1 for the negative group, 0 otherwise."""
    emotion_mat = np.zeros(shape=(vocab.size()))
    emotion_mat[vocab.get_group(vocab.postive_name)] = 1
    emotion_mat[vocab.get_group(vocab.negtive_name)] = -1
    return emotion_mat
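# Illustrative sketch (not from the original module): the same +1/-1/0 encoding
# built directly with NumPy for a toy 3-token vocab. The group indices here are
# assumptions: index 0 stands for a positive word, index 1 for a negative word.
import numpy as np

toy_emotion = np.zeros(3)
toy_emotion[[0]] = 1    # positive group -> +1
toy_emotion[[1]] = -1   # negative group -> -1
# toy_emotion == array([ 1., -1.,  0.])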
def _load_force_tks(self, force_tks: Union[List, Vocab]):
    if force_tks is None:
        return Vocab()
    elif isinstance(force_tks, List):
        vocab = Vocab()
        vocab.add_seq(force_tks)
        return vocab
    else:
        return force_tks
def create_onehot_mat(self, doc_tokens: List[List[Text]]) -> np.array:
    """Build a token-by-document one-hot matrix over the unique tokens in
    `doc_tokens`. Returns the matrix (with the overflow row sliced off) and
    the fitted Vocab."""
    vocab = Vocab()
    tks = list(set(chain.from_iterable(doc_tokens)))
    vocab.add_seq(tks)
    doc_mat = doc_onehot_mat(doc_tokens, vocab)
    return doc_mat[:-1], vocab
def sample_data(data_path, basedir, specified_index=None):
    """Inspect one processed entry produced by data.py.

    Args:
        data_path: path to train.p|valid.p
        specified_index: optional entry index; a random one is used if None
    """
    with open(data_path, 'rb') as f:
        entries = pickle.load(f)

    # Choose a sample (randint is inclusive on both ends, so cap at len - 1)
    rand_index = specified_index if specified_index is not None \
        else random.randint(0, len(entries) - 1)

    # Prepare vocab
    vocab_file = os.path.join(basedir, 'data/processed_reviews/vocab.txt')
    vocab = Vocab(vocab_file, verbose=False)

    # Sample
    (processed_review, review_seq_len, label) = entries[rand_index]
    print("==> Number of entries:", len(entries))
    print("==> Random index:", rand_index)
    print("==> Processed Review:", processed_review)
    print("==> Review Len:", review_seq_len)
    print("==> Label:", label)
    print("==> See if processed review makes sense:", ids_to_tokens(
        processed_review,
        vocab=vocab,
    ))
def test_small_vocab():
    l = ["eeny", "moe", "miney", "meeny"]
    vocab = Vocab(l)
    assert vocab.has("moe")
    assert vocab.has("eeny")
    assert vocab.has("miney")
    assert vocab.has("meeny")
    assert not vocab.has("many")
    assert sorted(vocab.as_list()) == sorted(l)
def anno_seed_word(self, doc_tokens: List[List[Text]],
                   seed_words: List[Text]) -> List[Example]:
    """Auto-annotate seed words selected through `PMI`, where `so_pmi` is
    calculated as:

        so_pmi(word) = mean(PMI(word, Pw)) - mean(PMI(word, Nw))

    If so_pmi(word) > 0, the seed word is tagged positive;
    if so_pmi(word) = 0, it is tagged neutral;
    if so_pmi(word) < 0, it is tagged negative.
    """
    _seed_words_vocab = Vocab.gene_from_list(seed_words, Vocab().alters_name, 0)
    emo_vocab = self.base_pos_words + self.base_neg_words + _seed_words_vocab
    emo_mat = token_emotion_mat(emo_vocab)
    doc_mat = doc_onehot_mat(doc_tokens, emo_vocab)
    so_pmi_score = pair_pmi(doc_mat, emo_mat, emo_vocab)
    return so_pmi_score
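# Illustrative sketch (not from the original repo): the SO-PMI sign rule from the
# docstring above, written out for a single candidate word. `pmi_with_pos` and
# `pmi_with_neg` are hypothetical stand-ins for PMI(word, w) over the positive
# and negative reference sets Pw and Nw.
from statistics import mean

def so_pmi_label(pmi_with_pos, pmi_with_neg):
    """Return +1 / 0 / -1 depending on the sign of so_pmi(word)."""
    so_pmi = mean(pmi_with_pos) - mean(pmi_with_neg)
    if so_pmi > 0:
        return 1     # tagged positive
    if so_pmi < 0:
        return -1    # tagged negative
    return 0         # tagged neutral

# e.g. so_pmi_label([1.2, 0.8], [0.1, 0.3]) -> 1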
def __init__(self,
             stop_words: Union[List[Text], Vocab] = None,
             base_pos_words: Union[List[Text], Vocab] = None,
             base_neg_words: Union[List[Text], Vocab] = None):
    """
    Parameters
    ----------
    stop_words : Union[List[Text], Vocab], stop words list
    base_pos_words : Union[List[Text], Vocab], base positive words
    base_neg_words : Union[List[Text], Vocab], base negative words
    """
    self.stop_words = Vocab.gene_from_list(stop_words, score=0) \
        if isinstance(stop_words, List) else stop_words
    self.base_pos_words = Vocab.gene_from_list(base_pos_words,
                                               name=Vocab().postive_name,
                                               score=1) \
        if isinstance(base_pos_words, List) else base_pos_words
    self.base_neg_words = Vocab.gene_from_list(base_neg_words,
                                               name=Vocab().negtive_name,
                                               score=-1) \
        if isinstance(base_neg_words, List) else base_neg_words
    self.seedwords = None
class RuPosIndexer:
    """Indexes a dataset and stores the vocabularies."""

    def __init__(self):
        self.token_vocab = Vocab(lowercase=True, paddings=True)
        self.pos_vocab = Vocab(paddings=True)
        self.gram_vocab = Vocab(paddings=True)

    def index_dataset(self, dataset: List[Sentence]):
        """Fills the vocabularies from the dataset."""
        for sentence in dataset:
            self.token_vocab.fill(sentence.tokens)
            self.pos_vocab.fill(sentence.pos_tags)
            self.gram_vocab.fill(sentence.grammems)

    def sentence_to_indexes(
            self, sentence: Sentence) -> Tuple[List[int], List[int], List[int]]:
        """Converts a sentence into indexes."""
        tokens = [self.token_vocab[token] for token in sentence.tokens]
        pos_tags = [self.pos_vocab[pos] for pos in sentence.pos_tags]
        grammemes = [self.gram_vocab[gram] for gram in sentence.grammems]
        return tokens, pos_tags, grammemes
def doc_onehot_mat(doc_tokens: List[List[Text]], vocab: Vocab):
    """Build a token-by-document one-hot matrix.

    Row i is 1 in column j when vocab token i occurs in document j. One extra
    row (addressed via index -1) acts as a sink for tokens absent from a
    document, so callers can slice it off with `[:-1]`.
    """
    tk2idx = vocab.tk2idx
    all_tks = list(tk2idx.keys())
    onehot_mat = np.zeros(shape=(vocab.size() + 1, len(doc_tokens)),
                          dtype=np.int8)
    for doc_id, doc in enumerate(doc_tokens):
        # map each vocab token to its row if present in this doc, else to the sink row
        tks = [tk2idx[tk] if tk in doc else -1 for tk in all_tks]
        onehot_mat[tks, doc_id] = 1
    return onehot_mat
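# Illustrative sketch (not from the original repo): the layout doc_onehot_mat
# produces for two toy documents over a 3-token vocab. Rows are tokens plus one
# overflow row; columns are documents. Token names and indices are hypothetical.
#
#   docs: d0 = ["good", "movie"], d1 = ["bad"]
#
#                  d0  d1
#   good       [[  1,  0 ],
#   bad         [  0,  1 ],
#   movie       [  1,  0 ],
#   (overflow)  [  1,  1 ]]   # written via index -1 for tokens absent from a doc
#
# Downstream code either slices the overflow row off (doc_mat[:-1]) or only
# indexes rows of real tokens, so the sink row does not affect the counts.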
def pair_pmi(doc_mat: Union[np.array, List[List]],
             emo_mat: Union[np.array, List],
             vocab: Vocab,
             if_sign=True) -> List[Example]:
    """Score each seed token by SO-PMI against the positive/negative groups.

    Every (seed, emotion word) pair contributes PMI(seed, emo) weighted by the
    emotion word's +1/-1 score; the per-seed sums are optionally reduced to
    their sign.
    """
    seed_idx, pos_idx, neg_idx = vocab.get_all_group()
    doc_mat = np.asarray(doc_mat).squeeze()
    emo_mat = np.asarray(emo_mat).flatten()
    if len(doc_mat.shape) != 2:
        raise ValueError("doc_mat must have dimension of 2")

    pair_seed_pos_idx = list(product(seed_idx, pos_idx))
    pair_seed_neg_idx = list(product(seed_idx, neg_idx))

    scores = defaultdict(float)
    pbar = tqdm(total=len(pair_seed_neg_idx) + len(pair_seed_pos_idx),
                desc="so pmi annotation calling")
    for group in [pair_seed_pos_idx, pair_seed_neg_idx]:
        for seed, emo in group:
            sub_mat = doc_mat[[seed, emo]]
            # co-occurrence: number of documents containing both tokens
            co_curr = (sub_mat.sum(axis=0) == 2).sum()
            # individual document counts for the seed and the emotion word
            seed_curr, emo_curr = sub_mat.sum(axis=1)
            scores[seed] += pmi_score(co_curr, seed_curr, emo_curr) * emo_mat[emo]
            pbar.update(1)
    pbar.close()

    scores = np.asarray([scores[idx] for idx in seed_idx])
    scores = np.sign(scores) if if_sign else scores
    return [
        Example(text=vocab.get_tk(tk), label=sco)
        for tk, sco in zip(seed_idx, scores)
    ]
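# Illustrative sketch (not from the original repo): one common way to turn the
# three counts used above (co-occurrence count and the two individual document
# counts) into a PMI-style score. The repo's actual pmi_score helper may differ;
# the n_docs argument and the add-one smoothing are assumptions of this sketch.
import numpy as np

def pmi_score_sketch(co_count, x_count, y_count, n_docs):
    """PMI(x, y) = log( P(x, y) / (P(x) * P(y)) ), with add-one smoothing."""
    p_xy = (co_count + 1) / (n_docs + 1)
    p_x = (x_count + 1) / (n_docs + 1)
    p_y = (y_count + 1) / (n_docs + 1)
    return float(np.log(p_xy / (p_x * p_y)))

# e.g. pmi_score_sketch(co_count=5, x_count=10, y_count=8, n_docs=100) > 0
# means x and y co-occur more often than independence would predict.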
def create_vocab(pos_words: Union[List[Text], Vocab],
                 neg_words: Union[List[Text], Vocab],
                 seed_words: Union[List[Text], Vocab]) -> Vocab:
    """Merge positive, negative and seed words into a single emotion Vocab,
    scored +1 / -1 / 0 respectively."""
    def _convert_func(x):
        return list(x.tk2idx.keys()) if isinstance(x, Vocab) else x

    pos_words = _convert_func(pos_words)
    neg_words = _convert_func(neg_words)
    seed_words = _convert_func(seed_words)

    emo_vocab = Vocab()
    emo_vocab.add_seq(pos_words, emo_vocab.postive_name, 1)
    emo_vocab.add_seq(neg_words, emo_vocab.negtive_name, -1)
    emo_vocab.add_seq(seed_words, emo_vocab.alters_name, 0)
    return emo_vocab
def anno_mining_token(self, alia_base_emo=True) -> List[Example]:
    """Use `SO_PMI` and `Doc_Distance` to annotate suspicious mined span tokens,
    where `Doc_Distance` is defined as:

        doc_dist(w) = NDoc_pos(w) - NDoc_neg(w)

    Parameters
    ----------
    alia_base_emo : bool, whether to include the base emotion vocab
    """
    alter_tks = [x[0] for x in self.alter_tks]
    pos_words = Vocab.gene_from_list(self.alia_pos_words, Vocab().postive_name, 1)
    neg_words = Vocab.gene_from_list(self.alia_neg_words, Vocab().negtive_name, -1)
    if alia_base_emo:
        pos_words += self.base_pos_words
        neg_words += self.base_neg_words
    emo_vocab = pos_words + neg_words

    # filter base emo tokens
    alter_tks = [tk for tk in alter_tks if tk not in emo_vocab.tk2idx]

    # create each mat
    emo_vocab += Vocab.gene_from_list(alter_tks, name=Vocab().alters_name, score=0)
    emo_mat = token_emotion_mat(emo_vocab)
    label_mat = doc_label_mat(self.doc_labels)
    doc_mat = doc_onehot_mat(self.doc_tokens, emo_vocab)
    alter_idx = emo_vocab.get_group(emo_vocab.alters_name)

    # so_pmi
    so_pmi_scores_obj = pair_pmi(doc_mat, emo_mat, emo_vocab)
    so_pmi_scores = [exam.label for exam in so_pmi_scores_obj]

    # doc_distance
    doc_dist = np.sum(doc_mat[alter_idx] * label_mat, axis=1)
    pmi_dist_scores = so_pmi_scores * doc_dist

    # only keep tokens whose combined score is greater than 0
    res_idx = np.where(pmi_dist_scores > 0)[0]
    res_exam = [so_pmi_scores_obj[idx] for idx in res_idx]
    print(f"mining new span token {len(res_exam)}")
    self.new_tks = res_exam
    return res_exam
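# Illustrative sketch (not from the original repo): the Doc_Distance term from the
# docstring above, computed directly. `doc_row` is a hypothetical 0/1 row of the
# one-hot matrix for one candidate token, and `label_row` holds the document labels
# (+1 positive, -1 negative), so their dot product is NDoc_pos(w) - NDoc_neg(w).
import numpy as np

doc_row = np.array([1, 0, 1, 1, 0])       # token appears in docs 0, 2, 3
label_row = np.array([1, 1, -1, 1, -1])   # document sentiment labels
doc_dist = int(np.sum(doc_row * label_row))
# doc_dist == 1 + (-1) + 1 == 1  -> the token leans slightly positive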
def test_from_simulated_file():
    from io import StringIO
    l = StringIO(initial_value="""
#comment
# another comment line
sheep
rats #comment
squirrels
""")
    vocab = Vocab(l)
    assert sorted(vocab.as_list()) == ["rats", "sheep", "squirrels"]
    assert vocab.has("sheep")
    assert vocab.has("rats")
    assert vocab.has("squirrels")
    assert not vocab.has("#comment")
def __init__(
    self,
    examples: List[Example],
    seed_tokens: List[Example],
    extreme_words: Union[List[Text], Vocab],
    deny_words: Union[List[Text], Vocab],
    base_pos_words: Union[List[Text], Vocab] = None,
    base_neg_words: Union[List[Text], Vocab] = None,
):
    """
    Parameters
    ----------
    examples : List[Example], each example should expose `tokens`
    seed_tokens : List[Example], mined seed tokens
    extreme_words : Union[List[Text], Vocab], a set of extreme (intensifier) words
    deny_words : Union[List[Text], Vocab], a set of deny (negation) words
    base_pos_words : base positive words, if needed
    base_neg_words : base negative words, if needed
    """
    self.doc_tokens = [exam.get("tokens") for exam in examples]
    self.doc_labels = [exam.label for exam in examples]
    self.doc_size = len(self.doc_tokens)
    self.seed_tokens = seed_tokens
    self.alia_pos_words, self.alia_neg_words = self._alia_emo_words()
    self.extreme_words = Vocab.gene_from_list(extreme_words, score=2) \
        if isinstance(extreme_words, List) else extreme_words
    self.deny_words = Vocab.gene_from_list(deny_words) \
        if isinstance(deny_words, List) else deny_words
    self.span_words = self.extreme_words + self.deny_words

    # vocab
    self.base_pos_words = Vocab.gene_from_list(base_pos_words,
                                               name=Vocab().postive_name,
                                               score=1) \
        if isinstance(base_pos_words, List) else base_pos_words
    self.base_neg_words = Vocab.gene_from_list(base_neg_words,
                                               name=Vocab().negtive_name,
                                               score=-1) \
        if isinstance(base_neg_words, List) else base_neg_words
    self.alter_tks = None
    self.new_tks = None
###
# Globals
###
app = flask.Flask(__name__)

CONFIG = config.configuration()
app.secret_key = CONFIG.SECRET_KEY  # Should allow using session variables

#
# One shared 'Vocab' object, read-only after initialization,
# shared by all threads and instances. Otherwise we would have to
# store it in the browser and transmit it on each request/response cycle,
# or else read it from the file on each request/response cycle,
# neither of which would be suitable for responding keystroke by keystroke.
WORDS = Vocab(CONFIG.VOCAB)
NUM = min(len(WORDS.as_list()), CONFIG.SUCCESS_AT_COUNT)

###
# Pages
###


@app.route("/")
@app.route("/index")
def index():
    """The main page of the application"""
    flask.g.vocab = WORDS.as_list()
    flask.session["target_count"] = min(len(flask.g.vocab),
                                        CONFIG.SUCCESS_AT_COUNT)
    flask.session["jumble"] = jumbled(flask.g.vocab,
                                      flask.session["target_count"])
def test_single_vocab():
    vocab = Vocab(["moe"])
    assert vocab.as_list() == ["moe"]
    assert vocab.has("moe")
    assert not vocab.has("meeny")
    Patience limit: {args.patience_limit}
    ##############################\n""", file=stdout)

    # Fix the seeds for random number generators.
    if args.seed is not None:
        fix_random_seeds(args.seed)

    # Read the data.
    data_path = args.root + "datasets/%s_en_data/" % args.language
    print(f"Reading training data from {data_path} ...", file=stdout)
    (src_train_sents, tgt_train_sents,
     src_dev_sents, tgt_dev_sents,
     src_test_sents, tgt_test_sents) = get_data(data_path, args.language)

    # Build a vocabulary of source and target language.
    vocab_file = "vocab_%s_en.json" % args.language
    vocab = Vocab.build(src_train_sents, tgt_train_sents,
                        args.vocab_size, args.freq_cutoff)
    vocab.save(vocab_file)

    # Build a model object.
    model = NMT(word_embed_size=args.word_embed_size,
                char_embed_size=args.char_embed_size,
                hidden_size=args.hidden_size,
                vocab=vocab,
                dropout_rate=args.dropout_rate,
                kernel_size=args.kernel_size,
                padding=args.padding)

    # Train the model.
    train_data = list(zip(src_train_sents, tgt_train_sents))
    dev_data = list(zip(src_dev_sents, tgt_dev_sents))
    dataset = {"train_data": train_data, "dev_data": dev_data}
fitlog.add_hyper({'model': args.w, 'fold': args.fold})

# set cuda
config.use_cuda = args.gpu >= 0 and torch.cuda.is_available()
if config.use_cuda:
    torch.cuda.set_device(args.gpu)
    config.device = torch.device("cuda", args.gpu)
else:
    config.device = torch.device("cpu")
logging.info("Use cuda: %s, gpu id: %d.", config.use_cuda, args.gpu)

# vocab
cache_name = "./save/vocab/" + str(args.fold) + ".pickle"
if Path(cache_name).exists():
    vocab_file = open(cache_name, 'rb')
    vocab = pickle.load(vocab_file)
    logging.info('Load vocab from ' + cache_name +
                 ', words %d, labels %d.' % (vocab.word_size, vocab.label_size))
else:
    vocab = Vocab(config.train_file)
    file = open(cache_name, 'wb')
    pickle.dump(vocab, file)
    logging.info('Cache vocab to ' + cache_name)

# model
model = Model(config, vocab)

# trainer
trainer = Trainer(model, config, vocab, fitlog)
trainer.train()
trainer.test()
def train(FLAGS, basedir):
    """Train a previous or new model."""
    # Data paths
    vocab_path = os.path.join(basedir, 'data/processed_reviews/vocab.txt')
    train_data_path = os.path.join(basedir, 'data/processed_reviews/train.p')
    validation_data_path = os.path.join(basedir,
                                        'data/processed_reviews/validation.p')
    vocab = Vocab(vocab_path)
    FLAGS.num_classes = 2

    # Load embeddings (if using GloVe); embeddings is only defined in that case
    if FLAGS.embedding == 'glove':
        with open(os.path.join(basedir,
                               'data/processed_reviews/embeddings.p'), 'rb') as f:
            embeddings = pickle.load(f)
        FLAGS.vocab_size = len(embeddings)

    # Start tensorflow session
    with tf.Session() as sess:

        # Create|reload model
        imdb_model = create_model(sess, FLAGS, len(vocab), basedir)

        # Metrics
        metrics = {
            "train_loss": [],
            "valid_loss": [],
            "train_acc": [],
            "valid_acc": [],
        }

        # Store attention score history for a few samples
        attn_history = {
            "sample_%i" % i: {
                "review": None,
                "label": None,
                "review_len": None,
                "attn_scores": [],
            }
            for i in range(5)
        }

        # Start training
        for train_epoch_num, train_epoch in \
                enumerate(generate_epoch(
                    train_data_path, FLAGS.num_epochs, FLAGS.batch_size)):

            print("==> EPOCH:", train_epoch_num)

            for train_batch_num, (batch_features, batch_seq_lens) in \
                    enumerate(train_epoch):

                batch_reviews, batch_labels = batch_features
                batch_review_lens, = batch_seq_lens

                # Display shapes once
                if (train_epoch_num == 0 and train_batch_num == 0):
                    print("Reviews: ", np.shape(batch_reviews))
                    print("Labels: ", np.shape(batch_labels))
                    print("Review lens: ", np.shape(batch_review_lens))

                _, train_logits, train_loss, train_acc, lr, attn_scores = \
                    imdb_model.train(
                        sess=sess,
                        batch_reviews=batch_reviews,
                        batch_labels=batch_labels,
                        batch_review_lens=batch_review_lens,
                        embeddings=embeddings,
                        keep_prob=FLAGS.keep_prob,
                    )

                for valid_epoch_num, valid_epoch in \
                        enumerate(generate_epoch(
                            data_path=validation_data_path,
                            num_epochs=1,
                            batch_size=FLAGS.batch_size,
                        )):

                    for valid_batch_num, \
                            (valid_batch_features, valid_batch_seq_lens) in \
                            enumerate(valid_epoch):

                        valid_batch_reviews, valid_batch_labels = \
                            valid_batch_features
                        valid_batch_review_lens, = valid_batch_seq_lens

                        valid_logits, valid_loss, valid_acc = imdb_model.eval(
                            sess=sess,
                            batch_reviews=valid_batch_reviews,
                            batch_labels=valid_batch_labels,
                            batch_review_lens=valid_batch_review_lens,
                            embeddings=embeddings,
                            keep_prob=1.0,  # no dropout for val|test
                        )

                print("[EPOCH]: %i, [LR]: %.6e, [TRAIN ACC]: %.3f, "
                      "[VALID ACC]: %.3f [TRAIN LOSS]: %.6f, [VALID LOSS]: %.6f" % (
                          train_epoch_num, lr, train_acc, valid_acc,
                          train_loss, valid_loss))

                # Store the metrics
                metrics["train_loss"].append(train_loss)
                metrics["valid_loss"].append(valid_loss)
                metrics["train_acc"].append(train_acc)
                metrics["valid_acc"].append(valid_acc)

                # Store attn history
                for i in range(5):
                    sample = "sample_%i" % i
                    attn_history[sample]["review"] = batch_reviews[i]
                    attn_history[sample]["label"] = batch_labels[i]
                    attn_history[sample]["review_len"] = batch_review_lens[i]
                    attn_history[sample]["attn_scores"].append(attn_scores[i])

            # Save the model (maybe)
            if ((train_epoch_num == (FLAGS.num_epochs - 1)) or
                    ((train_epoch_num % FLAGS.save_every == 0) and
                     (train_epoch_num > 0))):

                # Make parent ckpt dir if it does not exist
                if not os.path.isdir(os.path.join(basedir, FLAGS.data_dir, 'ckpt')):
                    os.makedirs(os.path.join(basedir, FLAGS.data_dir, 'ckpt'))

                # Make child ckpt dir for this specific model
                if not os.path.isdir(os.path.join(basedir, FLAGS.ckpt_dir)):
                    os.makedirs(os.path.join(basedir, FLAGS.ckpt_dir))

                checkpoint_path = os.path.join(
                    basedir, FLAGS.ckpt_dir, "%s.ckpt" % FLAGS.model_name)
                print("==> Saving the model.")
                imdb_model.saver.save(sess, checkpoint_path,
                                      global_step=imdb_model.global_step)

                # Save the metrics
                metrics_file = os.path.join(basedir, FLAGS.ckpt_dir, 'metrics.p')
                with open(metrics_file, 'wb') as f:
                    pickle.dump(metrics, f)

                # Save the attention scores
                attn_history_file = os.path.join(basedir, FLAGS.ckpt_dir,
                                                 'attn_history.p')
                with open(attn_history_file, 'wb') as f:
                    pickle.dump(attn_history, f)
###
# Globals
###
app = flask.Flask(__name__)

CONFIG = config.configuration()
app.secret_key = CONFIG.SECRET_KEY  # Should allow using session variables

#
# One shared 'Vocab' object, read-only after initialization,
# shared by all threads and instances. Otherwise we would have to
# store it in the browser and transmit it on each request/response cycle,
# or else read it from the file on each request/response cycle,
# neither of which would be suitable for responding keystroke by keystroke.
WORDS = Vocab(CONFIG.VOCAB)

###
# Pages
###


@app.route("/")
@app.route("/index")
def index():
    """The main page of the application"""
    flask.g.vocab = WORDS.as_list()
    flask.session["target_count"] = min(len(flask.g.vocab),
                                        CONFIG.SUCCESS_AT_COUNT)
    flask.session["jumble"] = jumbled(flask.g.vocab,
                                      flask.session["target_count"])
from src.jtnn_vae import JTNNVAE
from src.vocab import Vocab

lg = rdkit.RDLogger.logger()
lg.setLevel(rdkit.RDLogger.CRITICAL)

parser = argparse.ArgumentParser()
parser.add_argument('--nsample', type=int, required=True)
parser.add_argument('--vocab', required=True)
parser.add_argument('--model', required=True)
parser.add_argument('--output', required=True)
parser.add_argument('--config', required=True)
args = parser.parse_args()

config = load_json_config(args.config)

vocab = get_vocab(args.vocab)
vocab = Vocab(vocab)

model = JTNNVAE(vocab, config['hidden_size'], config['latent_size'],
                config['depthT'], config['depthG'])
train_model_params = paddle.load(args.model)
model.set_state_dict(train_model_params)
model.eval()

res = []
for i in range(args.nsample):
    smi = model.sample_prior()
    print(i, smi)
    res.append(smi)

with open(args.output, 'w') as f:
    for smi in res:
        f.write(smi + '\n')
fold2data(fold_num)

# convert each fold data
for fold in range(9, fold_num):
    cache_name = "./save/vocab/" + str(fold) + ".pickle"
    train = "train_" + str(fold)
    dev = "dev_" + str(fold)
    files = [train, dev]

    # build vocab
    if Path(cache_name).exists():
        vocab_file = open(cache_name, 'rb')
        vocab = pickle.load(vocab_file)
        vocab_name = "./save/vocab/vocab.txt"
        vocab.dump(vocab_name)
        print('Load vocab from ' + cache_name)
    else:
        vocab = Vocab('./data/' + train + '.pickle')
        file = open(cache_name, 'wb')
        pickle.dump(vocab, file)
        print('Save vocab to ' + cache_name)

    for file in files:
        pass
        # data 2 word2vec
        # convert_data_word2vec(file)

    # data 2 bert
    # convert_data_bert_pretrain(train)
def _truncate(val: np.array, trun_count: int, vocab: Vocab = None):
    """Return the indices of the `trun_count` largest values in `val`,
    or the corresponding tokens when a vocab is given."""
    sort_index = np.argsort(val, axis=None)[::-1][:trun_count]
    if vocab:
        return vocab.get_tks(sort_index)
    return sort_index
# test for load data
from src.utils import read_data_from_csv, read_line_from_txt, Example
from src.vocab import Vocab

dataset = read_data_from_csv("../corpus/weibo_senti_100k.csv",
                             label_map={"1": 1, "0": -1})
dataset = dataset[:10] + dataset[-10:]
dataset_size = len(dataset)

stop_word_dict = read_line_from_txt("../dict/stopword.txt")
base_posword_dict = read_line_from_txt("../dict/ntusd/NTUSD_positive.txt")
base_negword_dict = read_line_from_txt("../dict/ntusd/NTUSD_negative.txt")

extreme_words = read_line_from_txt("../dict/hownet/extreme.txt")
extreme_word_dict = Vocab()
for word in extreme_words:
    if not word.startswith("-"):
        _lb, _tt = word.strip().split(',')
        _lb = int(_lb)
        extreme_word_dict.add(_tt, _lb)

deny_words_dict = read_line_from_txt("../dict/deny.txt")

# ================= #
# test for label\emoji distribution
# from src.utils import label_distribution_viewer, emoji_distribution_viewer
# label_distribution_viewer(dataset, label_map={"1": "pos", "0": "neg"}, verbose=True)
# emoji_distribution_viewer(dataset, drop_df = 5)

# =================
# test normalizer