def get_tokenizer(dataset_config, model_config):
    with open(dataset_config.vocab, mode="rb") as io:
        vocab = pickle.load(io)
    pad_sequence = PadSequence(
        length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token)
    )
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_morphs, pad_fn=pad_sequence)
    return tokenizer
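# --- illustrative usage (not part of the original snippet; the call below is an
# assumption) ---
# Given config objects exposing `vocab` (path to a pickled Vocab) and `length`,
# the helper above returns a Tokenizer whose `split_and_transform` (used by the
# other snippets in this collection) maps raw text to padded token indices:
#
#     tokenizer = get_tokenizer(dataset_config, model_config)
#     indices = tokenizer.split_and_transform("이 영화 정말 재미있다")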
def prepare_data(self):
    pad_sequence = PadSequence(
        length=self.hparams.length,
        pad_val=self.vocab.to_indices(self.vocab.padding_token),
    )
    tokenizer = Tokenizer(vocab=self.vocab, split_fn=split_to_jamo, pad_fn=pad_sequence)
    self.tokenizer = tokenizer
import os
import string

import numpy as np
import gensim
from nltk.corpus import stopwords
import tensorflow as tf
import tensorflow_hub as hub

from model.utils import embedding_metric, Tokenizer, detokenize
from torchMoji.api.botmoji import Botmoji
from inferSent.api.botsent import Botsent
from Toxicity.toxic import NBLogisticRegression, NBTfidfVectorizer, tokenize

EPSILON = np.finfo(np.float32).eps
ROOT_DIR = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

tokenizer = Tokenizer('spacy')
stopwords = stopwords.words('english')
question_words = {'who', 'what', 'why', 'where', 'how', 'when'}
_ = [stopwords.remove(q) for q in question_words]
punct = list(string.punctuation)
contractions = ["'s", "'d", "'ld", "n't", "'re", "'ll", "'ve"]
filters = set(stopwords + contractions + punct)


def _get_emojis():
    # All emojis in the order returned by deepmoji
    EMOJIS = ":joy: :unamused: :weary: :sob: :heart_eyes: :pensive: " + \
             ":ok_hand: :blush: :heart: :smirk: :grin: :notes: :flushed: " + \
             ":100: :sleeping: :relieved: :relaxed: :raised_hands: " + \
             ":two_hearts: :expressionless: :sweat_smile: :pray: " + \
             ":confused: :kissing_heart: :heartbeat: :neutral_face: " + \
def get_tokenizer(dataset_config):
    with open(dataset_config.vocab, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo)
    return tokenizer
app = Flask(__name__)
app.config.from_pyfile("config.py")
app.database = create_engine(app.config["DB_URL"], encoding="utf-8", max_overflow=0)

# preprocessor & model
num_classes = app.config["MODEL"]["num_classes"]
max_length = app.config["MODEL"]["length"]

with open("model/checkpoint/vocab.pkl", mode="rb") as io:
    vocab = pickle.load(io)

pad_sequence = PadSequence(length=max_length, pad_val=vocab.to_indices(vocab.padding_token))
tokenizer = Tokenizer(vocab=vocab, split_fn=split_morphs, pad_fn=pad_sequence)
model = SenCNN(num_classes=app.config["MODEL"]["num_classes"], vocab=vocab)
ckpt = torch.load("model/checkpoint/best.tar", map_location=torch.device("cpu"))
model.load_state_dict(ckpt["model_state_dict"])
model.eval()


@app.route("/alive_check", methods=["GET"])
def alive_check():
    return "alive", 200


@app.route("/inference", methods=["POST"])
def inference():
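# --- hypothetical continuation (not the original handler body) ---
# A minimal sketch of how the /inference endpoint could use the tokenizer and
# model set up above, assuming the request carries JSON with a "text" field and
# that tokenizer.split_and_transform returns padded indices as in the other
# snippets in this collection:
#
#     sentence = request.json["text"]
#     indices = torch.tensor([tokenizer.split_and_transform(sentence)])
#     with torch.no_grad():
#         score = model(indices)
#     return jsonify({"label": score.argmax(dim=-1).item()}), 200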
help="Directory containing config.json of data") parser.add_argument('--model_dir', default='experiments/base_model', help="Directory containing config.json of model") if __name__ == '__main__': args = parser.parse_args() data_dir = Path(args.data_dir) model_dir = Path(args.model_dir) data_config = Config(data_dir / 'config.json') model_config = Config(model_dir / 'config.json') # tokenizer with open(data_config.vocab, mode='rb') as io: vocab = pickle.load(io) tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo) # model model = ConvRec(num_classes=model_config.num_classes, embedding_dim=model_config.embedding_dim, hidden_dim=model_config.hidden_dim, vocab=tokenizer.vocab) # training tr_ds = Corpus(data_config.train, tokenizer.split_and_transform, min_length=model_config.min_length, pad_val=tokenizer.vocab.to_indices(' ')) tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size, shuffle=True,
def get_tokenizer(dataset_config, split_fn):
    with open(dataset_config.vocab, mode="rb") as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab, split_fn)
    return tokenizer
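# --- illustrative usage (not part of the original snippet) ---
# The split_fn argument is pluggable; other snippets in this collection pass
# split_morphs, split_to_jamo, MeCab().morphs, or a BertTokenizer's tokenize
# method, e.g.:
#
#     tokenizer = get_tokenizer(dataset_config, split_fn=split_morphs)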
help="name of the data in --data_dir to be evaluate") if __name__ == '__main__': args = parser.parse_args() data_dir = Path(args.data_dir) model_dir = Path(args.model_dir) data_config = Config(json_path=data_dir / 'config.json') model_config = Config(json_path=model_dir / 'config.json') # tokenizer with open(data_config.vocab, mode='rb') as io: vocab = pickle.load(io) pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token)) tokenizer = Tokenizer(vocab=vocab, split_fn=split_to_jamo, pad_fn=pad_sequence) # model (restore) checkpoint_manager = CheckpointManager(model_dir) checkpoint = checkpoint_manager.load_checkpoint(args.restore_file + '.tar') model = CharCNN(num_classes=model_config.num_classes, embedding_dim=model_config.embedding_dim, vocab=tokenizer.vocab) model.load_state_dict(checkpoint['model_state_dict']) # evaluation summary_manager = SummaryManager(model_dir) filepath = getattr(data_config, args.data_name) ds = Corpus(filepath, tokenizer.split_and_transform) dl = DataLoader(ds, batch_size=model_config.batch_size, num_workers=4)
"--dataset", default="validation", help="name of the data in --data_dir to be evaluate", ) if __name__ == "__main__": args = parser.parse_args() data_dir = Path(args.data_dir) model_dir = Path(args.model_dir) data_config = Config(data_dir / "config.json") model_config = Config(model_dir / "config.json") # tokenizer with open(data_config.vocab, mode="rb") as io: vocab = pickle.load(io) tokenizer = Tokenizer(vocab, split_morphs) # model (restore) checkpoint_manager = CheckpointManager(model_dir) checkpoint = checkpoint_manager.load_checkpoint("best.tar") model = MaLSTM( num_classes=model_config.num_classes, hidden_dim=model_config.hidden_dim, vocab=tokenizer.vocab, ) model.load_state_dict(checkpoint["model_state_dict"]) # evaluation filepath = getattr(data_config, args.dataset) ds = Corpus(filepath, tokenizer.split_and_transform) dl = DataLoader(ds,
    identity = torch.eye(r).to(device)
    p = torch.norm(sim_mat - identity, dim=(1, 2)).mean()
    return p


if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # tokenizer
    with open(data_config.vocab, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs)

    # model
    model = SAN(num_classes=model_config.num_classes, lstm_hidden_dim=model_config.lstm_hidden_dim,
                da=model_config.da, r=model_config.r, hidden_dim=model_config.hidden_dim,
                vocab=tokenizer.vocab)

    # training
    tr_ds = Corpus(data_config.train, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4,
help="Directory containing config.json of model") if __name__ == '__main__': args = parser.parse_args() data_dir = Path(args.data_dir) model_dir = Path(args.model_dir) data_config = Config(json_path=data_dir / 'config.json') model_config = Config(json_path=model_dir / 'config.json') # tokenizer with open(data_config.vocab, mode='rb') as io: vocab = pickle.load(io) pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token)) tokenizer = Tokenizer(vocab=vocab, split_fn=MeCab().morphs, pad_fn=pad_sequence) # model model = SenCNN(num_classes=model_config.num_classes, vocab=tokenizer.vocab) # training tr_ds = Corpus(data_config.train, tokenizer.split_and_transform) tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4, drop_last=True) val_ds = Corpus(data_config.validation, tokenizer.split_and_transform) val_dl = DataLoader(val_ds, batch_size=model_config.batch_size)
    identity = torch.eye(r).to(device)
    p = torch.norm(sim_mat - identity, dim=(1, 2)).mean()
    return p


if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(data_dir / 'config.json')
    model_config = Config(model_dir / 'config.json')

    # tokenizer
    with open(data_config.vocab, mode='rb') as io:
        vocab = pickle.load(io)
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_morphs)

    # model
    model = SAN(num_classes=model_config.num_classes, lstm_hidden_dim=model_config.lstm_hidden_dim,
                da=model_config.da, r=model_config.r, hidden_dim=model_config.hidden_dim,
                vocab=tokenizer.vocab)

    # training
    tr_ds = Corpus(data_config.train, tokenizer.split_and_transform)
    tr_dl = DataLoader(tr_ds, batch_size=model_config.batch_size, shuffle=True, num_workers=4,
help="Directory containing config.json of model", ) if __name__ == "__main__": args = parser.parse_args() data_dir = Path(args.data_dir) model_dir = Path(args.model_dir) data_config = Config(json_path=data_dir / "config.json") model_config = Config(json_path=model_dir / "config.json") # tokenizer with open(data_config.token_vocab, mode="rb") as io: token_vocab = pickle.load(io) with open(data_config.label_vocab, mode="rb") as io: label_vocab = pickle.load(io) token_tokenizer = Tokenizer(token_vocab, split_to_self) label_tokenizer = Tokenizer(label_vocab, split_to_self) # model model = BilstmCRF(label_vocab, token_vocab, model_config.lstm_hidden_dim) # training tr_ds = Corpus( data_config.train, token_tokenizer.split_and_transform, label_tokenizer.split_and_transform, ) tr_dl = DataLoader( tr_ds, batch_size=model_config.batch_size, shuffle=True,
args = parser.parse_args()
data_dir = Path(args.data_dir)
model_dir = Path(args.model_dir)
data_config = Config(data_dir / 'config.json')
model_config = Config(model_dir / 'config.json')

# Vocab and Tokenizer
ptr_dir = Path("pretrained")
vocab_filepath = ptr_dir / "{}-vocab.pkl".format(args.type)
with open(vocab_filepath, mode='rb') as io:
    vocab = pickle.load(io)
ptr_tokenizer = BertTokenizer.from_pretrained(args.type, do_lower_case="uncased" in args.type)
ptr_tokenizer = Tokenizer(vocab, ptr_tokenizer.tokenize)
preprocessor = PreProcessor(ptr_tokenizer, model_config.max_len)

# Load Model
config_filepath = ptr_dir / "{}-config.json".format(args.type)
config = BertConfig.from_pretrained(config_filepath, output_hidden_states=False)
model = BIIN(config, vocab, model_config.hidden_size, enc_num_layers=len(model_config.hidden_size))

# Data Loader
tr_ds = Corpus(data_config.tr_path, preprocessor.preprocess,
if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # tokenizer
    ptr_tokenizer = BertTokenizer.from_pretrained(
        'pretrained/vocab.korean.rawtext.list', do_lower_case=False)
    with open('pretrained/vocab.pkl', mode='rb') as io:
        vocab = pickle.load(io)
    pad_sequence = PadSequence(length=model_config.length,
                               pad_val=vocab.to_indices(vocab.padding_token))
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer.tokenize, pad_fn=pad_sequence)

    # model (restore)
    checkpoint_manager = CheckpointManager(model_dir)
    checkpoint = checkpoint_manager.load_checkpoint(args.restore_file + '.tar')
    config = BertConfig('pretrained/bert_config.json')
    model = BertClassifier(config, num_labels=model_config.num_classes, vocab=tokenizer.vocab)
    model.load_state_dict(checkpoint['model_state_dict'])

    # evaluation
    filepath = getattr(data_config, args.data_name)
    ds = Corpus(filepath, tokenizer.preprocess)
    dl = DataLoader(ds, batch_size=model_config.batch_size, num_workers=4)