def _pre_process(self):
    self.options = train.config()
    with open('./out/options.pkl', 'rb') as f:
        opt = pickle.load(f)
    self.options.__dict__.update(opt)
    self.options.batch_size = 1
    vocab_file = './data/vocab.txt'
    self.data_tools = data_process.Data(vocab_file, None, self.options, logging)
    self.tokenizer = utils.Tokenizer(logging)
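# Hedged sketch (assumption, not from the source): _pre_process expects
# ./out/options.pkl to hold a plain dict of hyperparameters, so the file was
# presumably written at training time with something like the helper below;
# save_options and the vars(options) conversion are hypothetical here.
import pickle

def save_options(options, path='./out/options.pkl'):
    # vars(options) turns the config object into a pickle-friendly dict that
    # __dict__.update() can later restore at inference time
    with open(path, 'wb') as f:
        pickle.dump(vars(options), f)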
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--textstring', help='the text you want to generate', default='Generating text', type=str)
    parser.add_argument('--writersource', help="path of the image of the desired writer (e.g. './assets/image.png'), "
                        "will use a random image from ./assets if unspecified", default=None)
    parser.add_argument('--name', help="path for generated image (e.g. './assets/sample.png'), "
                        "will not be saved if unspecified", default=None)
    parser.add_argument('--diffmode', help="what kind of y_t-1 prediction to use, use 'standard' for "
                        "Eq 9 in paper, will default to prediction in Eq 12", default='new', type=str)
    parser.add_argument('--show', help="whether to show the sample (popup from matplotlib)", default=False, type=bool)
    parser.add_argument('--weights', help='the path of the loaded weights', default='./weights/model_weights.h5', type=str)
    parser.add_argument('--seqlen', help='number of timesteps in generated sequence, default 16 * length of text',
                        default=None, type=int)
    parser.add_argument('--num_attlayers', help='number of attentional layers at lowest resolution, '
                        'only change this if loaded model was trained with that hyperparameter', default=2, type=int)
    parser.add_argument('--channels', help='number of channels at lowest resolution, only change '
                        'this if loaded model was trained with that hyperparameter', default=128, type=int)
    args = parser.parse_args()

    timesteps = len(args.textstring) * 16 if args.seqlen is None else args.seqlen
    timesteps = timesteps - (timesteps % 8) + 8  # must be divisible by 8 due to downsampling layers

    if args.writersource is None:
        assetdir = os.listdir('./assets')
        sourcename = './assets/' + assetdir[np.random.randint(0, len(assetdir))]
    else:
        sourcename = args.writersource

    L = 60
    tokenizer = utils.Tokenizer()
    beta_set = utils.get_beta_set()
    alpha_set = tf.math.cumprod(1 - beta_set)

    C1 = args.channels
    C2 = C1 * 3 // 2
    C3 = C1 * 2

    style_extractor = nn.StyleExtractor()
    model = nn.DiffusionWriter(num_layers=args.num_attlayers, c1=C1, c2=C2, c3=C3)

    _stroke = tf.random.normal([1, 400, 2])
    _text = tf.random.uniform([1, 40], dtype=tf.int32, maxval=50)
    _noise = tf.random.uniform([1, 1])
    _style_vector = tf.random.normal([1, 14, 1280])
    _ = model(_stroke, _text, _noise, _style_vector)  # we have to call the model on input first
    model.load_weights(args.weights)

    writer_img = tf.expand_dims(preprocessing.read_img(sourcename, 96), 0)
    style_vector = style_extractor(writer_img)
    utils.run_batch_inference(model, beta_set, args.textstring, style_vector, tokenizer=tokenizer,
                              time_steps=timesteps, diffusion_mode=args.diffmode,
                              show_samples=args.show, path=args.name)
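# Worked example (illustrative only) of the sequence-length rounding used in
# main() above: the downsampling layers need a length divisible by 8, and the
# formula always rounds up, so lengths already divisible by 8 still gain 8 steps.
for t in (240, 250):
    print(t, '->', t - (t % 8) + 8)   # 240 -> 248, 250 -> 256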
def __init__(self, hps, data_directory):
    self.limit = 1000
    if not hps.use_continuous_data and hps.token_type == 'dictionary':
        self.tokenizer = utils.Tokenizer(hps.tokenizer_dict_file, max_seq_len=0)
    elif not hps.use_continuous_data and hps.token_type == 'grid':
        self.tokenizer = utils.GridTokenizer(resolution=100)
    meta_file = [f for f in glob.glob("{}/*".format(data_directory))
                 if os.path.basename(f).startswith('meta')][0]
    meta_dict = np.load(meta_file, allow_pickle=True)
    self.n_classes = int(meta_dict['n_classes'])
    self.n_samples = int(meta_dict['n_samples_train'])
    self.class_names = meta_dict['class_names']
    self.scale_factor = float(meta_dict['std'])
    super().__init__(hps, data_directory)
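# Hedged sketch (assumption, not from the source): the meta file read in
# __init__ above is presumably an .npz archive whose keys match the lookups
# (n_classes, n_samples_train, class_names, std); write_meta is a hypothetical
# helper showing how a compatible file could be produced.
import numpy as np

def write_meta(path, class_names, n_samples_train, std):
    np.savez(path,                      # e.g. '<data_directory>/meta.npz'
             n_classes=len(class_names),
             n_samples_train=n_samples_train,
             class_names=np.array(class_names),
             std=std)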
# Default settings ---------------------------------------------------------
set_defaults()

# Print the configurations
args_str = 'Experiment Configuration\n'
for k in vars(args):
    args_str += f' - {k[:30]}'.ljust(35) + f'{getattr(args, k)}\n'
logger.debug(args_str)

# Initialize tokenizer and set the special tokens
TokB = BertTokenizer.from_pretrained('bert-base-uncased')
TokC = None
spt_ids_B, spt_ids_C, eos_mapping = utils.get_special_tokens(bert_tokenizer=TokB)
if args.model_type == 'abs':
    TokC = utils.Tokenizer(vocab_size=args.vocab_size)
    TokC.from_pretrained(args.file_dec_emb)

# Model and Criterion ------------------------------------------------------
if args.model_type == 'rel':
    model = DocRelClassifier(bert_model=args.bert_model).cuda()
    criterion = TASummEncLoss(pos_weight=args.crit_pos_weight, reduction='mean')
elif args.model_type == 'ext':
    model = ExtractiveClassifier(args).cuda()
    criterion = TASummEncLoss(pos_weight=args.crit_pos_weight)
elif args.model_type == 'abs':
    model = AbstractiveSummarizer(args).cuda()
    if args.file_trained_ext is not None:
        model.load_ext_model(args.file_trained_ext)
    criterion = TASummDecLoss(model.generator, 0, model.decoder.vocab_size)
pos_unigram_lm = NgramModel(1, pos_movie_reviews.words(), estimator)
print "Positive unigram model complete."
pos_bigram_lm = NgramModel(2, pos_movie_reviews.words(), estimator)
print "Positive bigram model complete."
#pos_trigram_lm = NgramModel(3, pos_movie_reviews.words(), estimator)

neg_unigram_lm = NgramModel(1, neg_movie_reviews.words(), estimator)
print "Negative unigram model complete."
neg_bigram_lm = NgramModel(2, neg_movie_reviews.words(), estimator)
print "Negative bigram model complete."
#neg_trigram_lm = NgramModel(3, neg_movie_reviews.words(), estimator)

#read in the tweets
tweets = []
tokenizer = utils.Tokenizer()
neg_review_higher = 0
pos_review_higher = 0
with open(sys.argv[2], 'r') as tweets_file:
    tweets.extend(tweets_file.readlines())

for tweet in tweets:
    tokens = tokenizer.tokenize(tweet)
    pu = pos_unigram_lm.perplexity(tokens)
    nu = neg_unigram_lm.perplexity(tokens)
    pb = pos_bigram_lm.perplexity(tokens)
    nb = neg_bigram_lm.perplexity(tokens)
    #pt = pos_trigram_lm.perplexity(tokens)
    #nt = neg_trigram_lm.perplexity(tokens)
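    # Hedged sketch (assumption, not from the source): given the
    # pos_review_higher / neg_review_higher counters declared above, the loop
    # presumably goes on to compare the perplexities and tally which review
    # language model is more "surprised" by the tweet; the exact comparison
    # (and whether unigram or bigram scores are used) is not shown here.
    if pb > nb:
        pos_review_higher += 1   # positive-review bigram model fits this tweet worse
    else:
        neg_review_higher += 1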
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--steps', help='number of trainsteps, default 60k', default=60000, type=int)
    parser.add_argument('--batchsize', help='default 96', default=96, type=int)
    parser.add_argument('--seqlen', help='sequence length during training, default 480', default=480, type=int)
    parser.add_argument('--textlen', help='text length during training, default 50', default=50, type=int)
    parser.add_argument('--width', help='offline image width, default 1400', default=1400, type=int)
    parser.add_argument('--warmup', help='number of warmup steps, default 10k', default=10000, type=int)
    parser.add_argument('--dropout', help='dropout rate, default 0', default=0.0, type=float)
    parser.add_argument('--num_attlayers', help='number of attentional layers at lowest resolution',
                        default=2, type=int)
    parser.add_argument('--channels', help='number of channels in first layer, default 128', default=128, type=int)
    parser.add_argument('--print_every', help='show train loss every n iters', default=1000, type=int)
    parser.add_argument('--save_every', help='save ckpt every n iters', default=10000, type=int)
    args = parser.parse_args()

    NUM_STEPS = args.steps
    BATCH_SIZE = args.batchsize
    MAX_SEQ_LEN = args.seqlen
    MAX_TEXT_LEN = args.textlen
    WIDTH = args.width
    DROP_RATE = args.dropout
    NUM_ATTLAYERS = args.num_attlayers
    WARMUP_STEPS = args.warmup
    PRINT_EVERY = args.print_every
    SAVE_EVERY = args.save_every
    C1 = args.channels
    C2 = C1 * 3 // 2
    C3 = C1 * 2
    MAX_SEQ_LEN = MAX_SEQ_LEN - (MAX_SEQ_LEN % 8) + 8
    BUFFER_SIZE = 3000
    L = 60

    tokenizer = utils.Tokenizer()
    beta_set = utils.get_beta_set()
    alpha_set = tf.math.cumprod(1 - beta_set)

    style_extractor = nn.StyleExtractor()
    model = nn.DiffusionWriter(num_layers=NUM_ATTLAYERS, c1=C1, c2=C2, c3=C3, drop_rate=DROP_RATE)
    lr = nn.InvSqrtSchedule(C3, warmup_steps=WARMUP_STEPS)
    optimizer = tf.keras.optimizers.Adam(lr, beta_1=0.9, beta_2=0.98, clipnorm=100)

    path = './data/train_strokes.p'
    strokes, texts, samples = utils.preprocess_data(path, MAX_TEXT_LEN, MAX_SEQ_LEN, WIDTH, 96)
    dataset = utils.create_dataset(strokes, texts, samples, style_extractor, BATCH_SIZE, BUFFER_SIZE)

    train(dataset, NUM_STEPS, model, optimizer, alpha_set, PRINT_EVERY, SAVE_EVERY)
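# Hedged sketch (assumption): nn.InvSqrtSchedule is not shown in this excerpt.
# An inverse-square-root warmup schedule commonly follows the Transformer
# recipe lr = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5); the
# class below is a minimal illustration of that recipe, not the repo's code.
import tensorflow as tf

class InvSqrtScheduleSketch(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=10000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        # linear warmup for warmup_steps, then decay proportional to 1/sqrt(step)
        return tf.math.rsqrt(self.d_model) * tf.minimum(
            tf.math.rsqrt(step), step * self.warmup_steps ** -1.5)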
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from param import args
import numpy as np


def check(ar):
    ar = ar.cpu().detach().numpy()
    return np.any(np.isnan(ar))


def check2(ar):
    # ar = ar.cpu().numpy()
    return np.any(np.isnan(ar))


import utils

TRAIN_VOCAB = '../tasks/R2R/data/train_vocab.txt'
vocab = utils.read_vocab(TRAIN_VOCAB)
tok = utils.Tokenizer(vocab=vocab, encoding_length=args.maxInput)


class EncoderLSTM(nn.Module):
    ''' Encodes navigation instructions, returning hidden state context (for
        attention methods) and a decoder initial state. '''

    def __init__(self, vocab_size, embedding_size, hidden_size, padding_idx,
                 dropout_ratio, bidirectional=False, num_layers=1):
        super(EncoderLSTM, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.drop = nn.Dropout(p=dropout_ratio)
        if bidirectional:
            print("Using Bidir in EncoderLSTM")
        self.num_directions = 2 if bidirectional else 1
                    default=233, help="random seed for initialization")
args = parser.parse_args()
logger.info('Args: {}'.format(args))

config = utils.Config(args.model_dir)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
utils.set_seed(args.seed)

processor = utils.MsraNerProcessor(args.data_dir)
train_examples = processor.get_train_examples()
dev_examples = processor.get_test_examples()
label2idx = processor.get_label2id()
tokenizer = utils.Tokenizer(args.model_dir)

train_features = utils.examples_to_ids(train_examples, tokenizer, label2idx, args.max_len)
all_label = []
for f in train_features:
    all_label.extend(f.token_label_ids)  # flatten label ids so np.bincount reports per-class counts
logger.info(np.bincount(all_label))
dev_features = utils.examples_to_ids(dev_examples, tokenizer, label2idx, args.max_len)

train_dataloader = DataLoader(utils.MyDataset(train_features), batch_size=args.train_batch_size,
                              shuffle=True, collate_fn=utils.TokenClfCollate())
dev_dataloader = DataLoader(utils.MyDataset(dev_features), batch_size=args.eval_batch_size,
                            shuffle=False,
# Purpose :
# Creation Date : 09-18-2012
# Last Modified : Tue 02 Oct 2012 04:53:17 PM MDT
# Created By : Nathan Gilbert
#
import sys
import utils
import operator

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print "Usage: %s <tweets> <outfile>" % (sys.argv[0])
        sys.exit(1)

    unigrams = {}
    tok = utils.Tokenizer(preserve_case=False)
    with open(sys.argv[1], 'r') as inFile:
        for tweet in inFile:
            #each line is a tweet
            tweet = tweet.strip()
            #split the words
            tokens = tok.tokenize(tweet)
            #print tokens
            for t in tokens:
                unigrams[t] = unigrams.get(t, 0) + 1

    #print len(unigrams.keys())
    features = map(
        lambda x: x[0],
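# Hedged sketch (not from the source): the manual unigram counting above is
# equivalent to collections.Counter, and the truncated `features = map(...)`
# presumably keeps the token of each (token, count) pair, e.g. ranked by count.
# top_unigrams and its n parameter are hypothetical names for illustration.
from collections import Counter

def top_unigrams(token_lists, n=1000):
    counts = Counter()
    for tokens in token_lists:
        counts.update(tokens)          # same tallying as unigrams.get(t, 0) + 1
    return [tok for tok, _ in counts.most_common(n)]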