def main(args):
    print(args)

    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    pattern_specs = OrderedDict(sorted(([int(y) for y in x.split("-")] for x in args.patterns.split("_")),
                                       key=lambda t: t[0]))
    n = args.num_train_instances
    mlp_hidden_dim = args.mlp_hidden_dim
    num_mlp_layers = args.num_mlp_layers

    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab size:", len(dev_vocab))

    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)

    num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, dev_text = read_docs(args.vd, vocab, num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))
    if n is not None:
        dev_data = dev_data[:n]

    num_classes = len(set(dev_labels))
    print("num_classes:", num_classes)

    semiring = \
        MaxPlusSemiring if args.maxplus else (
            LogSpaceMaxTimesSemiring if args.maxtimes else ProbSemiring
        )

    if args.use_rnn:
        rnn = Rnn(word_dim, args.hidden_dim, cell_type=LSTM, gpu=args.gpu)
    else:
        rnn = None

    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings, vocab,
                                  semiring, args.bias_scale_param, args.gpu, rnn=rnn, pre_computed_patterns=None)

    if args.gpu:
        print("Cuda!")
        model.to_cuda(model)
        state_dict = torch.load(args.input_model)
    else:
        state_dict = torch.load(args.input_model, map_location=lambda storage, loc: storage)

    # Loading model
    model.load_state_dict(state_dict)

    interpret_documents(model, args.batch_size, dev_data, dev_text, args.ofile, args.max_doc_len)

    return 0
def test_read_embeddings(self):
    """ Tests that `data.read_embeddings` works for a small file """
    max_vocab_size = 4
    vocab, vecs, dim = read_embeddings(EMBEDDINGS_FILENAME, max_vocab_size=max_vocab_size)
    self.assertEqual(len(vocab), max_vocab_size + 2)
    self.assertEqual(dim, 50)
    self.assertEqual(vocab(UNK_TOKEN), 0)
    self.assertEqual(vocab("the"), 3)
    self.assertEqual(vocab(","), 4)
    self.assertAlmostEqualList(
        list(vecs[vocab("the")])[:10],
        [
            0.0841414179717338836, 0.050259400093738082, -0.08301819043038873,
            0.024497632935789507, 0.069501213835168801, -0.0089489832984913243,
            -0.10001958794687831, -0.035955359038543321, -0.00013290116839109539,
            -0.13217046660344609
        ],
        7)
    self.assertAlmostEqualList(
        list(vecs[vocab(",")])[:10],
        [
            0.0030016593815816841, 0.052886911297237889, -0.037739038679673299,
            0.091452238178075698, 0.14250568295327015, 0.10654428051177782,
            -0.095697572962977706, -0.12425811297566139, -0.081288893303752177,
            -0.05345861340398955
        ],
        7)
def getembeddings(srcpath, trgpath, compath, cutoff=50000):
    # NOTE: the srcpath/trgpath/compath arguments are overwritten by the hard-coded paths below.
    ts = '/home/15CS10013/important-sai/ts12'
    tsdata = ts + '/tsdata'
    compath = tsdata + '/fk.lower.vec'
    srcpath = tsdata + '/fkdifficpart.lower.vec.id'
    trgpath = tsdata + '/fkeasypart.lower.vec.id'
    # First pass: read only the vocabularies and intersect them.
    vocabcom = data.read_embeddings(open(compath), vocabonly=True)
    vocabsrc = data.read_embeddings(open(srcpath), vocabonly=True)
    vocabtrg = data.read_embeddings(open(trgpath), vocabonly=True)
    vocabcom = set(vocabcom.id2word[1:])
    vocabsrc = set(vocabsrc.id2word[1:])
    vocabtrg = set(vocabtrg.id2word[1:])
    vocabinter = vocabcom & vocabsrc & vocabtrg
    # Second pass: read the embeddings restricted to the shared vocabulary.
    embeddcom, vocabcom = data.read_embeddings(open(compath), vocabulary=vocabinter)
    embeddsrc, vocabsrc = data.read_embeddings(open(srcpath), vocabulary=vocabinter)
    embeddtrg, vocabtrg = data.read_embeddings(open(trgpath), vocabulary=vocabinter)
    saveembedds(embeddsrc, vocabsrc, tsdata + '/fkeasypart.lower.vec.id.com')
    saveembedds(embeddtrg, vocabtrg, tsdata + '/fkdifficpart.lower.vec.id.com')
    saveembedds(embeddcom, vocabcom, tsdata + '/fk.lower.vec.com')
    # Concatenate the language-specific and common embeddings (previously commented out,
    # but required for the return statement below).
    embeddsrccom = nn.Embedding(embeddsrc.weight.data.size(0),
                                embeddsrc.weight.data.size(1) + embeddcom.weight.data.size(1))
    embeddsrccom.weight.data = torch.cat([embeddsrc.weight.data, embeddcom.weight.data], dim=1)
    embeddtrgcom = nn.Embedding(embeddtrg.weight.data.size(0),
                                embeddtrg.weight.data.size(1) + embeddcom.weight.data.size(1))
    embeddtrgcom.weight.data = torch.cat([embeddtrg.weight.data, embeddcom.weight.data], dim=1)
    return (embeddsrccom, vocabsrc), (embeddtrgcom, vocabtrg)
def main():
    n = None
    mlp_hidden_dim = 25
    num_mlp_layers = 2

    validation_data_file = "./soft_patterns/data/test.data"
    dev_vocab = vocab_from_text(validation_data_file)
    print("Dev vocab size:", len(dev_vocab))

    embedding_file = "./soft_patterns/glove.6B.50d.txt"
    vocab, embeddings, word_dim = read_embeddings(embedding_file, dev_vocab)

    seed = 100
    torch.manual_seed(seed)
    np.random.seed(seed)

    patterns = "5-50_4-50_3-50_2-50"
    pattern_specs = OrderedDict(
        sorted(([int(y) for y in x.split("-")] for x in patterns.split("_")), key=lambda t: t[0]))
    num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, _ = read_docs(validation_data_file, vocab, num_padding_tokens=num_padding_tokens)
    validation_label_file = "./soft_patterns/data/test.labels"
    dev_labels = read_labels(validation_label_file)
    dev_data = list(zip(dev_input, dev_labels))

    num_classes = len(set(dev_labels))
    print("num_classes:", num_classes)

    semiring = Semiring(zeros, ones, torch.add, torch.mul, sigmoid, identity)
    rnn = None
    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings, vocab,
                                  semiring, 0.1, False, rnn, None, False, 0, False, None, None)

    input_model = "./soft_patterns/output/model_9.pth"
    state_dict = torch.load(input_model, map_location=lambda storage, loc: storage)
    model.load_state_dict(state_dict)

    test_acc = evaluate_accuracy(model, dev_data, 1, False)
    print("Test accuracy: {:>8,.3f}%".format(100 * test_acc))

    return 0
def main(args):
    print(args)
    n = args.num_train_instances

    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab:", len(dev_vocab))
    train_vocab = vocab_from_text(args.td)
    print("Train vocab:", len(train_vocab))
    dev_vocab |= train_vocab

    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)

    num_padding_tokens = 1

    dev_input, dev_text = read_docs(args.vd, vocab, num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))
    np.random.shuffle(dev_data)

    train_input, _ = read_docs(args.td, vocab, num_padding_tokens=num_padding_tokens)
    train_labels = read_labels(args.tl)
    print("training instances:", len(train_input))

    num_classes = len(set(train_labels))

    # truncate data (to debug faster)
    train_data = list(zip(train_input, train_labels))
    np.random.shuffle(train_data)

    print("num_classes:", num_classes)

    if n is not None:
        train_data = train_data[:n]
        dev_data = dev_data[:n]

    dropout = None if args.td is None else args.dropout

    # TODO: GRU doesn't work yet
    cell_type = LSTM  # GRU if args.gru else LSTM

    model = AveragingRnnClassifier(args.hidden_dim, args.mlp_hidden_dim, args.num_mlp_layers, num_classes,
                                   embeddings, cell_type=cell_type, gpu=args.gpu)

    if args.gpu:
        model.to_cuda(model)

    model_file_prefix = 'model'
    # Loading model
    if args.input_model is not None:
        state_dict = torch.load(args.input_model)
        model.load_state_dict(state_dict)
        model_file_prefix = 'model_retrained'

    model_save_dir = args.model_save_dir

    if model_save_dir is not None:
        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)

    print("Training with", model_file_prefix)

    train(train_data, dev_data, model, num_classes, model_save_dir, args.num_iterations, model_file_prefix,
          args.learning_rate, args.batch_size, args.scheduler, gpu=args.gpu, clip=args.clip, debug=args.debug,
          dropout=dropout, word_dropout=args.word_dropout, patience=args.patience)
def main(args):
    print(args)

    pattern_specs = OrderedDict(
        sorted(([int(y) for y in x.split("-")] for x in args.patterns.split("_")), key=lambda t: t[0]))

    pre_computed_patterns = None

    if args.pre_computed_patterns is not None:
        pre_computed_patterns = read_patterns(args.pre_computed_patterns, pattern_specs)
        pattern_specs = OrderedDict(sorted(pattern_specs.items(), key=lambda t: t[0]))

    n = args.num_train_instances
    mlp_hidden_dim = args.mlp_hidden_dim
    num_mlp_layers = args.num_mlp_layers

    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab size:", len(dev_vocab))
    train_vocab = vocab_from_text(args.td)
    print("Train vocab size:", len(train_vocab))
    dev_vocab |= train_vocab

    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)

    num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, _ = read_docs(args.vd, vocab, num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))
    np.random.shuffle(dev_data)
    num_iterations = args.num_iterations

    train_input, _ = read_docs(args.td, vocab, num_padding_tokens=num_padding_tokens)
    train_labels = read_labels(args.tl)
    print("training instances:", len(train_input))

    num_classes = len(set(train_labels))

    # truncate data (to debug faster)
    train_data = list(zip(train_input, train_labels))
    np.random.shuffle(train_data)

    print("num_classes:", num_classes)

    if n is not None:
        train_data = train_data[:n]
        dev_data = dev_data[:n]

    if args.use_rnn:
        rnn = Rnn(word_dim, args.hidden_dim, cell_type=LSTM, gpu=args.gpu)
    else:
        rnn = None

    semiring = \
        MaxPlusSemiring if args.maxplus else (
            LogSpaceMaxTimesSemiring if args.maxtimes else ProbSemiring
        )

    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings, vocab,
                                  semiring, args.bias_scale_param, args.gpu, rnn, pre_computed_patterns, args.no_sl,
                                  args.shared_sl, args.no_eps, args.eps_scale, args.self_loop_scale)

    if args.gpu:
        model.to_cuda(model)

    model_file_prefix = 'model'
    # Loading model
    if args.input_model is not None:
        state_dict = torch.load(args.input_model)
        model.load_state_dict(state_dict)
        model_file_prefix = 'model_retrained'

    model_save_dir = args.model_save_dir

    if model_save_dir is not None:
        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)

    print("Training with", model_file_prefix)

    train(train_data, dev_data, model, num_classes, model_save_dir, num_iterations, model_file_prefix,
          args.learning_rate, args.batch_size, args.scheduler, args.gpu, args.clip, args.max_doc_len, args.debug,
          args.dropout, args.word_dropout, args.patience)

    return 0
args = parser.parse_args()

p = model.Predictor()
p.load_model(args.model_file)
print(p.word_emb_dim(), file=sys.stderr)
# l = p.model.get_layer("emb_word")
# print("EMB LAYER CONFIG", p.get_config()["batch_input_shape"])
try:
    word_emb_length, word_emb_dim = p.word_emb_dim()
    assert isinstance(word_emb_length, int) and word_emb_length > 2
except:
    # some older saved models don't have word_emb_dim()
    word_emb_length = p.model.get_layer("emb_word").get_config()["input_dim"]
    word_emb_dim = None
word_embeddings = data.read_embeddings(
    args.embeddings, word_emb_length - 2)  # -2 because two dimensions will be added
del word_embeddings.vectors  # we should never need these, we are only after the vocabulary here, really
# print(p.model.summary(), file=sys.stderr)
print("wordlen/model", p.word_seq_len(), file=sys.stderr)

if args.errstats:
    err = {}
else:
    err = None
correct = 0
total = 0
print("INPUTFILES:", args.inputfiles, file=sys.stderr)
if not args.inputfiles:
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Translate using a pre-trained model')
    parser.add_argument('--model', help='a model previously trained with train.py')
    parser.add_argument('--batch_size', type=int, default=50, help='the batch size (defaults to 50)')
    parser.add_argument('--beam_size', type=int, default=12,
                        help='the beam size (defaults to 12, 0 for greedy search)')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(), help='the input file (defaults to stdin)')
    parser.add_argument('-o', '--output', default=sys.stdout.fileno(), help='the output file (defaults to stdout)')
    parser.add_argument('--noise', type=float, default=0.5)
    parser.add_argument('--pass_att', action='store_true', default=False)
    parser.add_argument('--src_embeddings', default=None, help='common intersection source embeddings')
    parser.add_argument('--cutoff', type=int, default=None, help='cutoff for source embeddings above')
    parser.add_argument('--cat_embedds', help='use torch.load to load src and trg')
    parser.add_argument('--ncontrol', type=int, default=0, help='control number given while using the decoder')
    args = parser.parse_args()

    t = torch.load(args.model)
    # try:
    #     t = torch.load(args.model)
    # except Exception:
    #     # t = torch.load(args.model, map_location={'cuda:1': 'cuda:0'})
    #     t = torch.load(args.model, map_location={'cuda:3'})

    # Translate sentences
    end = False
    fin = open(args.input, encoding=args.encoding, errors='surrogateescape')
    fout = open(args.output, mode='w', encoding=args.encoding, errors='surrogateescape')

    if args.src_embeddings is not None:
        encoder_embeddings, src_dictionary = data.read_embeddings(open(args.src_embeddings, 'r'),
                                                                  threshold=args.cutoff)
        encoder_embeddings = gpu(encoder_embeddings)
        t.decoder_embeddings = gpu(t.decoder_embeddings)
        t.generator = gpu(t.generator)
        t.encoder = gpu(t.encoder)
        t.decoder = gpu(t.decoder)
        translator_new = Translator(encoder_embeddings, t.decoder_embeddings, t.generator, src_dictionary,
                                    t.trg_dictionary, t.encoder, t.decoder, t.denoising, t.device)
    else:
        t.device = torch.device('cuda')
        t.encoder = gpu(t.encoder)
        t.decoder = gpu(t.decoder)
        t.encoder_embeddings = gpu(t.encoder_embeddings)
        t.decoder_embeddings = gpu(t.decoder_embeddings)
        t.generator = gpu(t.generator)
        t.src_dictionary = data.Dictionary(t.src_dictionary.id2word[1:])
        t.trg_dictionary = data.Dictionary(t.trg_dictionary.id2word[1:])
        translator_new = Translator(t.encoder_embeddings, t.decoder_embeddings, t.generator, t.src_dictionary,
                                    t.trg_dictionary, t.encoder, t.decoder, t.denoising, t.device)

    # print(translator_new.denoising)
    # exit(0)
    while not end:
        batch = []
        while len(batch) < args.batch_size and not end:
            line = fin.readline()
            if not line:
                end = True
            else:
                batch.append(line)
        if args.beam_size <= 0 and len(batch) > 0:
            for translation in translator_new.greedy(batch, train=False):
                print(translation, file=fout)
        elif len(batch) > 0:
            # NOTE: beam_size is hard-coded to 12 here rather than taken from args.beam_size
            translations = translator_new.beam_search(batch, train=False, beam_size=12, max_ratio=2, rnk=6,
                                                      noiseratio=args.noise, pass_att=args.pass_att,
                                                      ncontrol=args.ncontrol if args.ncontrol != 0 else None)
            print(translations)
            if args.pass_att:
                for translation1, trans2 in translations:
                    print(translation1, trans2, file=fout)
            else:
                for translation in translations:
                    print(translation, file=fout)
        fout.flush()
    fin.close()
    fout.close()
def main(args):
    print(args)

    n = args.num_train_instances
    mlp_hidden_dim = args.mlp_hidden_dim
    num_mlp_layers = args.num_mlp_layers

    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab size:", len(dev_vocab))

    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)

    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    if args.dan or args.bilstm:
        num_padding_tokens = 1
    elif args.cnn:
        num_padding_tokens = args.window_size - 1
    else:
        pattern_specs = OrderedDict(sorted(([int(y) for y in x.split("-")] for x in args.patterns.split("_")),
                                           key=lambda t: t[0]))
        num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, dev_text = read_docs(args.vd, vocab, num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))
    if n is not None:
        dev_data = dev_data[:n]

    num_classes = len(set(dev_labels))
    print("num_classes:", num_classes)

    if args.dan:
        model = DanClassifier(mlp_hidden_dim, num_mlp_layers, num_classes, embeddings, args.gpu)
    elif args.bilstm:
        cell_type = LSTM
        model = AveragingRnnClassifier(args.hidden_dim, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings,
                                       cell_type=cell_type, gpu=args.gpu)
    elif args.cnn:
        model = PooledCnnClassifier(args.window_size, args.num_cnn_layers, args.cnn_hidden_dim, num_mlp_layers,
                                    mlp_hidden_dim, num_classes, embeddings, pooling=max_pool_seq, gpu=args.gpu)
    else:
        semiring = \
            MaxPlusSemiring if args.maxplus else (
                LogSpaceMaxTimesSemiring if args.maxtimes else ProbSemiring
            )
        if args.use_rnn:
            rnn = Rnn(word_dim, args.hidden_dim, cell_type=LSTM, gpu=args.gpu)
        else:
            rnn = None
        model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings, vocab,
                                      semiring, args.bias_scale_param, args.gpu, rnn, None, args.no_sl,
                                      args.shared_sl, args.no_eps, args.eps_scale, args.self_loop_scale)

    if args.gpu:
        state_dict = torch.load(args.input_model)
    else:
        state_dict = torch.load(args.input_model, map_location=lambda storage, loc: storage)

    model.load_state_dict(state_dict)

    if args.gpu:
        model.to_cuda(model)

    test_acc = evaluate_accuracy(model, dev_data, args.batch_size, args.gpu)
    print("Test accuracy: {:>8,.3f}%".format(100 * test_acc))

    return 0
# words index
# train_df['text'] = train_df['text'].apply(build_index)
words_index = get_words_index(train_df['text'], build=False, save=False)
reverse_words_index = {v: k for k, v in words_index.items()}
print(words_index)
print("-----")
print(reverse_words_index)
print(words_index.items())

glove_path = "/Users/Ricou/Desktop/ANDRE/machine_learning/tweet_sentiment_extraction/data/glove.twitter.27B.25d.txt"

# Embeddings
z = read_embeddings(glove_path, index=words_index, build=False, save=False)
print(z)

# sequences
sequences = build_sequences(train_df['text'], reverse_words_index)
print(np.max([len(s) for s in sequences]))
print(np.mean([len(s) for s in sequences]))

# Padding
T = 8
x = pad_sequences(sequences, maxlen=T, padding="post")

# Keras models
parser.add_argument('--classname', default="DocuClassifier", help='Name of class in model.py')
parser.add_argument('--word-seq-len', type=int, default=400, help='Word sequence length (defaults to 400)')
parser.add_argument('--model-file', help='file-name-prefix to save to')
# parser.add_argument('--like', help='train a new model like this one, pick parameters from the name')
args = parser.parse_args()

word_seq_len = args.word_seq_len
word_vec = data.read_embeddings(args.word_embeddings, args.maxrank_emb)
lemma_vec = data.read_embeddings(args.lemma_embeddings, args.maxrank_emb)

train_documents = split_data.read_docs(open(args.train_file))
dev_documents = split_data.read_docs(open(args.devel_file))
test_documents = split_data.read_docs(open(args.test_file))

train_labels = [item[0] for item in train_documents]
dev_labels = [item[0] for item in dev_documents]
test_labels = [item[0] for item in test_documents]

# train_labels_numeric is a dictionary with joint, aspect, sentiment as keys and the encoded labels as values
train_labels_numeric, label_encoders = data.vectorize_labels(
    train_labels, dict((k, None) for k in ("joint", "aspect", "sentiment")))
dev_labels_numeric, _ = data.vectorize_labels(dev_labels, label_encoders)
parser.add_argument('--train-file', help='.conllu')
parser.add_argument('--devel-file', help='.conllu')
parser.add_argument('--dicts-file', help='.json')
parser.add_argument('--embeddings', help='.vector or .bin')
parser.add_argument('--maxrank-emb', type=int, default=100000, help='Max rank of the embedding')
parser.add_argument('--classname', help='Name of class in model.py')
parser.add_argument(
    '--like', help='train a new model like this one, pick parameters from the name')
parser.add_argument('--model-file', help='file-name-prefix to save to')
args = parser.parse_args()

word_embeddings = data.read_embeddings(args.embeddings, args.maxrank_emb)

with open(args.train_file) as f:
    train_conllu = data.read_conll(f)
inputs_train_dict, outputs_train_dict, output_features = data.prep_data(
    train_conllu, args.dicts_file, word_embeddings.vocab, word_seq_len=None, shuffle=True)
word_seq_len = inputs_train_dict["inp_char_seq"].shape[1]

with open(args.devel_file) as f:
    devel_conllu = data.read_conll(f)
inputs_devel_dict, outputs_devel_dict, output_features_dev = data.prep_data(
    devel_conllu, args.dicts_file,
def main():
    patterns = "5-50_4-50_3-50_2-50"
    pattern_specs = OrderedDict(sorted(([int(y) for y in x.split("-")] for x in patterns.split("_")),
                                       key=lambda t: t[0]))
    pre_computed_patterns = None
    n = None
    mlp_hidden_dim = 25
    num_mlp_layers = 2

    seed = 100
    # Sets the seed for generating random numbers.
    torch.manual_seed(seed)
    # This method is called when RandomState is initialized.
    np.random.seed(seed)

    validation_data_file = "./soft_patterns/data/dev.data"
    dev_vocab = vocab_from_text(validation_data_file)
    # print(dev_vocab.index)
    print("Dev vocab size:", len(dev_vocab))
    # exit(0)

    train_data_file = "./soft_patterns/data/train.data"
    train_vocab = vocab_from_text(train_data_file)
    print("Train vocab size:", len(train_vocab))
    dev_vocab |= train_vocab

    embedding_file = './soft_patterns/glove.6B.50d.txt'
    vocab, embeddings, word_dim = read_embeddings(embedding_file, dev_vocab)

    num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, _ = read_docs(validation_data_file, vocab, num_padding_tokens=num_padding_tokens)
    validation_label_file = "./soft_patterns/data/dev.labels"
    dev_labels = read_labels(validation_label_file)
    dev_data = list(zip(dev_input, dev_labels))
    np.random.shuffle(dev_data)

    num_iterations = 10

    train_input, _ = read_docs(train_data_file, vocab, num_padding_tokens=num_padding_tokens)
    train_labels_file = "./soft_patterns/data/train.labels"
    train_labels = read_labels(train_labels_file)
    print("training instances:", len(train_input))

    num_classes = len(set(train_labels))

    train_data = list(zip(train_input, train_labels))
    np.random.shuffle(train_data)

    print("num_classes:", num_classes)

    rnn = None
    semiring = Semiring(zeros, ones, torch.add, torch.mul, sigmoid, identity)

    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings, vocab,
                                  semiring, 0.1, False, rnn, pre_computed_patterns, False, 0, False, None, None)

    model_file_prefix = "model"
    model_save_dir = "./soft_patterns/output/"

    print("Training with", model_file_prefix)
    train(train_data, dev_data, model, num_classes, model_save_dir, num_iterations, model_file_prefix,
          0.001, 1, False, False, None, -1, 0, 0, 0, 30)

    return 0