from hashlib import md5
import json

from flask import jsonify, request


def getSentences(document_id):
    document = get_document(document_id)
    sentences = []
    for sentence in document.sentences:
        sentences.append({
            "id": str(sentence.id),
            "source": sentence.source,
            "translation": sentence.translation,
            "beam": sentence.beam,
            "score": sentence.score,
            "attention": sentence.attention,
            "corrected": sentence.corrected,
            "flagged": sentence.flagged,
            "diff": sentence.diff if hasattr(sentence, "diff") else "",
        })
    # Compare the client's cached ETag against a hash of the fresh payload
    # and short-circuit with 304 Not Modified when they match.
    old_etag = request.headers.get('If-None-Match', '')
    data = json.dumps(sentences)
    new_etag = md5(data.encode("utf-8")).hexdigest()
    if old_etag == new_etag:
        return "", 304
    res = jsonify(sentences)
    res.headers["ETag"] = new_etag
    return res
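# A minimal client-side sketch of the conditional-request flow above.
# The base URL and route path are assumptions for illustration, not part
# of the original code: replaying the ETag via If-None-Match should get
# an empty 304 back when the document has not changed.
import requests

BASE = "http://localhost:5000"  # hypothetical host serving this API

first = requests.get(BASE + "/documents/42/sentences")
etag = first.headers["ETag"]
second = requests.get(BASE + "/documents/42/sentences",
                      headers={"If-None-Match": etag})
assert second.status_code == 304 and second.text == ""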
def split_and_write(file1, data1, file2, data2):
    # Append one cleaned sentence pair, one file per language side.
    with open(file1, "at") as f1, open(file2, "at") as f2:
        # z = match(data1, data2)
        # if z > 0.05:
        #     d1_count, d1_cleaned = clean_and_split(data1)
        #     d2_count, d2_cleaned = clean_and_split(data2)
        #     if d1_count == d2_count:
        #         f1.write(encode(d1_cleaned))
        #         f2.write(encode(d2_cleaned))
        #     else:
        #         f1.write(encode(clean(data1)))
        #         f2.write(encode(clean(data2)))
        # else:
        #     print("'{}' unlike '{}'\n".format(data1, data2))
        f1.write(encode(clean(data1)))
        f2.write(encode(clean(data2)))
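# Hedged usage sketch for split_and_write: `clean` and `encode` are
# helpers assumed to exist in this module (presumably text normalization
# and string encoding), and the file names below are made up. Each call
# appends one aligned sentence pair, one file per language side.
split_and_write("corpus.en", "Hello, world.",
                "corpus.de", "Hallo, Welt.")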
def write_saves_dat(self):
    import time
    try:
        buf = data.encode(self.root)
    except Exception as e:
        QtGui.QMessageBox.warning(self, 'MiasMod',
                '%s while encoding data\n\n%s\n\nRefusing to write saves.dat!'
                '\n\nThis means there is a bug in MiasMod, please report this'
                ' to DarkStarSword!' % (e.__class__.__name__, str(e)))
        return
    if not self.verify(buf):
        QtGui.QMessageBox.warning(self, 'MiasMod',
                'Verification pass failed, refusing to write saves.dat!'
                '\n\nThis means there is a bug in MiasMod, please report this'
                ' to DarkStarSword!')
        return
    try:
        timestamp_str = time.strftime('%Y%m%d%H%M%S')
        backup = '%s~%s' % (self.save_path, timestamp_str)
        os.rename(self.save_path, backup)
    except Exception as e:
        QtGui.QMessageBox.warning(self, 'MiasMod',
                '%s while backing up saves.dat\n\n%s\n\nRefusing to write'
                ' saves.dat!' % (e.__class__.__name__, str(e)))
        return
    try:
        with open(self.save_path, 'wb') as f:
            f.write(buf)
    except Exception as e:
        QtGui.QMessageBox.warning(self, 'MiasMod',
                '%s while writing saves.dat\n\n%s\n\nWill attempt to restore'
                ' backup %s...' % (e.__class__.__name__, str(e), backup))
        try:
            os.remove(self.save_path)
        except OSError:
            pass # May just not have been created yet
        try:
            os.rename(backup, self.save_path)
            QtGui.QMessageBox.information(self, 'MiasMod',
                    'Successfully restored backup')
        except Exception as restore_error:
            QtGui.QMessageBox.warning(self, 'MiasMod',
                    '%s while restoring %s\n\n%s'
                    % (restore_error.__class__.__name__, backup,
                       str(restore_error)))
        return
    return True
def main(argv):
    hlog.flags()
    random.seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    torch.manual_seed(FLAGS.seed)

    # input_symbols_list = set(['red', 'yellow', 'green', 'blue', 'purple',
    #                           'pink', 'around', 'thrice', 'after'])
    input_symbols_list = set([
        'dax', 'lug', 'wif', 'zup', 'fep', 'blicket', 'kiki', 'tufa', 'gazzer'
    ])
    output_symbols_list = set(
        ['RED', 'YELLOW', 'GREEN', 'BLUE', 'PURPLE', 'PINK'])

    study, test = generate_fig2_exp(input_symbols_list, output_symbols_list)

    vocab_x = Vocab()
    vocab_y = Vocab()
    if FLAGS.full_data:
        for sym in input_symbols_list:
            vocab_x.add(sym)
        for sym in output_symbols_list:
            vocab_y.add(sym)
        max_len_x = 7
        max_len_y = 9
    else:
        test, study = study[3:4], study[0:3]
        for (x, y) in test + study:
            for sym in x:
                vocab_x.add(sym)
            for sym in y:
                vocab_y.add(sym)
        max_len_x = 2
        max_len_y = 2

    hlog.value("vocab_x", vocab_x)
    hlog.value("vocab_y", vocab_y)
    hlog.value("study", study)
    hlog.value("test", test)

    train_items, test_items = (encode(study, vocab_x, vocab_y),
                               encode(test, vocab_x, vocab_y))
    # outlist = list(output_symbols_list)

    oracle_py = Oracle(train_items, test_items, DEVICE, dist="py",
                       vocab_x=vocab_x, vocab_y=vocab_y)
    oracle_px = Oracle(train_items, test_items, DEVICE, dist="px",
                       vocab_x=vocab_x, vocab_y=vocab_y)
    oracle_qxy = Oracle(train_items, test_items, DEVICE, dist="qxy",
                        vocab_x=vocab_x, vocab_y=vocab_y)

    model = Mutex(
        vocab_x,
        vocab_y,
        FLAGS.dim,
        FLAGS.dim,
        oracle_py,
        max_len_x=max_len_x,
        max_len_y=max_len_y,
        copy=False,
        n_layers=FLAGS.n_layers,
        self_att=False,
        dropout=FLAGS.dropout,
        lamda=FLAGS.lamda,
        kl_lamda=FLAGS.kl_lamda,
        Nsample=FLAGS.Nsample,
        temp=FLAGS.temp,
        regularize=FLAGS.regularize,
        ent=FLAGS.ent,
    ).to(DEVICE)

    if FLAGS.regularize and not isinstance(model.px, Oracle):
        with hlog.task("pretrain px"):
            pretrain(model.px, train_items + test_items, test_items)
        for p in model.px.parameters():
            p.requires_grad = False

    with hlog.task("Initial Samples"):
        hlog.value("px samples", "\n".join(model.sample_px(20)))
        hlog.value("py samples", "\n".join(model.sample_py(20)))
        hlog.value("qxy debug samples",
                   "\n".join(model.sample_qxy_debug(N=20)))
        hlog.value(
            "qxy debug data",
            "\n".join(model.sample_qxy_debug_data(train_items + test_items)))
    # hlog.value("qxy samples", "\n".join(model.sample_qxy(model.py.sample(20, max_len), temp=model.temp)))
    # hlog.value("qxy samples (gumbel)", "\n".join(model.sample_qxy_gumbel(model.py.sample(20, max_len), temp=model.temp)))

    # if not isinstance(model.qxy, Oracle):
    #     train(model.qxy, swap_io(train_items) + swap_io(test_items),
    #           swap_io(test_items))
    # if not isinstance(model.pyx, Oracle):
    #     train(model.pyx, train_items + test_items, test_items)
    #     for param in model.pyx.parameters():
    #         param.requires_grad = False

    with hlog.task("train model"):
        acc, f1 = train(model, train_items, test_items)

    with hlog.task("Final Samples"):
        hlog.value("px samples", "\n".join(model.sample_px(20)))
        hlog.value("py samples", "\n".join(model.sample_py(20)))
        hlog.value("qxy debug samples",
                   "\n".join(model.sample_qxy_debug(N=20)))
        hlog.value(
            "qxy debug data",
            "\n".join(model.sample_qxy_debug_data(train_items + test_items)))
        hlog.value(
            "qxy samples (gumbel)",
            "\n".join(
                model.sample_qxy_gumbel(model.py.sample(20, max_len_y),
                                        temp=model.temp)))
        # hlog.value("qxy samples", "\n".join(model.sample_qxy(model.py.sample(20, max_len), temp=model.temp)))

    if FLAGS.regularize:
        losses = pd.DataFrame(model.loss_container)
        figure = sns.lineplot(data=losses, dashes=False).figure
        figure.savefig(f"{FLAGS.seed}_plot.png")

    with hlog.task("train evaluation"):
        validate(model, train_items, vis=True)

    with hlog.task("test evaluation"):
        validate(model, test_items, vis=True)
print("Validation data shape: {}".format(validation_df.shape))

# Augment training data
train_df = data.augment_data(train_df, test_df,
                             use_xnli=args.load_xnli,
                             use_mnli=args.load_mnli,
                             use_bt=args.back_translate,
                             bt_filepath=args.bt_file)

# Define the tokenizer to preprocess the input data
tokenizer = data.define_tokenizer(args.model_name)

# Batch encode input training data
train_input = data.encode(train_df, tokenizer,
                          max_len=args.max_sequence_length)
input_word_ids = train_input['input_word_ids']
input_mask = train_input['input_mask']
labels = train_input['labels']
print("Training input shape: input_word_ids=>{}, input_mask=>{}, labels=>{}"
      .format(input_word_ids.shape, input_mask.shape, labels.shape))

# Batch encode input validation data
validation_input = data.encode(validation_df, tokenizer,
                               max_len=args.max_sequence_length)
validation_word_ids = validation_input['input_word_ids']
validation_mask = validation_input['input_mask']
validation_labels = validation_input['labels']
def send_data(self, data):
    # Send the encoded request and block for the server's reply.
    self.soc.send(data.encode())
    resp = self.soc.recv(self._data_.get_buffer_len())
    print("[NETWORK]\tResponse received")
    return resp.decode()
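# A rough sketch of the round trip send_data performs, written against a
# raw socket; the host, port, payload, and 4096-byte receive size are
# assumptions standing in for self.soc and self._data_.get_buffer_len().
import socket

soc = socket.create_connection(("localhost", 9000))
soc.send("PING".encode())
print(soc.recv(4096).decode())
soc.close()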
import hashlib
import itertools
import os

# checkpoint = './checkpoints/WT2.pt'
checkpoint = '../models/BNC.18hr.QRNN.pt'
data = 'data/bnc'

torch.cuda.set_device(3)
device = torch.device(3)
torch.manual_seed(1234)

with open(checkpoint, 'rb') as f:
    model, criterion, _ = torch.load(f, map_location=device)

# Cache the preprocessed corpus under a filename keyed on the data path.
fn = 'corpus.{}.data'.format(hashlib.md5(data.encode()).hexdigest())
if os.path.exists(fn):
    print('loading cached dataset...')
    corpus = torch.load(fn)
else:
    print('producing dataset...')
    corpus = Corpus(data)  # Corpus is assumed in scope (e.g. the repo's data module)
    torch.save(corpus, fn)
dictionary = corpus.dictionary


def tokenize_sent(sent):
    return torch.LongTensor([dictionary.word2idx[x] for x in sent]).cuda()
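# Quick sanity check for tokenize_sent. The example words are assumed to
# be in the corpus dictionary; an out-of-vocabulary word raises KeyError.
ids = tokenize_sent(['the', 'cat', 'sat'])
print(ids.shape)  # torch.Size([3]), one id per input word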
import os

import numpy as np
import pandas as pd

from data import setup, encode, onehotencode
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=[0, 1])

__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))

# Data set, now combined into one file
df = pd.read_csv(os.path.join(__location__, 'adult.data'), header=None)

# We have no header, so our columns are indexes
cols_to_encode = [1, 3, 5, 6, 7, 8, 9, 13]  # & 14 as label encode
cols_to_scale = [0, 2, 4, 10, 11, 12]
df = pd.get_dummies(df, columns=cols_to_encode)
df, labels = encode(df, [14])

# Scale the continuous values
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

# Code to force balance among target class
# print('Balancing Adult data set... please wait')
# greater_than_50k = np.where(df[14] == 1)[0]  # 11K
# less_than_50k = np.where(df[14] == 0)[0]
# lt_samples = np.random.choice(less_than_50k, greater_than_50k.shape[0])  # match the less than 50k to greater than 50k in size
# balanced = []
# for index in greater_than_50k:
#     balanced.append(df.iloc[index])
# for index in lt_samples:
#     balanced.append(df.iloc[index])
# balanced = pd.DataFrame(data=balanced, columns=df.columns)
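# Illustration of what fit_transform does to the continuous columns: a
# toy single-column frame (not the Adult data) is mapped linearly onto
# [0, 1], so the minimum becomes 0 and the maximum becomes 1. A fresh
# scaler is used here so the one fitted above is left untouched.
toy = pd.DataFrame({0: [20.0, 40.0, 60.0]})
print(MinMaxScaler(feature_range=[0, 1]).fit_transform(toy))
# [[0. ]
#  [0.5]
#  [1. ]]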
import numpy as np

max_no = 10

model = Sequential()
model.add(RNN(100, input_shape=(seq_len, max_no)))
model.add(Dropout(0.25))
model.add(RepeatVector(seq_len))
model.add(RNN(100, return_sequences=True))
model.add(TimeDistributedDense(max_no))
model.add(Dropout(0.5))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

for ind, (X, Y) in enumerate(batch_gen(batch_size, seq_len, max_no)):
    loss, acc = model.train_on_batch(X, Y)
    if ind % 250 == 0:
        testX = np.random.randint(max_no, size=(1, seq_len))
        test = encode(testX, seq_len, max_no)
        print(testX)
        # pdb.set_trace()
        y = model.predict(test, batch_size=1)
        print("actual sorted output is")
        print(np.sort(testX))
        print("sorting done by RNN is")
        print(np.argmax(y, axis=2))
        print("\n")
        # print(loss, acc)
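# A plausible sketch of the `encode` helper used above (an assumption,
# not the original implementation): it one-hot encodes each integer into
# a max_no-wide vector, since the network expects inputs of shape
# (batch, seq_len, max_no).
def encode_sketch(x, seq_len, max_no):
    # np.eye row-indexing turns each id into its one-hot row.
    return np.eye(max_no)[x.reshape(-1)].reshape(x.shape[0], seq_len, max_no)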
## Transform the documents using the vocabulary.
en_x = np.array(list(en_vocab_processor.fit_transform(en_text)))
sh_x = np.array(list(sh_vocab_processor.fit_transform(sh_text)))

## Extract the word:id mapping from the object.
en_vocab_dict = en_vocab_processor.vocabulary_._mapping
sh_vocab_dict = sh_vocab_processor.vocabulary_._mapping

## Sort the vocabulary dictionary on the basis of values (id).
## Both statements perform the same task.
# sorted_vocab = sorted(vocab_dict.items(), key=operator.itemgetter(1))
en_sorted_vocab = sorted(en_vocab_dict.items(), key=lambda x: x[1])
sh_sorted_vocab = sorted(sh_vocab_dict.items(), key=lambda x: x[1])

## Treat the ids as indexes into a list and create a list of words in
## ascending order of id: the word with id i goes at index i of the list.
en_vocabulary = list(list(zip(*en_sorted_vocab))[0])
sh_vocabulary = list(list(zip(*sh_sorted_vocab))[0])

print("Vocabulary : ")
print(en_vocabulary)
with open("vocab2.en", 'wt') as o:
    for w in en_vocabulary:
        o.write(encode(w))
print(sh_vocabulary)

print("Transformed documents : ")
print(en_x)
print(sh_x)