def main():
    # prepare vocab
    if not (os.path.exists(conf.vocab_file) and
            os.path.getsize(conf.vocab_file)):
        logger.info(("word dictionary does not exist, "
                     "build it from the training data"))
        build_dict(conf.train_file, conf.vocab_file, conf.max_word_num,
                   conf.cutoff_word_fre)
    logger.info("load word dictionary.")
    word_dict = load_dict(conf.vocab_file)
    logger.info("dictionary size = %d" % (len(word_dict)))

    cost = rnn_lm(len(word_dict), conf.emb_dim, conf.hidden_size,
                  conf.stacked_rnn_num, conf.rnn_type)

    # define reader
    reader_args = {
        "file_name": conf.train_file,
        "word_dict": word_dict,
    }
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.rnn_reader(**reader_args), buf_size=102400),
        batch_size=conf.batch_size)
    test_reader = None
    if os.path.exists(conf.test_file) and os.path.getsize(conf.test_file):
        test_reader = paddle.batch(
            paddle.reader.shuffle(
                reader.rnn_reader(**reader_args), buf_size=65536),
            batch_size=conf.batch_size)

    train(topology=cost,
          train_reader=train_reader,
          test_reader=test_reader,
          model_save_dir=conf.model_save_dir,
          num_passes=conf.num_passes)
def main(args): train_en, train_cn = utils.load_data(args.train_file) dev_en, dev_cn = utils.load_data(args.dev_file) args.num_train = len(train_en) args.num_dev = len(dev_en) # code.interact(local=locals()) if os.path.isfile(args.vocab_file): en_dict, cn_dict, en_total_words, cn_total_words = pickle.load( open(args.vocab_file, "rb")) else: en_dict, en_total_words = utils.build_dict(train_en) cn_dict, cn_total_words = utils.build_dict(train_cn) pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words], open(args.vocab_file, "wb")) args.en_total_words = en_total_words args.cn_total_words = cn_total_words inv_en_dict = {v: k for k, v in en_dict.items()} inv_cn_dict = {v: k for k, v in cn_dict.items()} train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict) train_data = utils.gen_examples(train_en, train_cn, args.batch_size) dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict) dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size) code.interact(local=locals())
def main(args):
    # preprocessing: word (en, cn) -> number (one-hot vector)
    # load sentences (English and Chinese)
    train_en, train_cn = utils.load_data(args.train_file)
    dev_en, dev_cn = utils.load_data(args.dev_file)
    args.num_train = len(train_en)
    args.num_dev = len(dev_en)

    en_dict, en_total_words = utils.build_dict(train_en)
    cn_dict, cn_total_words = utils.build_dict(train_cn)
    inv_en_dict = {v: k for k, v in en_dict.items()}
    inv_cn_dict = {v: k for k, v in cn_dict.items()}
    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words

    # encode the words into numbers
    train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
    dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)

    # convert the train and dev data into numpy matrices
    # batch_size * seq_length
    train_data = utils.gen_examples(train_en, train_cn, args.batch_size)
    dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)

    model = models.EncoderDecoderModel()
    crit = utils.LanguageModelCriterion()
    learning_rate = args.learning_rate
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(args.num_epochs):
        for idx, (mb_x, mb_x_mask, mb_y, mb_y_mask) in enumerate(train_data):
            # convert numpy ndarrays to PyTorch tensors wrapped in Variables
            batch_size = mb_x.shape[0]
            mb_x = Variable(torch.from_numpy(mb_x)).long()
            mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()
            hidden = model.init_hidden(batch_size)
            mb_input = Variable(torch.from_numpy(mb_y[:, :-1])).long()
            mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:])).long()
            mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)
            # calculate the loss
            loss = crit(mb_pred, mb_out, mb_out_mask)
            # update the model
            optimizer.zero_grad()  # zero the previous gradients
            loss.backward()        # compute gradients
            optimizer.step()       # gradient descent step
def main():
    tweets, emojis = utils.load_data(max_example=100)
    word_dict = utils.build_dict(tweets)
    # embeddings = utils.generate_embeddings(word_dict, dim=50, pretrained_path='data/glove.twitter.27B.50d.txt')
    embeddings = utils.generate_embeddings(word_dict, dim=50,
                                           pretrained_path=None)
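# Note: every snippet in this collection calls a project-local `build_dict`
# helper whose source is not included here, and the exact signature and return
# value differ between projects (some return only a word->index mapping,
# others also return a vocabulary size). Purely as an illustrative sketch of
# the common idea -- a frequency-ranked vocabulary -- and not the actual
# implementation used by any of these repositories:
from collections import Counter


def build_dict_sketch(sentences, max_words=50000):
    """Map the most frequent tokens to indices, reserving 0 for padding/<unk>."""
    counter = Counter(tok for sent in sentences for tok in sent.split())
    word_dict = {word: idx + 1
                 for idx, (word, _) in enumerate(counter.most_common(max_words))}
    return word_dict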
def transform_data(self): self.train_facts = add_reverse_relations(self.train_facts) self.entity_dict, self.relation_dict = build_dict( itertools.chain(self.train_facts, add_reverse_relations(self.test_facts), add_reverse_relations(self.valid_facts)), entity_dict=self.entity_dict, relation_dict=self.relation_dict) self.id2entity = sorted(self.entity_dict.keys(), key=self.entity_dict.get) self.id2relation = sorted(self.relation_dict.keys(), key=self.relation_dict.get) self.train_facts = translate_facts(self.train_facts, entity_dict=self.entity_dict, relation_dict=self.relation_dict) self.valid_facts = translate_facts(self.valid_facts, entity_dict=self.entity_dict, relation_dict=self.relation_dict) self.test_facts = translate_facts(self.test_facts, entity_dict=self.entity_dict, relation_dict=self.relation_dict) if self.rel2candidate: self.rel2candidate = { self.relation_dict[key]: list(map(self.entity_dict.get, value)) for key, value in self.rel2candidate.items() if key in self.relation_dict } else: relations = set(map(lambda x: x[1], self.valid_facts)) | set( map(lambda x: x[1], self.test_facts)) self.rel2candidate = { key: list(range(len(self.entity_dict))) for key in relations }
def train(self, dataset_train, labels, word_frequency=15, document_frequency=5):
    start = time.time()
    print("--------------------------------------------")
    print("%s Train Start" % self.name)
    self.labels = labels
    self.global_dict = utils.build_dict(dataset_train,
                                        word_frequency=word_frequency,
                                        document_frequency=document_frequency)
    self.num = len(self.global_dict)
    train_count = len(dataset_train)
    self.labels_word_total = {}
    self.labels_word_freq = {}
    self.labels_word_num = {}
    # compute the prior probability of each document class
    for label in labels:
        self.labels_p[label] = len(utils.GetFileLists(
            os.path.join(train_path, label))) * 1.0 / train_count
    # count word frequencies within each document class
    for name, data in dataset_train.items():
        label = name.split("/")[-2]
        if self.labels_word_freq.get(label) is None:
            self.labels_word_freq[label] = {}
        if self.labels_word_total.get(label) is None:
            self.labels_word_total[label] = 0
            self.labels_word_num[label] = 0
        for word, count in data.items():
            if self.global_dict.get(word) is None:
                continue
            if self.labels_word_freq[label].get(word) is None:
                self.labels_word_freq[label][word] = 1
            else:
                self.labels_word_freq[label][word] += 1
            self.labels_word_total[label] += 1
    # compute the Laplace-smoothed probability of each word per class
    for label, data in self.labels_word_freq.items():
        if self.labels_word_p.get(label) is None:
            self.labels_word_p[label] = {}
        for word, count in data.items():
            if self.global_dict.get(word) is not None:
                self.labels_word_p[label][word] = \
                    (count * 1.0 + 1) / (self.labels_word_total[label] + self.num)
    stop = time.time()
    print("%s Train finished, cost %fs" % (self.name, stop - start))
def main():
    # prepare vocab
    if not (os.path.exists(config.dic_path) and
            os.path.getsize(config.dic_path)):
        logger.info(("word dictionary does not exist, "
                     "build it from the training data"))
        build_dict(config.train_data_path, config.dic_path,
                   config.max_word_num, config.cutoff_word_fre)
    logger.info("load word dictionary.")
    word_dict = load_dict(config.dic_path)
    logger.info("dictionary size = %d" % (len(word_dict)))

    train(train_data_path=config.train_data_path,
          test_data_path=config.test_data_path,
          word_dict=word_dict,
          batch_size=config.batch_size,
          num_passes=config.num_passes,
          share_semantic_generator=config.share_semantic_generator,
          share_embed=config.share_embed,
          num_workers=config.num_workers,
          use_gpu=config.use_gpu)
def init():
    path = config.data_path
    config.embedding_file = os.path.join(path, config.embedding_file)
    config.embedding_vocab = os.path.join(path, config.embedding_vocab)
    config.train_file = os.path.join(path, config.train_file)
    config.test_file = os.path.join(path, config.test_file)

    # Config log
    if config.log_file is None:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(message)s',
                            datefmt='%m-%d %H:%M')
    else:
        if not os.path.exists(config.save_path):
            os.makedirs(config.save_path)
        logging.basicConfig(filename=config.log_file,
                            filemode='a',
                            level=logging.DEBUG,
                            format='%(asctime)s %(message)s',
                            datefmt='%m-%d %H:%M')

    # Load data
    # data = (sentences, relations, e1_pos, e2_pos)
    train_data = utils.load_data(config.train_file)
    test_data = utils.load_data(config.test_file)

    logging.info('train data: %d' % len(train_data[0]))
    logging.info('test data: %d' % len(test_data[0]))

    # Build vocab
    word_dict = utils.build_dict(train_data[0] + test_data[0])
    logging.info('total words: %d' % len(word_dict))

    embeddings = utils.load_embedding(config, word_dict)

    # Log parameters
    flags = config.__dict__['__flags']
    flag_str = "\n"
    for k in flags:
        flag_str += "\t%s:\t%s\n" % (k, flags[k])
    logging.info(flag_str)

    # vectorize data
    # vec = (sents_vec, relations, e1_vec, e2_vec, dist1, dist2)
    max_len_train = len(max(train_data[0], key=lambda x: len(x)))
    max_len_test = len(max(test_data[0], key=lambda x: len(x)))
    max_len = max(max_len_train, max_len_test)
    config.max_len = max_len

    train_vec = utils.vectorize(train_data, word_dict, max_len)
    test_vec = utils.vectorize(test_data, word_dict, max_len)

    return embeddings, train_vec, test_vec
def handler(req, args):
    session_id = utils.get_cookie(req, "session").strip()
    if session_id == "":
        logging.warning('Unauthorized attempt to access %s from %s' %
                        (req.the_request, req.connection.remote_ip))
        return {"Location": "login.html",
                "error_msg": "Authorization required!"}
    con = MySQLdb.connect(
        host=settings.database_settings["host"],
        user=settings.database_settings["login"],
        passwd=settings.database_settings["password"],
        db=settings.database_settings["database"])
    cur = con.cursor()
    try:
        expired, user_id = utils.is_expired(cur, session_id)
        if expired:
            return {"Location": "login.html",
                    "error_msg": "Your session has expired. Please log in"}
        if not args.has_key("id"):
            return {"Location": "editnews.html"}
        id = args["id"].strip()
        try:
            id = int(id)
        except ValueError:
            return {"Location": "editnews.html"}
        if not preprocess.input_matches(req, args, expected_args):
            cur.execute("""SELECT title, text FROM news WHERE id=%s""", id)
            row = cur.fetchone()
            if row is None:
                return {"Location": "editnews.html"}
            title = row[0]
            text = row[1]
            return postprocess.fill_page(
                template_path, "", "",
                utils.build_dict(expected_args, [id, title, text]))
        title = args["title"].strip()
        text = args["body"]
        xss_strip = xss.XssCleaner()
        title = xss_strip.strip(title)
        text = xss_strip.strip(text)
        cur.execute("""UPDATE news SET title=%s, text=%s WHERE id=%s""",
                    (title, text, id))
    finally:
        con.commit()
        cur.close()
        con.close()
    return {"Location": "editnews.html",
            "notice_msg": "Post saved successfully"}
def handler(req, args):
    if not preprocess.input_matches(req, args, expected_args):
        return postprocess.fill_page(template_path, "", "",
                                     utils.build_dict(expected_args, []))
    expresion = args["expresion"].strip()
    if expresion == "":
        return postprocess.fill_page(template_path, "",
                                     "The expression is empty", args)
    try:
        contents = str(eval(expresion))
    except ZeroDivisionError:
        return postprocess.fill_page(template_path, "",
                                     "Division by Zero", args)
    except (ValueError, OverflowError):
        return postprocess.fill_page(
            template_path, "",
            "Some function in expression does not support specified domain",
            args)
def get(self): article = RequestHandler.get_argument(self, name='article') print('-----------------enter get...') article = translator.translate(article, src='auto', dest='en').text.lower().replace('.', ' .').replace(',', ' ,') try: print('---article_cn in get:', article) print('---article_en:', article) except Exception as e: print(str(e)) pass print("Loading dictionary...") word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("valid", args.toy) valid_x, valid_y = build_deploy(article, word_dict, article_max_len, summary_max_len) valid_x_len = list(map(lambda x: len([y for y in x if y != 0]), valid_x)) batches = batch_iter(valid_x, valid_y, args.batch_size, 1) print("Start auto summarization...") for batch_x, batch_y in batches: batch_x_len = list(map(lambda x: len([y for y in x if y != 0]), batch_x)) valid_feed_dict = { model.batch_size: len(batch_x), model.X: batch_x, model.X_len: batch_x_len, } t0 = time.time() prediction = sess.run(model.prediction, feed_dict=valid_feed_dict) prediction_output = list(map(lambda x: [reversed_dict[y] for y in x], prediction[:, 0, :])) print('inference time:', str(time.time() - t0) + 's') line = prediction_output[0] summary = list() for word in line: if word == "</s>": break if word not in summary: summary.append(word) title_pred = " ".join(summary) print('title_pred:', title_pred) title_cn = translator.translate(title_pred, src='auto', dest='zh-cn').text # print('title_cn:', title_cn) self.write(str(title_cn) + '\n')
def main(args): args = config.get_args() print(args) entity_type=args.entity_type embeddings=args.embeddings split=False if split: # RUN THIS LINE TO CREATE THE SPLITS TO TRAIN-DEV-TEST train_file, outdir = create_data(entity_type, embeddings, args.experiment) else: # RUN THIS LINE IF YOU HAVE ALREADY DONE THE SPLITTING AND WANT TO ONLY CREATE EMBEDDINGS train_file='../data/%s/train.txt' % entity_type outdir='../data/%s/' % entity_type logging.info('-' * 50) logging.info('Load data files..') logging.info('*' * 10 + ' Train') train_examples = utils.load_data(train_file, embeddings) logging.info('-' * 50) logging.info('Build dictionary..') word_dicts, inv_word_dicts = utils.build_dict(train_examples, 3000) num_attr = len(inv_word_dicts) d_abs=1 for i in inv_word_dicts: print(len(i)) d_abs*=len(i) print("d_abs = %s" % "{:.2E}".format(Decimal(d_abs))) print("n_ex = %d" % len(train_examples)) print("d_avgd = %s" % "{:.2E}".format(Decimal(d_abs/len(train_examples)))) entropy = utils.compute_avg_entropy(train_examples, word_dicts) print("Entropy = %f" % entropy) pickle.dump(word_dicts, open('%s/train_dicts.pickle' % outdir, 'wb')) pickle.dump(inv_word_dicts, open('%s/train_inv_dicts.pickle' % outdir, 'wb'))
def data_loader(args):
    # get the lists of raw sentences and their labels
    train_data, train_labels = utils.get_raw_data(args.train_file)
    val_data, val_labels = utils.get_raw_data(args.dev_file)
    args.catogories = ['EnterSports', 'Military', 'Economics',
                       'Technology', 'Government']
    args.cat_dict = dict(zip(args.catogories, range(len(args.catogories))))
    word_vocab, num_total_words = utils.build_dict(train_data)
    trainlabels_to_idx = [args.cat_dict[label] for label in train_labels]
    vallabels_to_idx = [args.cat_dict[label] for label in val_labels]
    train_data, train_labels = utils.encode(train_data, trainlabels_to_idx,
                                            word_vocab)
    val_data, val_labels = utils.encode(val_data, vallabels_to_idx, word_vocab)
    train_data = utils.pad_features(train_data, max_len=args.max_features)
    val_data = utils.pad_features(val_data, max_len=args.max_features)
    train_set = utils.batch(train_data.copy(), train_labels.copy(),
                            args.batch_size)
    val_set = utils.batch(val_data.copy(), val_labels.copy(), args.batch_size)
    return train_set, val_set, num_total_words
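# `utils.pad_features` in the snippet above is not shown; it is assumed to
# truncate or zero-pad each encoded sentence to a fixed length so a batch can
# be stacked into a matrix. A minimal sketch under that assumption
# (hypothetical helper, not the project's actual code):
import numpy as np


def pad_features_sketch(sequences, max_len, pad_id=0):
    """Return an int64 matrix of shape (len(sequences), max_len)."""
    padded = np.full((len(sequences), max_len), pad_id, dtype=np.int64)
    for row, seq in enumerate(sequences):
        trimmed = seq[:max_len]
        padded[row, :len(trimmed)] = trimmed
    return padded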
def handler(req, args):
    session_id = utils.get_cookie(req, "session").strip()
    if session_id == "":
        logging.warning('Unauthorized attempt to access %s from %s' %
                        (req.the_request, req.connection.remote_ip))
        return {"Location": "login.html",
                "error_msg": "Authorization required!"}
    con = MySQLdb.connect(
        host=settings.database_settings["host"],
        user=settings.database_settings["login"],
        passwd=settings.database_settings["password"],
        db=settings.database_settings["database"])
    cur = con.cursor()
    try:
        expired, user_id = utils.is_expired(cur, session_id)
        if expired:
            return {"Location": "login.html",
                    "error_msg": "Your session has expired. Please log in"}
        if not preprocess.input_matches(req, args, expected_args):
            return postprocess.fill_page(template_path, "", "",
                                         utils.build_dict(expected_args, []))
        title = args["title"].strip()
        text = args["body"]
        xss_strip = xss.XssCleaner()
        title = xss_strip.strip(title)
        text = xss_strip.strip(text)
        cur.execute("""INSERT INTO news (date, title, author, text)
                       VALUES (now(), %s, %s, %s)""", (title, user_id, text))
    finally:
        con.commit()
        cur.close()
        con.close()
    return {"Location": "news.html",
            "notice_msg": "Post added successfully"}
def main(args): logging.info('-' * 50) logging.info('Load data files..') if args.debug: logging.info('*' * 10 + ' Train') train_examples = utils.load_data(args.train_file, 100, relabeling=args.relabeling) logging.info('*' * 10 + ' Dev') dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling) else: logging.info('*' * 10 + ' Train') train_examples = utils.load_data(args.train_file, relabeling=args.relabeling) logging.info('*' * 10 + ' Dev') dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling) args.num_train = len(train_examples[0]) args.num_dev = len(dev_examples[0]) logging.info('-' * 50) logging.info('Build dictionary..') word_dict = utils.build_dict(train_examples[0] + train_examples[1]) entity_markers = list( set([w for w in word_dict.keys() if w.startswith('@entity')] + train_examples[2])) entity_markers = ['<unk_entity>'] + entity_markers entity_dict = {w: index for (index, w) in enumerate(entity_markers)} logging.info('Entity markers: %d' % len(entity_dict)) args.num_labels = len(entity_dict) logging.info('-' * 50) # Load embedding file embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file) (args.vocab_size, args.embedding_size) = embeddings.shape logging.info('Compile functions..') train_fn, test_fn, params = build_fn(args, embeddings) logging.info('Done.') if args.prepare_model: return train_fn, test_fn, params logging.info('-' * 50) logging.info(args) logging.info('-' * 50) logging.info('Intial test..') dev_x1, dev_x2, dev_l, dev_y = utils.vectorize(dev_examples, word_dict, entity_dict) assert len(dev_x1) == args.num_dev all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size) dev_acc = eval_acc(test_fn, all_dev) logging.info('Dev accuracy: %.2f %%' % dev_acc) best_acc = dev_acc if args.test_only: return utils.save_params(args.model_file, params, epoch=0, n_updates=0) # Training logging.info('-' * 50) logging.info('Start training..') train_x1, train_x2, train_l, train_y = utils.vectorize( train_examples, word_dict, entity_dict) assert len(train_x1) == args.num_train start_time = time.time() n_updates = 0 all_train = gen_examples(train_x1, train_x2, train_l, train_y, args.batch_size) for epoch in range(args.num_epoches): np.random.shuffle(all_train) for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) in enumerate(all_train): logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1])) train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) logging.info( 'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' % (epoch, idx, len(all_train), train_loss, time.time() - start_time)) n_updates += 1 if n_updates % args.eval_iter == 0: samples = sorted( np.random.choice(args.num_train, min(args.num_train, args.num_dev), replace=False)) sample_train = gen_examples([train_x1[k] for k in samples], [train_x2[k] for k in samples], train_l[samples], [train_y[k] for k in samples], args.batch_size) logging.info('Train accuracy: %.2f %%' % eval_acc(test_fn, sample_train)) dev_acc = eval_acc(test_fn, all_dev) logging.info('Dev accuracy: %.2f %%' % dev_acc) if dev_acc > best_acc: best_acc = dev_acc logging.info( 'Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%' % (epoch, n_updates, dev_acc)) utils.save_params(args.model_file, params, epoch=epoch, n_updates=n_updates)
def main(args): train_en, train_cn = utils.load_data(args.train_file) dev_en, dev_cn = utils.load_data(args.dev_file) args.num_train = len(train_en) args.num_dev = len(dev_en) # code.interact(local=locals()) if os.path.isfile(args.vocab_file): en_dict, cn_dict, en_total_words, cn_total_words = pickle.load( open(args.vocab_file, "rb")) else: en_dict, en_total_words = utils.build_dict(train_en) cn_dict, cn_total_words = utils.build_dict(train_cn) pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words], open(args.vocab_file, "wb")) args.en_total_words = en_total_words args.cn_total_words = cn_total_words inv_en_dict = {v: k for k, v in en_dict.items()} inv_cn_dict = {v: k for k, v in cn_dict.items()} train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict) train_data = utils.gen_examples(train_en, train_cn, args.batch_size) dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict) dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size) if os.path.isfile(args.model_file): model = torch.load(args.model_file) elif args.model == "EncoderDecoderModel": model = EncoderDecoderModel(args) if args.use_cuda: model = model.cuda() crit = utils.LanguageModelCriterion() learning_rate = args.learning_rate optimizer = getattr(optim, args.optimizer)(model.parameters(), lr=learning_rate) total_num_sentences = 0. total_time = 0. for epoch in range(args.num_epochs): np.random.shuffle(train_data) total_train_loss = 0. total_num_words = 0. for idx, (mb_x, mb_x_mask, mb_y, mb_y_mask) in tqdm(enumerate(train_data)): batch_size = mb_x.shape[0] total_num_sentences += batch_size mb_x = Variable(torch.from_numpy(mb_x)).long() mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long() hidden = model.init_hidden(batch_size) mb_input = Variable(torch.from_numpy(mb_y[:, :-1])).long() mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long() mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:])) if args.use_cuda: mb_x = mb_x.cuda() mb_x_mask = mb_x_mask.cuda() mb_input = mb_input.cuda() mb_out = mb_out.cuda() mb_out_mask = mb_out_mask.cuda() mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden) loss = crit(mb_pred, mb_out, mb_out_mask) num_words = torch.sum(mb_out_mask).data[0] total_train_loss += loss.data[0] * num_words total_num_words += num_words optimizer.zero_grad() loss.backward() optimizer.step() print("training loss: %f" % (total_train_loss / total_num_words))
def handler(req, args): if args.has_key("error_msg"): error_msg = args["error_msg"] else: error_msg = "" if not preprocess.input_matches(req, args, expected_args): return postprocess.fill_page(template_path, "", error_msg, utils.build_dict(expected_args, [])) elif args.has_key("error_msg"): return postprocess.fill_page(template_path, "", error_msg, args) login = args["login"].strip() passwd = args["passwd"] if login == "" or passwd == "": return postprocess.fill_page(template_path, "", "Username or password not specified", args) con = MySQLdb.connect( host = settings.database_settings["host"], user = settings.database_settings["login"], passwd = settings.database_settings["password"], db = settings.database_settings["database"]) cur = con.cursor() try: cur.execute(""" SELECT logins.id, login, passwd, name, surname FROM logins JOIN emails ON logins.emails_ref = emails.id WHERE login=%s and passwd=%s""", (login, md5.new(passwd).digest()) ) result = cur.fetchone() if result is None: return postprocess.fill_page(template_path, "", "Bad username or password", args) name = result[3] surname = result[4] #generate session id session_id = guid.generate("") # ANALYSIS FIX expire_time = utils.get_session_expire_time() cur.execute("""DELETE FROM sessions WHERE expire_time < now()""") cur.execute("""INSERT INTO sessions (session_id, expire_time, user_id) VALUES (%s, %s, %s) """, (session_id, expire_time, result[0])) #set cookie req.headers_out.add( "Set-Cookie", utils.forge_cookie("session", session_id, "/")) #process statistics UserAgent = "" if req.headers_in.has_key("User-Agent"): UserAgent = req.headers_in["User-Agent"] if UserAgent.find("/") > 0: UserAgent = UserAgent[0:UserAgent.find("/")] if len(UserAgent) > 0: cur.execute("SELECT id FROM stat_browser WHERE browser = '%s'" % (UserAgent,)) result = cur.fetchone() if result is None: cur.execute("INSERT INTO stat_browser (browser, counter) VALUES (%s, %s)", (UserAgent, 1)) else: cur.execute("UPDATE stat_browser SET counter = counter + 1 WHERE id = %s", (result[0], )) finally: con.commit() cur.close() con.close() logging.info('Login of %s from %s' % (login, req.connection.remote_ip)) return {"Location":"news.html", "notice_msg":"Hello, %s %s!" % (name, surname)}
def handler(req, args): if not preprocess.input_matches(req, args, expected_args): return postprocess.fill_page(template_path, "", "", utils.build_dict(expected_args, [])) login = args["login"].strip() name = cgi.escape(args["name"].strip()) surname = cgi.escape(args["surname"].strip()) email = args["email"].strip() passwd = args["passwd"] if login == "": return postprocess.fill_page(template_path, "", "'login' is required field and cannot be empty", args) if validator.invalid_login_re.search(login): return postprocess.fill_page(template_path, "", "Only characters, numbers and underscore are allowed in login", args) if passwd == "": return postprocess.fill_page(template_path, "", "'password' is required field and cannot be empty", args) if passwd != args["passwd_confirm"]: return postprocess.fill_page(template_path, "", "Entered passwords do not match", args) if validator.invalid_passwd_re.search(passwd): return postprocess.fill_page(template_path, "", "Whitespaces are not allowed in passwords", args) if email == "": return postprocess.fill_page(template_path, "", "'email' is required field and cannot be empty", args) if not validator.valid_email_re.match(email): return postprocess.fill_page(template_path, "", "You have entered email address in bad format", args) con = MySQLdb.connect( host = settings.database_settings["host"], user = settings.database_settings["login"], passwd = settings.database_settings["password"], db = settings.database_settings["database"]) cur = con.cursor() try: #check if this login was not used cur.execute("SELECT login FROM logins WHERE login=%s", (login, ) ) result = cur.fetchone() if result: return postprocess.fill_page(template_path, "", "The specified login is already used by someone", args) #check if this email was already inserted cur.execute("""SELECT id, email FROM emails WHERE email='%s'""" % (email, ) ) result = cur.fetchone() if result is None: cur.execute("""INSERT INTO emails (email, name, surname) VALUES (%s, %s, %s)""", (email, name, surname)) cur.execute("""SELECT LAST_INSERT_ID() """) result = cur.fetchone() cur.execute("""INSERT INTO logins (login, passwd, emails_ref) VALUES (%s, %s, %s)""", (login, md5.new(passwd).digest(), int(result[0]))) finally: con.commit() cur.close() con.close() return {"Location":"login.html", "notice_msg":"Registration successful!"}
def main(): start = timer() if (os.path.isfile("data/tweets" + str(max_example) + ".npy") and os.path.isfile("data/emojis" + str(max_example) + ".npy")): tweets = np.load("data/tweets" + str(max_example) + ".npy").tolist() emojis = np.load("data/emojis" + str(max_example) + ".npy").tolist() else: tweets, emojis = utils.load_data(path='data/final_train', max_example=max_example) np.save("data/tweets" + str(max_example) + ".npy", np.array(tweets)) np.save("data/emojis" + str(max_example) + ".npy", np.array(emojis)) if (os.path.isfile("data/dev_tweets" + str(max_dev_example) + ".npy") and os.path.isfile("data/dev_emojis" + str(max_dev_example) + ".npy")): dev_tweets = np.load("data/dev_tweets" + str(max_dev_example) + ".npy").tolist() dev_emojis = np.load("data/dev_emojis" + str(max_dev_example) + ".npy").tolist() else: dev_tweets, dev_emojis = utils.load_data(max_example=max_dev_example) np.save("data/dev_tweets" + str(max_dev_example) + ".npy", np.array(dev_tweets)) np.save("data/dev_emojis" + str(max_dev_example) + ".npy", np.array(dev_emojis)) start1 = timer() print(start1 - start) word_dict = utils.build_dict(tweets) # embeddings = utils.generate_embeddings(word_dict, dim=300, pretrained_path='data/glove.6B.300d.txt') embeddings = utils.generate_embeddings(word_dict, dim=300, pretrained_path=None) end0 = timer() print(end0 - start1) x, y = utils.vectorize(tweets, emojis, word_dict) dev_x, dev_y = utils.vectorize(dev_tweets, dev_emojis, word_dict) end1 = timer() print(end1 - end0) batch_size, input_size, hidden_size, output_size, layers = 32, 300, 200, 20, 1 all_train = utils.generate_batches(x, y, batch_size=batch_size) all_dev = utils.generate_batches(dev_x, dev_y, batch_size=batch_size) end2 = timer() print(end2 - end1) # set the parameters # batch_size, input_size, hidden_size, output_size, layers = 64, 50, 200, 20, 1 vocabulary_size = len(embeddings) if run_GRU: print("running GRU...") # initialize the model model = GRU_Classifier(vocabulary_size, input_size, hidden_size, output_size, layers, run_BD_GRU) model.word_embeddings.weight.data = torch.FloatTensor( embeddings.tolist()) if torch.cuda.is_available(): model.cuda() (model.word_embeddings.weight.data).cuda() loss_function = nn.CrossEntropyLoss() if torch.cuda.is_available(): loss_function.cuda() optimizer = optim.Adam(model.parameters(), lr=global_learning_rate) epoch_num = 500 it = 0 best_dev_acc = 0 best_f1 = 0 # model training for epoch in range(epoch_num): np.random.shuffle(all_train) for idx, (mb_x, mb_y, mb_lengths) in enumerate(all_train): # sort the input in descending order according to sentence length # This is required by nn.utils.rnn.pack_padded_sequence sorted_index = len_value_argsort(mb_lengths) mb_x = [mb_x[i] for i in sorted_index] mb_y = [mb_y[i] for i in sorted_index] mb_lengths = [mb_lengths[i] for i in sorted_index] print('#Examples = %d, max_seq_len = %d' % (len(mb_x), len(mb_x[0]))) mb_x = Variable(torch.from_numpy(np.array(mb_x, dtype=np.int64)), requires_grad=False) if torch.cuda.is_available(): mb_x = mb_x.cuda() y_pred = model(mb_x.t(), mb_lengths) mb_y = Variable(torch.from_numpy(np.array(mb_y, dtype=np.int64)), requires_grad=False) if torch.cuda.is_available(): mb_y = mb_y.cuda() loss = loss_function(y_pred, mb_y) # print('epoch ', epoch, 'batch ', idx, 'loss ', loss.data[0]) optimizer.zero_grad() loss.backward(retain_graph=True) optimizer.step() it += 1 if it % 100 == 0: # every 100 updates, check dev accuracy correct = 0 n_examples = 0 ground_truth = [] predicted = [] for idx, (d_x, d_y, d_lengths) 
in enumerate(all_dev): ground_truth += d_y n_examples += len(d_x) sorted_index = len_value_argsort(d_lengths) d_x = [d_x[i] for i in sorted_index] d_y = [d_y[i] for i in sorted_index] d_lengths = [d_lengths[i] for i in sorted_index] d_x = Variable(torch.from_numpy( np.array(d_x, dtype=np.int64)), requires_grad=False) if torch.cuda.is_available(): d_x = d_x.cuda() # use pytorch way to calculate the correct count d_y = Variable(torch.from_numpy( np.array(d_y, dtype=np.int64)), requires_grad=False) if torch.cuda.is_available(): d_y = d_y.cuda() y_pred = model(d_x.t(), d_lengths) predicted += list( torch.max(y_pred, 1)[1].view(d_y.size()).data) correct += (torch.max(y_pred, 1)[1].view( d_y.size()).data == d_y.data).sum() dev_acc = correct / n_examples f1 = f1_score(ground_truth, predicted, average='macro') print("Dev Accuracy: %f, F1 Score: %f" % (dev_acc, f1)) if f1 > best_f1: best_f1 = f1 print("Best F1 Score: %f" % best_f1) gru_output = open('./out/gru_best', 'w') gru_output.write(str(ground_truth) + '\n') gru_output.write(str(predicted) + '\n') gru_output.write(str(best_f1) + ' ' + str(dev_acc)) gru_output.close() if dev_acc > best_dev_acc: best_dev_acc = dev_acc print("Best Dev Accuracy: %f" % best_dev_acc) if run_LSTM: print("Running LSTM...") model = LSTM_Classifier(vocabulary_size, input_size, hidden_size, output_size, layers, run_BD_LSTM) model.word_embeddings.weight.data = torch.FloatTensor( embeddings.tolist()) if torch.cuda.is_available(): model.cuda() (model.word_embeddings.weight.data).cuda() loss_function = nn.CrossEntropyLoss() if torch.cuda.is_available(): loss_function.cuda() optimizer = optim.Adam(model.parameters(), lr=global_learning_rate) it = 0 best_dev_acc = 0 best_f1 = 0 epoch_num = 500 # train LSTM for epoch in range(epoch_num): np.random.shuffle(all_train) for idx, (mb_x, mb_y, mb_lengths) in enumerate(all_train): sorted_index = len_value_argsort(mb_lengths) mb_x = [mb_x[i] for i in sorted_index] mb_y = [mb_y[i] for i in sorted_index] mb_lengths = [mb_lengths[i] for i in sorted_index] print('#Examples = %d, max_seq_len = %d' % (len(mb_x), len(mb_x[0]))) mb_x = Variable(torch.from_numpy(np.array(mb_x, dtype=np.int64)), requires_grad=False) if torch.cuda.is_available(): mb_x = mb_x.cuda() y_pred = model(mb_x.t(), mb_lengths) mb_y = Variable(torch.from_numpy(np.array(mb_y, dtype=np.int64)), requires_grad=False) if torch.cuda.is_available(): mb_y = mb_y.cuda() loss = loss_function(y_pred, mb_y) # print('epoch ', epoch, 'batch ', idx, 'loss ', loss.data[0]) optimizer.zero_grad() loss.backward(retain_graph=True) optimizer.step() it += 1 if it % 100 == 0: # every 100 updates, check dev accuracy correct = 0 n_examples = 0 ground_truth = [] predicted = [] for idx, (d_x, d_y, d_lengths) in enumerate(all_dev): ground_truth += d_y n_examples += len(d_x) sorted_index = len_value_argsort(d_lengths) d_x = [d_x[i] for i in sorted_index] d_y = [d_y[i] for i in sorted_index] d_lengths = [d_lengths[i] for i in sorted_index] d_x = Variable(torch.from_numpy( np.array(d_x, dtype=np.int64)), requires_grad=False) if torch.cuda.is_available(): d_x = d_x.cuda() d_y = Variable(torch.from_numpy( np.array(d_y, dtype=np.int64)), requires_grad=False) if torch.cuda.is_available(): d_y = d_y.cuda() y_pred = model(d_x.t(), d_lengths) predicted += list( torch.max(y_pred, 1)[1].view(d_y.size()).data) correct += (torch.max(y_pred, 1)[1].view( d_y.size()).data == d_y.data).sum() dev_acc = correct / n_examples f1 = f1_score(ground_truth, predicted, average='macro') print("Dev Accuracy: %f, F1 Score: %f" % 
(dev_acc, f1)) if f1 > best_f1: best_f1 = f1 print("Best F1 Score: %f" % best_f1) lstm_output = open('./out/lstm_best', 'w') lstm_output.write(str(ground_truth) + '\n') lstm_output.write(str(predicted) + '\n') lstm_output.write(str(best_f1) + ' ' + str(dev_acc)) lstm_output.close() if dev_acc > best_dev_acc: best_dev_acc = dev_acc print("Best Dev Accuracy: %f" % best_dev_acc)
parser.add_argument("--num_epochs", type=int, default=128, help="Number of epochs.") parser.add_argument("--keep_prob", type=float, default=0.9, help="Dropout keep prob.") parser.add_argument("--restoreInTrain", type=bool, default=True, help="restore in train") parser.add_argument("--toy", action="store_true", help="Use only 50K samples of data") parser.add_argument("--with_model", action="store_true", help="Continue from previously saved model") parser.add_argument("--checkoutPath", type=str, default='saved_model/checkpoint', help='save path') parser = argparse.ArgumentParser() add_arguments(parser) args = parser.parse_args() with open("args.pickle", "rb") as f: args = pickle.load(f) print("Loading dictionary...") word_dict, reversed_dict, article_max_len, summary_max_len = build_dict() print("Loading training dataset...") valid_x, valid_y = get_text_list1(flag="dev") valid_x_len = [len([y for y in x if y != 0]) for x in valid_x] with tf.Session() as sess: print("Loading saved model...") model = getModel(sess, reversed_dict, article_max_len, summary_max_len, args, forward=True) # model = Model(reversed_dict, article_max_len, summary_max_len, args, forward_only=True) # saver = tf.train.Saver(tf.global_variables()) # ckpt = tf.train.get_checkpoint_state("./saved_model/") # if ckpt: # saver.restore(sess, tf.train.latest_checkpoint(ckpt.model_checkpoint_path)) #batches = batch_iter(valid_x, [0] * len(valid_x), args.batch_size, 1)
filelist = utils.GetFileLists(train_path)
if not os.path.exists(tf_path):
    result = utils.ReadDirsToStem(raw_path)
    with open(file=tf_path, mode="w", encoding="ISO-8859-1") as f:
        f.write(str(result))
with open(file=tf_path, mode="r", encoding="ISO-8859-1") as f:
    result = eval(f.read())

result_new = {}
num_document = len(filelist)
print(num_document)
for file in filelist:
    result_new[file] = result[os.path.join(
        raw_path, os.path.join(file.split("/")[-2], file.split("/")[-1]))]

dic = utils.build_dict(result_new, word_frequency=word_frequency,
                       document_frequency=document_frequency)
dic_names = []
for key in dic:
    dic_names.append(key)

vector_space = buildVSM(result_new, dic, dic_names, num_document)
                    # tail of predict(): accumulate each class's log-likelihood
                    # and keep the best-scoring class for every test sample
                    if X_test[i][j] != 0:
                        _prob += math.log(self.prob[int(c) - 1][j]) * X_test[i][j]
                if _prob > _max:
                    _max = _prob
                    _c = c
            y_pred.append(_c)
        return y_pred

    def accuracy(self, y_test, y_pred):
        count = 0
        for i in range(len(y_test)):
            if str(y_test[i]) == str(y_pred[i]):
                count += 1
        print('Acc: %.2f' % (count * 100 / len(y_test)), end=' %')


if __name__ == '__main__':
    X_, y = utils.parse_file('training_data.txt')
    utils.build_dict(X_)
    DICT = utils.load_dict()
    X = np.zeros((len(X_), len(DICT)))
    for i in range(len(X_)):
        X[i] = utils.bag_of_word(X_[i], DICT)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
    model = MultinomialNB(0.1)
    model.fit(X_train, y_train)
    print(X_test.shape[1])
    y_pred = model.predict(X_test)
    model.accuracy(y_test, y_pred)
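# `utils.bag_of_word` above is likewise assumed to turn one tokenized sample
# into a count vector over the loaded dictionary. A rough sketch of that idea
# (hypothetical; the repository's real helper may differ):
import numpy as np


def bag_of_word_sketch(tokens, dictionary):
    """Count occurrences of each dictionary word in `tokens`."""
    index = {word: i for i, word in enumerate(dictionary)}
    vector = np.zeros(len(dictionary))
    for token in tokens:
        if token in index:
            vector[index[token]] += 1
    return vector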
def main(args): logging.info('-' * 50) logging.info('Load data files..') if args.debug: logging.info('*' * 10 + ' Train') train_examples = utils.load_data(args.train_file, 5, relabeling=args.relabeling, remove_notfound=args.remove_notfound) logging.info('*' * 10 + ' Dev') dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling, remove_notfound=False) #elif args.test_only: # logging.info('*' * 10 + ' Train') # #train_examples = utils.load_cnn_data(args.train_file, relabeling=args.relabeling) # docs, qs, ans # train_examples = utils.load_data(args.train_file, relabeling=args.relabeling, remove_notfound=args.remove_notfound) # docs, qs, ans # logging.info('*' * 10 + ' Dev') # dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling, # remove_notfound=False) elif args.cnn_train: logging.info('*' * 10 + ' Train') train_examples = utils.load_cnn_data(args.train_file, relabeling=args.relabeling, has_ids=args.train_has_ids) # docs, qs, ans logging.info('*' * 10 + ' Dev') dev_examples = utils.load_cnn_data(args.dev_file, args.max_dev, relabeling=args.relabeling, has_ids=args.dev_has_ids) else: logging.info('*' * 10 + ' Train') train_examples = utils.load_data(args.train_file, relabeling=args.relabeling, remove_notfound=args.remove_notfound) # docs, qs, ans logging.info('*' * 10 + ' Dev') dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling, remove_notfound=False) args.num_train = len(train_examples[0]) args.num_dev = len(dev_examples[0]) logging.info('-' * 50) logging.info('Build dictionary..') word_dict = utils.build_dict(train_examples[0] + train_examples[1], # + dev_examples[0] + dev_examples[1], max_words=args.max_words) # docs+qs entity_markers = list(set([w for w in word_dict.keys() if w.startswith('@entity')] + train_examples[2])) entity_markers = ['<unk_entity>'] + entity_markers entity_dict = {w: index for (index, w) in enumerate(entity_markers)} inv_entity_dict = {index: w for w, index in entity_dict.items()} assert len(entity_dict) == len(inv_entity_dict) logging.info('Entity markers: %d' % len(entity_dict)) args.num_labels = len(entity_dict) logging.info('-' * 50) # Load embedding file embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file) (args.vocab_size, args.embedding_size) = embeddings.shape logging.info('Compile functions..') train_fn, test_fn, params = build_fn(args, embeddings) logging.info('Done.') logging.info('-' * 50) logging.info(args) logging.info('-' * 50) logging.info('Intial test..') dev_x1, dev_x2, dev_l, dev_y, dev_ids = utils.vectorize(dev_examples, word_dict, entity_dict, remove_notfound=False, relabeling=args.relabeling) if dev_ids is not None: assert len(dev_y) == len(dev_ids) assert len(dev_x1) == args.num_dev all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size) dev_acc, dev_preds = eval_acc(test_fn, all_dev) if dev_ids is not None: assert len(dev_ids) == len(dev_preds) == len(dev_y) dev_preds_data = to_output_preds(dev_ids, dev_preds, inv_entity_dict, args.relabeling) logging.info('Dev accuracy: %.2f %%' % dev_acc) best_acc = dev_acc if args.log_file is not None: assert args.log_file.endswith(".log") run_name = args.log_file[:args.log_file.find(".log")] if dev_ids is not None: preds_file_name = run_name + ".preds" utils.write_preds(dev_preds_data, preds_file_name) utils.external_eval(preds_file_name, run_name + ".preds.scores", eval_data="test" if "test" in os.path.basename(args.dev_file) else "dev") if args.test_only: return if 
args.log_file is not None: utils.save_params(run_name + ".model", params, epoch=0, n_updates=0) # Training logging.info('-' * 50) logging.info('Start training..') train_x1, train_x2, train_l, train_y, train_ids = utils.vectorize(train_examples, word_dict, entity_dict, remove_notfound=args.remove_notfound, relabeling=args.relabeling) assert len(train_x1) == args.num_train start_time = time.time() n_updates = 0 train_accs = [] dev_accs = [] all_train = gen_examples(train_x1, train_x2, train_l, train_y, args.batch_size) improved = [] for epoch in range(args.num_epoches): ep_acc_improved = False np.random.shuffle(all_train) for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) in enumerate(all_train): logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1])) train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) logging.info('Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' % (epoch, idx, len(all_train), train_loss, time.time() - start_time)) n_updates += 1 if n_updates % args.eval_iter == 0: samples = sorted(np.random.choice(args.num_train, min(args.num_train, args.num_dev), replace=False)) sample_train = gen_examples([train_x1[k] for k in samples], [train_x2[k] for k in samples], train_l[samples], [train_y[k] for k in samples], args.batch_size) train_acc, train_preds = eval_acc(test_fn, sample_train) train_accs.append(train_acc) logging.info('Train accuracy: %.2f %%' % train_acc) dev_acc, dev_preds = eval_acc(test_fn, all_dev) dev_accs.append(dev_acc) logging.info('Dev accuracy: %.2f %%' % dev_acc) utils.update_plot(args.eval_iter, train_accs, dev_accs, file_name=args.log_file + ".html") if dev_acc > best_acc: ep_acc_improved = True best_acc = dev_acc logging.info('Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%' % (epoch, n_updates, dev_acc)) if args.log_file is not None: utils.save_params(run_name + ".model", params, epoch=epoch, n_updates=n_updates) if dev_ids is not None: dev_preds_data = to_output_preds(dev_ids, dev_preds, inv_entity_dict, args.relabeling) utils.write_preds(dev_preds_data, preds_file_name) utils.external_eval(preds_file_name, run_name + ".preds.scores", eval_data="dev") improved.append(ep_acc_improved) # early stop if len(improved) > 25 and sum(improved[-3:]) == 0: break
# -*- coding: utf-8 -*-
from pca import kpca_train, pca_train
from utils import build_dict
from cv2_implementation import detect_faces
import sys

if len(sys.argv) < 2:
    print('An input mode is required: \'pca\' or \'kpca\'')
    sys.exit(1)

mode = sys.argv[1]
if mode != 'pca' and mode != 'kpca':
    print('Invalid option. Input mode must be either \'pca\' or \'kpca\'')
    sys.exit(1)

print('Please wait. Training is in progress...')
if mode == 'pca':
    eigenfaces = pca_train()
else:
    eigenfaces = kpca_train()
print('Training ready.')

names = build_dict()
detect_faces(mode, eigenfaces, names)
def main(args): logging.info('-' * 50 + '') logging.info('Loading data...') if args.debug: train_examples = utils.load_data(args.train_file, 100) dev_examples = utils.load_data(args.dev_file, 100) else: train_examples = utils.load_data(args.train_file) dev_examples = utils.load_data(args.dev_file) args.num_train = len(train_examples[1]) args.num_dev = len(dev_examples[1]) logging.info('-' * 50) logging.info('Building dictionary...') word_dict = utils.build_dict(train_examples[0] + train_examples[1]) entity_markers = list( set([w for w in word_dict.keys() if w.startswith('@entity')] + train_examples[2])) entity_markers = ['<entity_unk>'] + entity_markers entity_dict = {w: i for (i, w) in enumerate(entity_markers)} logging.info('# of Entity Markers: %d' % len(entity_dict)) args.num_labels = len(entity_dict) logging.info('-' * 50) logging.info('Generating embedding...') embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file) embeddings = embeddings.astype('float32') args.vocab_size, args.embedding_size = embeddings.shape logging.info('-' * 50) logging.info('Creating TF computation graph...') if args.rnn_type == 'lstm': logging.info('Using LSTM Cells') elif args.rnn_type == 'gru': logging.info('Using GRU Cells') # tf.reset_default_graph() d_input = tf.placeholder(dtype=tf.int32, shape=(None, None), name="d_input") q_input = tf.placeholder( dtype=tf.int32, shape=(None, None), name="q_input") # [batch_size, max_seq_length_for_batch] l_mask = tf.placeholder(dtype=tf.float32, shape=(None, None), name="l_mask") # [batch_size, entity num] y = tf.placeholder(dtype=tf.int32, shape=None, name="label") # batch size vector y_1hot = tf.placeholder( dtype=tf.float32, shape=(None, None), name="label_1hot") # onehot encoding of y [batch_size, entitydict] training = tf.placeholder(dtype=tf.bool) word_embeddings = tf.get_variable( "glove", shape=(args.vocab_size, args.embedding_size), initializer=tf.constant_initializer(embeddings)) W_bilinear = tf.Variable( tf.random_uniform((2 * args.hidden_size, 2 * args.hidden_size), minval=-0.01, maxval=0.01)) with tf.variable_scope( 'd_encoder'): # Encoding Step for Passage (d_ for document) d_embed = tf.nn.embedding_lookup( word_embeddings, d_input ) # Apply embeddings: [batch, max passage length in batch, GloVe Dim] d_embed_dropout = tf.layers.dropout( d_embed, rate=args.dropout_rate, training=training) # Apply Dropout to embedding layer if args.rnn_type == 'lstm': d_cell_fw = rnn.LSTMCell(args.hidden_size) d_cell_bw = rnn.LSTMCell(args.hidden_size) elif args.rnn_type == 'gru': d_cell_fw = rnn.GRUCell( args.hidden_size ) # TODO: kernel_initializer=tf.random_normal_initializer(0,0.1) not working for 1.1 d_cell_bw = rnn.GRUCell(args.hidden_size) d_outputs, _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, d_embed_dropout, dtype=tf.float32) d_output = tf.concat( d_outputs, axis=-1 ) # [batch, len, h], len is the max passage length, and h is the hidden size with tf.variable_scope('q_encoder'): # Encoding Step for Question q_embed = tf.nn.embedding_lookup(word_embeddings, q_input) q_embed_dropout = tf.layers.dropout(q_embed, rate=args.dropout_rate, training=training) if args.rnn_type == 'lstm': q_cell_fw = rnn.LSTMCell(args.hidden_size) q_cell_bw = rnn.LSTMCell(args.hidden_size) elif args.rnn_type == 'gru': q_cell_fw = rnn.GRUCell(args.hidden_size) q_cell_bw = rnn.GRUCell(args.hidden_size) q_outputs, q_laststates = tf.nn.bidirectional_dynamic_rnn( q_cell_fw, q_cell_bw, q_embed_dropout, dtype=tf.float32) if args.rnn_type == 'lstm': 
q_output = tf.concat([q_laststates[0][-1], q_laststates[1][-1]], axis=-1) # (batch, h) elif args.rnn_type == 'gru': q_output = tf.concat(q_laststates, axis=-1) # (batch, h) with tf.variable_scope('bilinear'): # Bilinear Layer (Attention Step) # M computes the similarity between each passage word and the entire question encoding M = d_output * tf.expand_dims(tf.matmul(q_output, W_bilinear), axis=1) # [batch, h] -> [batch, 1, h] # alpha represents the normalized weights representing how relevant the passage word is to the question alpha = tf.nn.softmax(tf.reduce_sum(M, axis=2)) # [batch, len] # this output contains the weighted combination of all contextual embeddings bilinear_output = tf.reduce_sum(d_output * tf.expand_dims(alpha, axis=2), axis=1) # [batch, h] with tf.variable_scope('dense'): # Prediction Step # the final output has dimension [batch, entity#], giving the probabilities of an entity being the answer for examples final_prob = tf.layers.dense( bilinear_output, units=args.num_labels, activation=tf.nn.softmax, kernel_initializer=tf.random_uniform_initializer( minval=-0.01, maxval=0.01)) # [batch, entity#] pred = final_prob * l_mask # ignore entities that don't appear in the passage train_pred = pred / tf.expand_dims( tf.reduce_sum(pred, axis=1), axis=1) # redistribute probabilities ignoring certain labels train_pred = tf.clip_by_value(train_pred, 1e-7, 1.0 - 1e-7) test_pred = tf.cast(tf.argmax(pred, axis=-1), tf.int32) acc = tf.reduce_sum(tf.cast(tf.equal(test_pred, y), tf.int32)) loss_op = tf.reduce_mean( -tf.reduce_sum(y_1hot * tf.log(train_pred), reduction_indices=[1])) optimizer = tf.train.GradientDescentOptimizer( learning_rate=args.learning_rate) train_op = optimizer.minimize(loss_op) logging.info('Done!') logging.info('-' * 50) logging.info('Printing args...') logging.info(args) logging.info('-' * 50) logging.info('Initial Test...') dev_x1, dev_x2, dev_l, dev_y = utils.vectorize(dev_examples, word_dict, entity_dict) all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size) dev_acc = 0. # TODO: first dev accuracy displays here logging.info('Dev Accuracy: %.2f %%' % dev_acc) best_acc = dev_acc saver = tf.train.Saver() logging.info('-' * 50) logging.info('Testing...') if args.test_only: if args.test_file == None: return ValueError("No test file specified") test_examples = utils.load_data(args.test_file) test_x1, test_x2, test_l, test_y = utils.vectorize( test_examples, word_dict, entity_dict) all_test = gen_examples(test_x1, test_x2, test_l, test_y, args.batch_size) with tf.Session() as sess: # saver = tf.train.import_meta_graph(args.model_path + '.meta') saver.restore(sess, args.model_path) # TODO: which file to restore? correct = 0 n_examples = 0 for t_x1, t_mask1, t_x2, t_mask2, t_l, t_y in all_test: correct += sess.run(acc, feed_dict={ d_input: t_x1, q_input: t_x2, y: t_y, l_mask: t_l, training: False }) n_examples += len(t_x1) test_acc = correct * 100. 
/ n_examples logging.info('Test Accuracy: %.2f %%' % test_acc) return logging.info('-' * 50) logging.info('Start training...') train_x1, train_x2, train_l, train_y = utils.vectorize( train_examples, word_dict, entity_dict) all_train = gen_examples(train_x1, train_x2, train_l, train_y, args.batch_size) init = tf.global_variables_initializer() start_time = time.time() n_updates = 0 with tf.Session() as sess: sess.run(init) for e in range(args.num_epoches): np.random.shuffle(all_train) for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) in enumerate(all_train): logging.info( 'Batch Size = %d, # of Examples = %d, max_len = %d' % (mb_x1.shape[0], len(mb_x1), mb_x1.shape[1])) y_label = np.zeros((mb_x1.shape[0], args.num_labels)) for r, i in enumerate( mb_y): # convert (batch) -> (batch, entity_size) y_label[r][i] = 1. _, train_loss = sess.run( [train_op, loss_op], feed_dict={ d_input: mb_x1, q_input: mb_x2, y_1hot: y_label, l_mask: mb_l, training: True }) logging.info( 'Epoch = %d, Iter = %d (max = %d), Loss = %.2f, Elapsed Time = %.2f (s)' % (e, idx, len(all_train), train_loss, time.time() - start_time)) n_updates += 1 if n_updates % args.eval_iter == 0: saver.save(sess, args.model_path, global_step=e) correct = 0 n_examples = 0 for d_x1, d_mask1, d_x2, d_mask2, d_l, d_y in all_dev: correct += sess.run(acc, feed_dict={ d_input: d_x1, q_input: d_x2, y: d_y, l_mask: d_l, training: False }) n_examples += len(d_x1) dev_acc = correct * 100. / n_examples logging.info('Dev Accuracy: %.2f %%' % dev_acc) if dev_acc > best_acc: best_acc = dev_acc logging.info( 'Best Dev Accuracy: epoch = %d, n_updates (iter) = %d, acc = %.2f %%' % (e, n_updates, dev_acc)) logging.info('-' * 50) logging.info('Training Finished...') logging.info("Model saved in file: %s" % saver.save(sess, args.model_path))
embedding_size = 100
embedding_file = 'data/glove.6B/glove.6B.50d.txt'
hidden_size = 128
embedding_file = None  # overrides the GloVe path above, so embeddings are randomly initialized
dropout_rate = 0.2
learning_rate = 0.05
eval_iter = 10
batch_size = 10
file_name = '/Users/yangsun/Desktop/dataset/training_cnn.txt'
val_file_name = '/Users/yangsun/Desktop/dataset/validation_cnn.txt'
model_path = './model_path'

documents, questions, answers = utils.load_data(file_name, 10)
word_dict = utils.build_dict(documents + questions)
documents_val, questions_val, answers_val = utils.load_data(val_file_name, 100)
word_dict_val = utils.build_dict(documents_val + questions_val)

entity_markers = list(set([w for w in word_dict.keys()
                           if w.startswith('@entity')] + answers))
entity_markers = ['<unk_entity>'] + entity_markers
entity_dict = {w: index for (index, w) in enumerate(entity_markers)}
num_labels = len(entity_dict)

embeddings = utils.gen_embeddings(word_dict, embedding_size, embedding_file)
vocab_size, embedding_size = embeddings.shape
# tf.reset_default_graph()
def main(args): logging.info('-' * 50) logging.info('Load data files..') question_belong = [] if args.debug: logging.info('*' * 10 + ' Train') train_examples = utils.load_data(args.train_file, 100) logging.info('*' * 10 + ' Dev') dev_examples = utils.load_data(args.dev_file, 100) test_examples = dev_examples else: logging.info('*' * 10 + ' Train') train_examples = utils.load_data(args.train_file) logging.info('*' * 10 + ' Dev') dev_examples = utils.load_data(args.dev_file) test_examples = utils.load_data(args.test_file) args.num_train = len(train_examples) args.num_dev = len(dev_examples) args.relations = len(train_examples[0]) logging.info('-' * 50) logging.info('Build dictionary..') word_dicts, inv_word_dicts = utils.build_dict(train_examples, args.max_cat) logging.info('-' * 50) logging.info('Build dictionary..') word_dicts, inv_word_dicts = utils.build_dict(train_examples, args.max_cat) default_value = [] for word_dict in word_dicts: default_value.append(word_dict['']) #logging.info(word_dicts[1]) #logging.info(inv_word_dicts[1]) #utils.store_labels_to_pkl(inv_word_dicts) #sys.exit(0) args.default_value = default_value embeddings = utils.gen_embeddings(word_dicts, args.embedding_size) train_fn, test_fn, params = build_fn(args, embeddings) logging.info('Done.') logging.info('-' * 50) logging.info(args) topk_acc=args.topk_accuracy #topk_acc=1 labels_data=[] if args.test_print_allowed: labels_data=pickle.load(open(labels_file, 'rb')) logging.info('-' * 50) logging.info('Intial test..') dev_data, dev_mask = utils.vectorize(dev_examples, word_dicts, args) all_dev = gen_examples(dev_data, dev_mask, args.batch_size) dev_acc = eval_acc(test_fn, all_dev, inv_word_dicts, topk_acc) logging.info('Dev accuracy: %s %%' % str(dev_acc)) test_data, test_mask = utils.vectorize(test_examples, word_dicts, args, args.test_print_allowed, labels_data) all_test = gen_examples(test_data, test_mask, args.batch_size) test_acc = eval_acc(test_fn, all_test, inv_word_dicts, topk_acc, args.test_print_allowed, labels_data) logging.info('Test accuracy: %s %%' % str(test_acc)) best_acc = dev_acc if args.test_only: return utils.save_params(args.model_file, params, epoch=0, n_updates=0) #utils.store_labels_to_pkl(inv_word_dicts) # Training if args.num_epoches>0: logging.info('-' * 50) logging.info('Start training..') train_data, train_mask = utils.vectorize(train_examples, word_dicts, args) start_time = time.time() n_updates = 0 all_train_old = gen_examples(train_data, train_mask, args.batch_size) all_train=utils.oversample(all_train_old, args) no_progress=0 for epoch in range(args.num_epoches): np.random.shuffle(all_train) for idx, inps in enumerate(all_train): train_loss = train_fn(*inps) if idx % 1000 == 0: #logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1])) logging.info('Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' % (epoch, idx, len(all_train), train_loss, time.time() - start_time)) n_updates += 1 if n_updates % args.eval_iter == 0: samples = sorted(np.random.choice(args.num_train, min(args.num_train, args.num_dev), replace=False)) train_data_sample = [train_data[j][samples] for j in range(args.relations)] train_mask_sample = [train_mask[j][samples] for j in range(args.relations)] sample_train = gen_examples(train_data_sample, train_mask_sample, args.batch_size) #acc = eval_acc(test_fn, sample_train) #logging.info('Train accuracy: %s %%' % str(acc)) dev_acc = eval_acc(test_fn, all_dev, inv_word_dicts, topk_acc) logging.info('Dev accuracy: %s %%' % str(dev_acc)) 
#test_acc = eval_acc(test_fn, all_test) #logging.info('Test accuracy: %s %%' % str(test_acc)) if dev_acc > best_acc: best_acc = dev_acc logging.info('Best dev accuracy!') utils.save_params(args.model_file, params, epoch=epoch, n_updates=n_updates) no_progress=0 else: no_progress+=1 logging.info('Dev accuracy has not improved in the past %d evaluations' % no_progress) if no_progress>=MAX_NO_PROGRESS: logging.info("Reached the limit of stagnation. Exiting now...") sys.exit(0)
def train(topology, train_data_dir=None, test_data_dir=None, word_dict_path=None, label_dict_path=None, model_save_dir="models", batch_size=32, num_passes=10): if not os.path.exists(model_save_dir): os.mkdir(model_save_dir) use_default_data = (train_data_dir is None) if use_default_data: logger.info(("No training data are provided, " "use paddle.dataset.imdb to train the model.")) logger.info("please wait to build the word dictionary ...") word_dict = paddle.dataset.imdb.word_dict() train_reader = paddle.batch(paddle.reader.shuffle( lambda: paddle.dataset.imdb.train(word_dict)(), buf_size=51200), batch_size=100) test_reader = paddle.batch(lambda: paddle.dataset.imdb.test(word_dict) (), batch_size=100) class_num = 2 else: if word_dict_path is None or not os.path.exists(word_dict_path): logger.info(("word dictionary is not given, the dictionary " "is automatically built from the training data.")) build_dict(data_dir=train_data_dir, save_path=word_dict_path, use_col=1, cutoff_fre=5, insert_extra_words=["<UNK>"]) if not os.path.exists(label_dict_path): logger.info(("label dictionary is not given, the dictionary " "is automatically built from the training data.")) # build the label dictionary to map the original string-typed # label into integer-typed index build_dict(data_dir=train_data_dir, save_path=label_dict_path, use_col=0) word_dict = load_dict(word_dict_path) lbl_dict = load_dict(label_dict_path) class_num = len(lbl_dict) logger.info("class number is : %d." % (len(lbl_dict))) train_reader = paddle.batch(paddle.reader.shuffle(reader.train_reader( train_data_dir, word_dict, lbl_dict), buf_size=51200), batch_size=batch_size) if test_data_dir is not None: # here, because training and testing data share a same format, # we still use the reader.train_reader to read the testing data. test_reader = paddle.batch(reader.train_reader( test_data_dir, word_dict, lbl_dict), batch_size=batch_size) else: test_reader = None dict_dim = len(word_dict) logger.info("length of word dictionary is : %d." % (dict_dim)) paddle.init(use_gpu=False, trainer_count=1) # network config cost, prob, label = topology(dict_dim, class_num) # create parameters parameters = paddle.parameters.create(cost) # create optimizer adam_optimizer = paddle.optimizer.Adam( learning_rate=1e-3, regularization=paddle.optimizer.L2Regularization(rate=1e-3), model_average=paddle.optimizer.ModelAverage(average_window=0.5)) # create trainer trainer = paddle.trainer.SGD(cost=cost, extra_layers=paddle.evaluator.auc( input=prob, label=label), parameters=parameters, update_equation=adam_optimizer) # begin training network feeding = {"word": 0, "label": 1} def _event_handler(event): """ Define end batch and end pass event handler """ if isinstance(event, paddle.event.EndIteration): if event.batch_id % 100 == 0: logger.info( "Pass %d, Batch %d, Cost %f, %s\n" % (event.pass_id, event.batch_id, event.cost, event.metrics)) if isinstance(event, paddle.event.EndPass): if test_reader is not None: result = trainer.test(reader=test_reader, feeding=feeding) logger.info("Test at Pass %d, %s \n" % (event.pass_id, result.metrics)) with gzip.open( os.path.join(model_save_dir, "cnn_params_pass_%05d.tar.gz" % event.pass_id), "w") as f: trainer.save_parameter_to_tar(f) trainer.train(reader=train_reader, event_handler=_event_handler, feeding=feeding, num_passes=num_passes) logger.info("Training has finished.")
def main(args): logging.info('-' * 50) logging.info('Load data files..') if args.debug: logging.info('*' * 10 + ' Train') train_examples = utils.load_data(args.train_file, 100, relabeling=args.relabeling) logging.info('*' * 10 + ' Dev') dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling) else: logging.info('*' * 10 + ' Train') train_examples = utils.load_data(args.train_file, relabeling=args.relabeling) logging.info('*' * 10 + ' Dev') dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling) args.num_train = len(train_examples[0]) args.num_dev = len(dev_examples[0]) logging.info('-' * 50) logging.info('Build dictionary..') word_dict = utils.build_dict(train_examples[0] + train_examples[1]) entity_markers = list(set([w for w in word_dict.keys() if w.startswith('@entity')] + train_examples[2])) entity_markers = ['<unk_entity>'] + entity_markers entity_dict = {w: index for (index, w) in enumerate(entity_markers)} logging.info('Entity markers: %d' % len(entity_dict)) args.num_labels = len(entity_dict) logging.info('-' * 50) # Load embedding file embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file) (args.vocab_size, args.embedding_size) = embeddings.shape logging.info('Compile functions..') train_fn, test_fn, params = build_fn(args, embeddings) logging.info('Done.') logging.info('-' * 50) logging.info(args) logging.info('-' * 50) logging.info('Intial test..') dev_x1, dev_x2, dev_l, dev_y = utils.vectorize(dev_examples, word_dict, entity_dict) assert len(dev_x1) == args.num_dev all_dev = gen_examples(dev_x1, dev_x2, dev_l, dev_y, args.batch_size) dev_acc = eval_acc(test_fn, all_dev) logging.info('Dev accuracy: %.2f %%' % dev_acc) best_acc = dev_acc if args.test_only: return utils.save_params(args.model_file, params, epoch=0, n_updates=0) # Training logging.info('-' * 50) logging.info('Start training..') train_x1, train_x2, train_l, train_y = utils.vectorize(train_examples, word_dict, entity_dict) assert len(train_x1) == args.num_train start_time = time.time() n_updates = 0 all_train = gen_examples(train_x1, train_x2, train_l, train_y, args.batch_size) for epoch in range(args.num_epoches): np.random.shuffle(all_train) for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) in enumerate(all_train): logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1])) train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y) logging.info('Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' % (epoch, idx, len(all_train), train_loss, time.time() - start_time)) n_updates += 1 if n_updates % args.eval_iter == 0: samples = sorted(np.random.choice(args.num_train, min(args.num_train, args.num_dev), replace=False)) sample_train = gen_examples([train_x1[k] for k in samples], [train_x2[k] for k in samples], train_l[samples], [train_y[k] for k in samples], args.batch_size) logging.info('Train accuracy: %.2f %%' % eval_acc(test_fn, sample_train)) logging.info('Dev accuracy: %.2f %%' % eval_acc(test_fn, all_dev)) if dev_acc > best_acc: best_acc = dev_acc logging.info('Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%' % (epoch, n_updates, dev_acc)) utils.save_params(args.model_file, params, epoch=epoch, n_updates=n_updates)
def main(args): # load sentences (English and Chinese words) train_en, train_cn = utils.load_data(args.train_file) dev_en, dev_cn = utils.load_data(args.dev_file) args.num_train = len(train_en) args.num_dev = len(dev_en) # build English and Chinese dictionary if os.path.isfile(args.vocab_file): en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(open(args.vocab_file, "rb")) else: en_dict, en_total_words = utils.build_dict(train_en) cn_dict, cn_total_words = utils.build_dict(train_cn) pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words], open(args.vocab_file, "wb")) args.en_total_words = en_total_words args.cn_total_words = cn_total_words # index to words dict inv_en_dict = {v: k for k, v in en_dict.items()} inv_cn_dict = {v: k for k, v in cn_dict.items()} # encode train and dev sentences into indieces train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict) # convert to numpy tensors train_data = utils.gen_examples(train_en, train_cn, args.batch_size) dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict) dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size) # code.interact(local=locals()) if os.path.isfile(args.model_file): model = torch.load(args.model_file) elif args.model == "EncoderDecoderModel": model = EncoderDecoderModel(args) if args.use_cuda: model = model.cuda() crit = utils.LanguageModelCriterion() print("start evaluating on dev...") correct_count, loss, num_words = eval(model, dev_data, args, crit) loss = loss / num_words acc = correct_count / num_words print("dev loss %s" % (loss) ) print("dev accuracy %f" % (acc)) print("dev total number of words %f" % (num_words)) best_acc = acc learning_rate = args.learning_rate optimizer = getattr(optim, args.optimizer)(model.parameters(), lr=learning_rate) total_num_sentences = 0. total_time = 0. for epoch in range(args.num_epoches): np.random.shuffle(train_data) total_train_loss = 0. total_num_words = 0. 
for idx, (mb_x, mb_x_mask, mb_y, mb_y_mask) in tqdm(enumerate(train_data)): batch_size = mb_x.shape[0] total_num_sentences += batch_size # convert numpy ndarray to PyTorch tensors and variables mb_x = Variable(torch.from_numpy(mb_x)).long() mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long() hidden = model.init_hidden(batch_size) mb_input = Variable(torch.from_numpy(mb_y[:,:-1])).long() mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long() mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:])) if args.use_cuda: mb_x = mb_x.cuda() mb_x_mask = mb_x_mask.cuda() mb_input = mb_input.cuda() mb_out = mb_out.cuda() mb_out_mask = mb_out_mask.cuda() mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden) loss = crit(mb_pred, mb_out, mb_out_mask) num_words = torch.sum(mb_out_mask).data[0] total_train_loss += loss.data[0] * num_words total_num_words += num_words optimizer.zero_grad() loss.backward() optimizer.step() print("training loss: %f" % (total_train_loss / total_num_words)) # evaluate every eval_epoch if (epoch+1) % args.eval_epoch == 0: print("start evaluating on dev...") correct_count, loss, num_words = eval(model, dev_data, args, crit) loss = loss / num_words acc = correct_count / num_words print("dev loss %s" % (loss) ) print("dev accuracy %f" % (acc)) print("dev total number of words %f" % (num_words)) # save model if we have the best accuracy if acc >= best_acc: torch.save(model, args.model_file) best_acc = acc print("model saved...") else: learning_rate *= 0.5 optimizer = getattr(optim, args.optimizer)(model.parameters(), lr=learning_rate) print("best dev accuracy: %f" % best_acc) print("#" * 60) # load test data test_en, test_cn = utils.load_data(args.test_file) args.num_test = len(test_en) test_en, test_cn = utils.encode(test_en, test_cn, en_dict, cn_dict) test_data = utils.gen_examples(test_en, test_cn, args.batch_size) # evaluate on test correct_count, loss, num_words = eval(model, test_data, args, crit) loss = loss / num_words acc = correct_count / num_words print("test loss %s" % (loss) ) print("test accuracy %f" % (acc)) print("test total number of words %f" % (num_words)) # evaluate on train correct_count, loss, num_words = eval(model, train_data, args, crit) loss = loss / num_words acc = correct_count / num_words print("train loss %s" % (loss) ) print("train accuracy %f" % (acc))
from __future__ import print_function from textblob import TextBlob import utils if __name__ == "__main__": result = utils.ReadDirsToStem("../20news-18828/alt.atheism") with open(file="../data/tf.txt", mode="w", encoding="ISO-8859-1") as f: f.write(str(result)) with open(file="../data/tf.txt", mode="r", encoding="ISO-8859-1") as f: result_new = eval(f.read()) utils.build_dict(result_new, frequency=50) print(len(utils.load_dict(filePath="../data/dict_50.txt")))
def main(args): logging.info('-' * 50) logging.info('Load data files..') question_belong = [] if args.debug: logging.info('*' * 10 + ' Train') train_examples = utils.load_data(args.train_file, 100, relabeling=args.relabeling) logging.info('*' * 10 + ' Dev') dev_examples = utils.load_data(args.dev_file, 100, relabeling=args.relabeling, question_belong=question_belong) else: logging.info('*' * 10 + ' Train') train_examples = utils.load_data(args.train_file, relabeling=args.relabeling) logging.info('*' * 10 + ' Dev') dev_examples = utils.load_data(args.dev_file, args.max_dev, relabeling=args.relabeling, question_belong=question_belong) args.num_train = len(train_examples[0]) args.num_dev = len(dev_examples[0]) logging.info('-' * 50) logging.info('Build dictionary..') word_dict = utils.build_dict( train_examples[0] + train_examples[1] + train_examples[2], args.max_vocab_size) logging.info('-' * 50) embeddings = utils.gen_embeddings(word_dict, args.embedding_size, args.embedding_file) (args.vocab_size, args.embedding_size) = embeddings.shape logging.info('Compile functions..') train_fn, test_fn, params, all_params = build_fn(args, embeddings) logging.info('Done.') logging.info('-' * 50) logging.info(args) logging.info('-' * 50) logging.info('Intial test..') dev_x1, dev_x2, dev_x3, dev_y = utils.vectorize( dev_examples, word_dict, sort_by_len=not args.test_only, concat=args.concat) word_dict_r = {} word_dict_r[0] = "unk" assert len(dev_x1) == args.num_dev all_dev = gen_examples(dev_x1, dev_x2, dev_x3, dev_y, args.batch_size, args.concat) dev_acc, pred = eval_acc(test_fn, all_dev) logging.info('Dev accuracy: %.2f %%' % dev_acc) best_acc = dev_acc if args.test_only: return utils.save_params(args.model_file, all_params, epoch=0, n_updates=0) # Training logging.info('-' * 50) logging.info('Start training..') train_x1, train_x2, train_x3, train_y = utils.vectorize(train_examples, word_dict, concat=args.concat) assert len(train_x1) == args.num_train start_time = time.time() n_updates = 0 all_train = gen_examples(train_x1, train_x2, train_x3, train_y, args.batch_size, args.concat) for epoch in range(args.num_epoches): np.random.shuffle(all_train) for idx, (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_x3, mb_mask3, mb_y) in enumerate(all_train): train_loss = train_fn(mb_x1, mb_mask1, mb_x2, mb_mask2, mb_x3, mb_mask3, mb_y) if idx % 100 == 0: logging.info('#Examples = %d, max_len = %d' % (len(mb_x1), mb_x1.shape[1])) logging.info( 'Epoch = %d, iter = %d (max = %d), loss = %.2f, elapsed time = %.2f (s)' % (epoch, idx, len(all_train), train_loss, time.time() - start_time)) n_updates += 1 if n_updates % args.eval_iter == 0: samples = sorted( np.random.choice(args.num_train, min(args.num_train, args.num_dev), replace=False)) sample_train = gen_examples( [train_x1[k] for k in samples], [train_x2[k] for k in samples], [train_x3[k * 4 + o] for k in samples for o in range(4)], [train_y[k] for k in samples], args.batch_size, args.concat) acc, pred = eval_acc(test_fn, sample_train) logging.info('Train accuracy: %.2f %%' % acc) dev_acc, pred = eval_acc(test_fn, all_dev) logging.info('Dev accuracy: %.2f %%' % dev_acc) if dev_acc > best_acc: best_acc = dev_acc logging.info( 'Best dev accuracy: epoch = %d, n_udpates = %d, acc = %.2f %%' % (epoch, n_updates, dev_acc)) utils.save_params(args.model_file, all_params, epoch=epoch, n_updates=n_updates)
type=str) parser.add_argument('--test_date_path', default='./dateset/cnn/questions/test', type=str) parser.add_argument('--glove_path', default='/nfs/users/guanxin/cache/.vector_cache', type=str) config = parser.parse_args() documents, questions, answers, doc_len, qus_len = utils.load_data( config.train_date_path, config.train_num, True) test_documents, test_questions, test_answers, test_doc_len, test_qus_len = utils.load_data( config.test_date_path, 3000, True) # build word dict word_dict = utils.build_dict(documents + questions) embedding = Parameter(utils.embedding_word(word_dict, config.glove_path)) # build entity dict (numbers of categories) entity_markers = list( set([w for w in word_dict.keys() if w.startswith('@entity')] + answers)) entity_markers = ['<unk_entity>'] + entity_markers entity_dict = {w: index for (index, w) in enumerate(entity_markers)} doc_maxlen = max(map(len, (d.split(' ') for d in documents))) query_maxlen = max(map(len, (q.split(' ') for q in questions))) # data preprocessing, convert to one-hot train_x1, train_x2, train_l, train_y = utils.vectorize(documents, questions, answers, word_dict, entity_dict, doc_maxlen,
def train(topology, train_data_dir=None, test_data_dir=None, word_dict_path=None, label_dict_path=None, model_save_dir="models", use_cuda=False, window_size=5, learning_rate=0.001, batch_size=64, num_passes=10): """ train window_net model or sentence_net model :params train_data_path: path of training data, if this parameter is not specified, Brown Corpus will be used to run this example :type train_data_path: str :params test_data_path: path of testing data, if this parameter is not specified, Brown Corpus will be used to run this example :type test_data_path: str :params word_dict_path: path of word dictionary data, if this parameter is not specified, a default dictionary file will be used to run this example :type word_dict_path: str :params label_dict_path: path of label dictionary data, if this parameter is not specified, a default dictionary file will be used to run this example :type label_dict_path: str :params use_cuda: whether use the cuda :type use_cuda: bool :params window_size: size of window width :type window_size: int :params num_pass: train pass number :type num_pass: int """ if not os.path.exists(model_save_dir): os.mkdir(model_save_dir) use_default_data = (train_data_dir is None) if use_default_data: logger.info(("No training data are provided, " "use Brown corpus to train the model.")) logger.info("downloading Brown corpus...") train_data_dir, test_data_dir, word_dict_path, label_dict_path = load_default_data( ) logger.info("please wait to build the word dictionary ...") if word_dict_path is None or not os.path.exists(word_dict_path): logger.info(("word dictionary is not given, the dictionary " "is automatically built from the training data.")) # build the word dictionary to map the original string-typed # words into integer-typed index build_dict(data_dir=train_data_dir, save_path=word_dict_path, use_col=0, cutoff_fre=1, insert_extra_words=["<UNK>"]) logger.info("the word dictionary path is %s" % word_dict_path) if not os.path.exists(label_dict_path): logger.info(("label dictionary is not given, the dictionary " "is automatically built from the training data.")) # build the label dictionary to map the original string-typed # label into integer-typed index build_dict(data_dir=train_data_dir, save_path=label_dict_path, use_col=1, cutoff_fre=10, insert_extra_words=["<UNK>"]) logger.info("the label dictionary path is %s" % label_dict_path) # get index info word_dict = load_dict(word_dict_path) lbl_dict = load_dict(label_dict_path) class_num = len(lbl_dict) logger.info("class number is : %d." % (len(lbl_dict))) # get train data reader train_reader = paddle.batch(paddle.reader.shuffle(reader.train_reader( train_data_dir, word_dict, lbl_dict, window_size), buf_size=51200), batch_size=batch_size) # get test data reader if test_data_dir is not None: # here, because training and testing data share a same format, # we still use the reader.train_reader to read the testing data. test_reader = paddle.batch(reader.train_reader(test_data_dir, word_dict, lbl_dict, window_size), batch_size=batch_size) else: test_reader = None # get size of word dictionary dict_dim = len(word_dict) + 1 logger.info("length of word dictionary is : %d." 
% (dict_dim)) # define the input layers data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1) label = fluid.layers.data(name="label", shape=[1], dtype="int64") # return the network result cost, acc, prediction = topology(data, label, dict_dim, class_num=class_num) # create optimizer sgd_optimizer = fluid.optimizer.Adam(learning_rate=learning_rate) sgd_optimizer.minimize(cost) # create trainer place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) feeder = fluid.DataFeeder(feed_list=[data, label], place=place) # initialize training network exe.run(fluid.default_startup_program()) prog = fluid.default_main_program() # begin training network for pass_id in range(num_passes): ## running the train data data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0 for i, data_ in enumerate(train_reader()): avg_cost_np, avg_acc_np = exe.run(prog, feed=feeder.feed(data_), fetch_list=[cost, acc]) data_size = len(data_) total_acc += data_size * avg_acc_np total_cost += data_size * avg_cost_np data_count += data_size if (i + 1) % 1000 == 0: logger.info( "pass_id: %d, batch %d, avg_acc: %f, avg_cost: %f" % (pass_id, i + 1, total_acc / data_count, total_cost / data_count)) avg_cost = total_cost / data_count avg_acc = total_acc / data_count logger.info("Train result -- pass_id: %d, avg_acc: %f, avg_cost: %f" % (pass_id, avg_acc, avg_cost)) ## running the test data if test_reader is not None: data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0 for i, data in enumerate(test_reader()): avg_cost_np, avg_acc_np, prediction_np = exe.run( prog, feed=feeder.feed(data), fetch_list=[cost, acc, prediction]) data_size = len(data) total_acc += data_size * avg_acc_np total_cost += data_size * avg_cost_np data_count += data_size avg_cost = total_cost / data_count avg_acc = total_acc / data_count logger.info( "Test result -- pass_id: %d, avg_acc: %f, avg_cost: %f" % (pass_id, avg_acc, avg_cost)) ## save inference model epoch_model = model_save_dir + "/" + args.nn_type + "_epoch" + str( pass_id % 5) logger.info("Saving inference model at %s" % (epoch_model)) ##prediction is the topology return value ##if we use the prediction value as the infer result fluid.io.save_inference_model(epoch_model, ["words"], prediction, exe) logger.info("Training has finished.")
import tensorflow as tf import pickle from model import Model from utils import build_dict, build_dataset, batch_iter, build_deploy import numpy as np import time t2 = time.time() with open("args.pickle", "rb") as f: args = pickle.load(f) str_from = 'five-time world champion michelle kwan withdrew from the #### us figure skating championships on wednesday , but will petition us skating officials for the chance to compete at the #### turin olympics .' print("Loading dictionary...") word_dict, reversed_dict, article_max_len, summary_max_len = build_dict( "valid", args.toy) valid_x, valid_y = build_deploy(str_from, word_dict, article_max_len, summary_max_len) valid_x_len = list(map(lambda x: len([y for y in x if y != 0]), valid_x)) sess = tf.InteractiveSession() print("Loading saved model...") t1 = time.time() model = Model(reversed_dict, article_max_len, summary_max_len, args, forward_only=True) saver = tf.train.Saver(tf.global_variables()) ckpt = tf.train.get_checkpoint_state("./saved_model/") saver.restore(sess, ckpt.model_checkpoint_path)
print('Processing train files') train_sentences, train_labels = utils.get_sen_and_labels(train_files) print('Processing val files') val_sentences, val_labels = utils.get_sen_and_labels(val_files) train_size = len(train_sentences) val_size = len(val_sentences) print_after = train_size // (FLAGS.num_gpus * FLAGS.batch_size) val_steps = val_size // (FLAGS.num_gpus * FLAGS.batch_size) max_steps = FLAGS.num_epochs * print_after sentences = train_sentences + val_sentences labels = train_labels + val_labels word2idx, idx2word = utils.build_dict(sentences, True) label2idx, idx2label = utils.build_dict(labels, False) vocabulary_size = len(word2idx) train_gen = utils.batches_generator(train_size, train_sentences, train_labels, word2idx, label2idx) val_gen = utils.batches_generator(val_size, val_sentences, val_labels, word2idx, label2idx) X_train, y_train = next(train_gen) X_val, y_val = next(val_gen) #print(X_train, y_train) assert X_train.shape[0] == y_train.shape[0], 'train vectors shape mismatch' assert X_val.shape[0] == y_val.shape[0], 'val vectors shape mismatch'