def generate_summaries(path):
    """ Write a short extractive summary next to every *.edus file in path

    Each ``<name>.edus`` file is parsed with the pickled parsing model,
    ``calc_marcu`` selects the top-scoring EDUs, the text of every selected
    EDU is cleaned up, and the cleaned texts are joined with single spaces
    and written to ``<name>.summary``.

    Side effect: ``edu.text`` of every selected EDU is overwritten with its
    cleaned-up text.

    :type path: string
    :param path: directory that holds the *.edus files
    """
    from os import listdir
    from os.path import join as joinpath
    suffix = '.edus'
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path)
               if fname.endswith(suffix)]
    for fedus in doclist:
        pred_rst = parse(pm, fedus=fedus)
        # You can pass in either summary_factor or summary_p
        # summary_p hardcodes it to p sentences
        # summary_factor is a percentage of the edu length
        top_scoring = calc_marcu(pred_rst, summary_p=2)
        # Swap only the trailing extension; str.replace would also rewrite
        # any '.edus' that happens to occur earlier in the path.
        summary_fname = fedus[:-len(suffix)] + '.summary'
        sentences = []
        # Get top scoring and format it appropriately
        for edu in top_scoring:
            text = edu.text.strip()
            tokens = word_tokenize(text)
            # Drop every token tagged VBG together with the token at
            # head_words_indices[idx] - 1.
            # NOTE(review): the "- 1" presumably converts a 1-based head
            # index into a 0-based token position -- confirm.
            vbg_indices = [i for i, tag in enumerate(edu.tags) if tag == 'VBG']
            drop_indices = set(vbg_indices)
            for idx in vbg_indices:
                drop_indices.add(edu.head_words_indices[idx] - 1)
            text = ' '.join([tok for i, tok in enumerate(tokens)
                             if i not in drop_indices])
            # Remove initial adverbials. Only the leading occurrence is cut
            # off; later occurrences of the same phrase are left untouched.
            for stop_phrase in adverbial_phr_list:
                with_comma = stop_phrase + ', '
                if text.startswith(with_comma):
                    text = text[len(with_comma):].strip()
                elif text.startswith(stop_phrase):
                    text = text[len(stop_phrase):].strip()
            if not text:
                # Nothing survived the clean-up: skip this EDU instead of
                # failing on text[0] below.
                edu.text = text
                continue
            # Format so capitalization is correct for our new sentence
            text = text[0].upper() + text[1:]
            edu.text = text
            sentences.append(str(text))
        # Form raw sentences for summary from chosen edu's
        summary = ' '.join(sentences).replace('\t', '').strip()
        # The context manager closes the file even if the write fails
        with open(summary_fname, 'w') as fout:
            fout.write(summary)
def trainmodel():
    """ Train a parsing model from the pickled data and save it into file

    Reads ``vocab.pickle.gz`` (keys ``vocab`` and ``labelidxmap``) and
    ``training-data.pickle.gz`` (keys ``matrix`` and ``labels``) from the
    current working directory, trains a ParsingModel on them and saves the
    result to ``parsing-model.pickle.gz``.
    """
    fvocab = "vocab.pickle.gz"
    fdata = "training-data.pickle.gz"
    # NOTE(review): load() is presumably a pickle loader; unpickling runs
    # arbitrary code, so these files must come from a trusted source.
    # The with-blocks close the gzip handles, which were previously leaked.
    with gzip.open(fvocab) as fin:
        D = load(fin)
    vocab, labelidxmap = D['vocab'], D['labelidxmap']
    with gzip.open(fdata) as fin:
        D = load(fin)
    trnM, trnL = D['matrix'], D['labels']
    # ParsingModel takes the index -> label direction of the mapping
    idxlabelmap = reversedict(labelidxmap)
    pm = ParsingModel(vocab=vocab, idxlabelmap=idxlabelmap)
    pm.train(trnM, trnL)
    pm.savemodel("parsing-model.pickle.gz")
def evalparser(path='./examples', report=False, bcvocab=None, draw=True,
               withdp=False, fdpvocab=None, fprojmat=None):
    """ Test the parsing performance

    Every ``<name>.merge`` file in ``path`` is parsed; the predicted
    brackets are written to ``<name>.brackets`` and, when ``draw`` is set,
    the predicted tree is rendered to ``<name>.ps``.

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score; requires a
                   gold ``<name>.dis`` file next to every *.merge file

    :param bcvocab: passed unchanged to ``pm.sr_parse``

    :type draw: boolean
    :param draw: whether to draw each predicted tree with ``drawrst``

    :param withdp: forwarded to the ParsingModel constructor
    :param fdpvocab: forwarded to the ParsingModel constructor
    :param fprojmat: forwarded to the ParsingModel constructor
    """
    suffix = '.merge'
    # ----------------------------------------
    # Load the parsing model
    print('Load parsing model ...')
    pm = ParsingModel(withdp=withdp, fdpvocab=fdpvocab, fprojmat=fprojmat)
    pm.loadmodel("model/parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path)
               if fname.endswith(suffix)]
    for fmerge in doclist:
        # Strip the trailing extension once; str.replace would also rewrite
        # any '.merge' that occurs earlier in the path.
        fbase = fmerge[:-len(suffix)]
        # ----------------------------------------
        # Read *.merge file
        dr = DocReader()
        doc = dr.read(fmerge)
        # ----------------------------------------
        # Parsing
        pred_rst = pm.sr_parse(doc, bcvocab)
        if draw:
            strtree = pred_rst.parse()
            drawrst(strtree, fbase + ".ps")
        # Get brackets from parsing results and write them into file
        pred_brackets = pred_rst.bracketing()
        writebrackets(fbase + '.brackets', pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            gold_rst = RSTTree(fbase + '.dis', fmerge)
            gold_rst.build()
            # Return value is not used; the call is kept in case
            # bracketing() has side effects on the tree.
            gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
def evalparser(path='./examples', report=False):
    """ Test the parsing performance

    Every ``<name>.edus`` file in ``path`` is parsed using its
    ``<name>.edus.pos`` and ``<name>.edus.dep`` companions, and the
    predicted brackets are written to ``<name>.brackets``.

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score; requires a
                   gold ``<name>.dis`` file next to every *.edus file
    """
    from os import listdir
    from os.path import join as joinpath
    suffix = '.edus'
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path)
               if fname.endswith(suffix)]
    for fedus in doclist:
        # Strip the trailing extension once. The previous
        # replace('edus', ...) had no leading dot and rewrote every "edus"
        # in the path, including directory and base names.
        fbase = fedus[:-len(suffix)]
        # ----------------------------------------
        # Parsing
        fpos = fedus + ".pos"
        d_pos = get_d_pos(fpos)
        fdep = fedus + ".dep"
        d_dep = get_d_dep(fdep)
        pred_rst = parse(pm, fedus=fedus, d_pos=d_pos, d_dep=d_dep)
        # Get brackets from parsing results and write them into file
        pred_brackets = pred_rst.bracketing()
        writebrackets(fbase + '.brackets', pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            gold_rst = RSTTree(fname=fbase + '.dis')
            gold_rst.build()
            # Return value is not used; the call is kept in case
            # bracketing() has side effects on the tree.
            gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
# NOTE(review): the `def` header owning the docstring and the three
# assignments below is not visible in this chunk; they look like the body of
# a constructor taking (S, N, bias). The docstring names L and R while the
# code stores S and N -- confirm which names are current and align them.
""" Parameters for composition

:type L: 2-d numpy.array
:param L: composition matrix for left node

:type R: 2-d numpy.array
:param R: composition matrix for right node

:type bias: 1-d numpy.array
:param bias: composition bias
"""
self.S = S
self.N = N
self.bias = bias


# Script entry point: load pre-computed weights and a parsing model, run
# miniHingeJointTopSGD over every *.edus file under `path`, and print the
# learned N and S parameters.
if __name__ == '__main__':
    D =loadmodel("weights.pickle.gz")
    weights = D["words"]
    # vocab and vocab_no are read here but not used again in this block
    vocab = D["vocab"]
    vocab_no = D["vocabno"]
    pm = ParsingModel()
    pm.loadmodel("../parsing-model.pickle.gz")
    # The first assignment is immediately overwritten; only the
    # "Bigger-set" directory is actually used.
    path = "../../../Movies/edu-input-final/"
    path = "../../../Movies/Bigger-set/"
    files = [os.path.join(path, fname) for fname in os.listdir(path) if fname.endswith('.edus')]
    # param = miniKJointSGD(files,400,sa_dict,iterations=40)
    param = miniHingeJointTopSGD(pm,files,1500,weights,iterations=100)
    print param.N
    print param.S
def generate_summaries(path):
    """ Write a short extractive summary next to every *.edus file in path

    Each ``<name>.edus`` file is parsed with the pickled parsing model,
    ``calc_marcu`` selects the top-scoring EDUs, the text of every selected
    EDU is cleaned up, and the cleaned texts are joined with single spaces
    and written to ``<name>.summary``.

    Side effect: ``edu.text`` of every selected EDU is overwritten with its
    cleaned-up text.

    :type path: string
    :param path: directory that holds the *.edus files
    """
    from os import listdir
    from os.path import join as joinpath
    suffix = '.edus'
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Read all files from the given path
    doclist = [
        joinpath(path, fname) for fname in listdir(path)
        if fname.endswith(suffix)
    ]
    for fedus in doclist:
        pred_rst = parse(pm, fedus=fedus)
        # You can pass in either summary_factor or summary_p
        # summary_p hardcodes it to p sentences
        # summary_factor is a percentage of the edu length
        top_scoring = calc_marcu(pred_rst, summary_p=2)
        # Swap only the trailing extension; str.replace would also rewrite
        # any '.edus' that happens to occur earlier in the path.
        summary_fname = fedus[:-len(suffix)] + '.summary'
        sentences = []
        # Get top scoring and format it appropriately
        for edu in top_scoring:
            text = edu.text.strip()
            tokens = word_tokenize(text)
            # Drop every token tagged VBG together with the token at
            # head_words_indices[idx] - 1.
            # NOTE(review): the "- 1" presumably converts a 1-based head
            # index into a 0-based token position -- confirm.
            vbg_indices = [i for i, tag in enumerate(edu.tags) if tag == 'VBG']
            drop_indices = set(vbg_indices)
            for idx in vbg_indices:
                drop_indices.add(edu.head_words_indices[idx] - 1)
            text = ' '.join([
                tok for i, tok in enumerate(tokens) if i not in drop_indices
            ])
            # Remove initial adverbials. Only the leading occurrence is cut
            # off; later occurrences of the same phrase are left untouched.
            for stop_phrase in adverbial_phr_list:
                with_comma = stop_phrase + ', '
                if text.startswith(with_comma):
                    text = text[len(with_comma):].strip()
                elif text.startswith(stop_phrase):
                    text = text[len(stop_phrase):].strip()
            if not text:
                # Nothing survived the clean-up: skip this EDU instead of
                # failing on text[0] below.
                edu.text = text
                continue
            # Format so capitalization is correct for our new sentence
            text = text[0].upper() + text[1:]
            edu.text = text
            sentences.append(str(text))
        # Form raw sentences for summary from chosen edu's
        summary = ' '.join(sentences).replace('\t', '').strip()
        # The context manager closes the file even if the write fails
        with open(summary_fname, 'w') as fout:
            fout.write(summary)
def train(config):
    """Train the parsing model, periodically evaluating on the dev set.

    Builds one graph/session pair for training and one for dev evaluation,
    runs ``config.num_steps`` training iterations, and finally calls
    ``test(config)``.

    Every ``config.period`` steps the loss, accuracy and gradient summaries
    are written. Every ``config.checkpoint`` steps the train model is saved,
    the checkpoint is restored into the dev model, and the dev model is
    evaluated with ``evaluate_batch``.

    :param config: configuration object. Attributes read here:
        word_emb_file, train_file, dev_file, batch_size, eval_batch_size,
        save_dir, log_dir, init_lr, num_steps, period, checkpoint,
        dev_val_num_sentences.
    """
    # NOTE(review): this function was recovered from whitespace-collapsed
    # text; the nesting of the two `if` blocks inside the loop and the
    # position of the final test(config) call should be confirmed.
    word_mat = np.array(data_helper.load_word_embedding(config.word_emb_file), dtype=np.float32)
    print("Building model...")
    # data_manager = DataManager(config)
    train_graph = tf.Graph()
    dev_graph = tf.Graph()
    parser = data_helper.get_record_parser(config)
    train_dataset = data_helper.get_batch_dataset(config.train_file, parser, config, config.batch_size)
    dev_dataset = data_helper.get_batch_dataset(config.dev_file, parser, config, config.eval_batch_size, is_train=False)
    # initialize train model and dev model separately
    with train_graph.as_default():
        train_iterator_manager = IteratorManager(train_dataset)
        train_model = ParsingModel(config, train_iterator_manager.iterator, word_mat)
        # Created inside the train graph, so it initializes that graph's variables
        initializer = tf.global_variables_initializer()
    with dev_graph.as_default():
        dev_iterator_manager = IteratorManager(dev_dataset)
        dev_model = ParsingModel(config, dev_iterator_manager.iterator, word_mat, is_train=False)
    # Path prefix handed to saver.save(); the concrete checkpoint path it
    # returns is stored in `checkpoint_path` inside the loop.
    checkpoints_path = os.path.join(config.save_dir, "checkpoints")
    # initialize train and dev session
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    train_sess = tf.Session(graph=train_graph, config=sess_config)
    dev_sess = tf.Session(graph=dev_graph, config=sess_config)
    # Only the train session is initialized here; the dev session gets its
    # variables from saver.restore() at each checkpoint.
    train_sess.run(initializer)
    train_iterator_manager.get_string_handle(train_sess)
    dev_iterator_manager.get_string_handle(dev_sess)
    summary_writer = SummaryWriter(config.log_dir)
    # lr_updater is not referenced again after this assign() call
    lr_updater = LearningRateUpdater(patience=3, init_lr=config.init_lr, loss_save=100.0)
    lr_updater.assign(train_sess, train_model)
    # checkpoint_path = tf.train.latest_checkpoint(config.save_dir, latest_filename=None)
    # train_model.saver.restore(train_sess, checkpoint_path)
    for _ in xrange(1, config.num_steps + 1):
        # Read the counter before running the step and add 1.
        # NOTE(review): presumably train_op increments global_step, making
        # this the step number after the update -- confirm.
        global_step = train_sess.run(train_model.global_step) + 1
        # The fetched train_op value is not used; it is fetched so the
        # training op runs.
        loss, accuracy, train_op, grad_summ = train_sess.run(
            [
                train_model.loss, train_model.accuracy, train_model.train_op,
                train_model.grad_summ
            ],
            feed_dict=train_iterator_manager.make_feed_dict())
        if global_step % config.period == 0:
            tf.logging.info("training step: step {} adding loss: {}".format(
                global_step, loss))
            summ = model_summary('model', loss, accuracy)
            summ += [grad_summ]
            summary_writer.write_summaries(summ, global_step)
            summary_writer.flush()
        if global_step % config.checkpoint == 0:
            # lr_updater.setZero(train_sess, train_model)
            tf.logging.info("training step: step {} checking the model".format(
                global_step))
            checkpoint_path = train_model.saver.save(train_sess, checkpoints_path, global_step=global_step)
            # summ = evaluate_batch(train_model, config.val_num_batches, train_sess, "train", train_iterator_manager)
            # summary_writer.write_summaries(summ, global_step)
            # Load the weights just saved by the train model into the dev model
            dev_model.saver.restore(dev_sess, checkpoint_path)
            summ = evaluate_batch(dev_model, config.dev_val_num_sentences, dev_sess, "dev", dev_iterator_manager)
            summary_writer.write_summaries(summ, global_step)
            summary_writer.flush()
    test(config)