Example #2
File: main.py Project: parry2403/CodeRepo
def trainmodel():
    """ Train a parsing model on the prepared data and save it to disk
    """
    import gzip
    from cPickle import load  # on Python 3: from pickle import load
    fvocab = "vocab.pickle.gz"
    fdata = "training-data.pickle.gz"
    D = load(gzip.open(fvocab))
    vocab, labelidxmap = D['vocab'], D['labelidxmap']
    D = load(gzip.open(fdata))
    trnM, trnL = D['matrix'], D['labels']
    idxlabelmap = reversedict(labelidxmap)
    pm = ParsingModel(vocab=vocab, idxlabelmap=idxlabelmap)
    pm.train(trnM, trnL)
    pm.savemodel("parsing-model.pickle.gz")
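Note: reversedict above is assumed to invert the label-to-index mapping so that
predicted indices can be translated back into relation labels. A minimal sketch,
assuming a plain one-to-one dict (the project's own util module provides the real one):

def reversedict(d):
    """ Swap keys and values, e.g. {'elaboration': 0} -> {0: 'elaboration'} """
    return dict((value, key) for (key, value) in d.items())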
Example #3
File: evalparser.py Project: OlafLee/DPLP
def evalparser(path='./examples', report=False, 
               bcvocab=None, draw=True,
               withdp=False, fdpvocab=None, fprojmat=None):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    # ----------------------------------------
    # Load the parsing model
    print 'Load parsing model ...'
    pm = ParsingModel(withdp=withdp,
                      fdpvocab=fdpvocab, fprojmat=fprojmat)
    pm.loadmodel("model/parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span','nuclearity','relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path) if fname.endswith('.merge')]
    for fmerge in doclist:
        # ----------------------------------------
        # Read *.merge file
        dr = DocReader()
        doc = dr.read(fmerge)
        # ----------------------------------------
        # Parsing
        pred_rst = pm.sr_parse(doc, bcvocab)
        if draw:
            strtree = pred_rst.parse()
            drawrst(strtree, fmerge.replace(".merge",".ps"))
        # Get brackets from parsing results
        pred_brackets = pred_rst.bracketing()
        fbrackets = fmerge.replace('.merge', '.brackets')
        # Write brackets into file
        writebrackets(fbrackets, pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            fdis = fmerge.replace('.merge', '.dis')
            gold_rst = RSTTree(fdis, fmerge)
            gold_rst.build()
            gold_brackets = gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
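A typical invocation of the function above, as an illustrative usage sketch
(the file layout and flag values are assumptions, not taken from the project):

evalparser(path='./examples', report=True, draw=False, withdp=False)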
Example #4
def evalparser(path='./examples', report=False):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span','nuclearity','relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path) if fname.endswith('.edus')]
    for fedus in doclist:
        # ----------------------------------------
        # Parsing
        fpos = fedus + ".pos"
        d_pos = get_d_pos(fpos)
        fdep = fedus + ".dep"
        d_dep = get_d_dep(fdep)
        pred_rst = parse(pm, fedus=fedus, d_pos=d_pos, d_dep=d_dep)
        # Get brackets from parsing results
        pred_brackets = pred_rst.bracketing()
        fbrackets = fedus.replace('.edus', '.brackets')
        writebrackets(fbrackets, pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            fdis = fedus.replace('.edus', '.dis')
            gold_rst = RSTTree(fname=fdis)
            gold_rst.build()
            gold_brackets = gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
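get_d_pos and get_d_dep are not shown in this snippet. A minimal sketch of the
POS reader, assuming each line of the .pos file holds a token index and a tag
separated by a tab (the real file format may differ):

def get_d_pos(fpos):
    """ Illustrative only: map token index -> POS tag """
    d_pos = {}
    with open(fpos) as fin:
        for line in fin:
            idx, tag = line.strip().split('\t')
            d_pos[int(idx)] = tag
    return d_pos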
Example #5
File: joint.py Project: parry2403/R2N2
            """ Parameters for composition

            :type L: 2-d numpy.array
            :param L: composition matrix for left node

            :type R: 2-d numpy.array
            :param R: composition matrix for right node

            :type bias: 1-d numpy.array
            :param bias: composition bias
            """
            self.S = S
            self.N = N
            self.bias = bias
if __name__ == '__main__':
    
    D = loadmodel("weights.pickle.gz")

    weights = D["words"]
    vocab = D["vocab"]
    vocab_no = D["vocabno"]
    pm = ParsingModel()
    pm.loadmodel("../parsing-model.pickle.gz")
    path = "../../../Movies/edu-input-final/"
    path = "../../../Movies/Bigger-set/"
    files = [os.path.join(path, fname) for fname in os.listdir(path) if fname.endswith('.edus')]
    # param = miniKJointSGD(files,400,sa_dict,iterations=40)
    param = miniHingeJointTopSGD(pm,files,1500,weights,iterations=100)
    print param.N
    print param.S
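The S and N parameters above are the composition matrices for the left and
right child nodes. One plausible composition step, as an illustrative numpy
sketch (the actual R2N2 formulation may differ):

import numpy as np

def compose(left_vec, right_vec, S, N, bias):
    # Project each child with its own matrix, add the bias, and squash
    return np.tanh(S.dot(left_vec) + N.dot(right_vec) + bias)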
Example #6
def generate_summaries(path):
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")

    # ----------------------------------------
    # Read all files from the given path
    doclist = [
        joinpath(path, fname) for fname in listdir(path)
        if fname.endswith('.edus')
    ]
    for fedus in doclist:
        pred_rst = parse(pm, fedus=fedus)

        # You can pass in either summary_factor or summary_p:
        # summary_p fixes the summary to p sentences, while
        # summary_factor is a percentage of the EDU length
        top_scoring = calc_marcu(pred_rst, summary_p=2)
        summary_fname = fedus.replace('.edus', '.summary')
        s = []

        # Get top scoring and format it appropriately
        for edu in top_scoring:
            edu.text = edu.text.strip()
            str_array = word_tokenize(edu.text)

            # Remove the participial phrase: find each VBG token and drop it
            # together with its head word in the dependency graph
            pp_indices = [i for i, x in enumerate(edu.tags) if x == 'VBG']
            pp_phrase_indices = set(pp_indices)
            for idx in pp_indices:
                # head_words_indices is 1-based, hence the -1
                pp_phrase_indices.add(edu.head_words_indices[idx] - 1)

            # Remove the PP-phrase tokens from the sentence
            new_str_array = [
                v for i, v in enumerate(str_array)
                if i not in pp_phrase_indices
            ]
            edu.text = ' '.join(new_str_array)

            # Remove a sentence-initial adverbial phrase, if present
            for stop_phrase in adverbial_phr_list:
                if edu.text.startswith(stop_phrase + ', '):
                    edu.text = edu.text[len(stop_phrase) + 2:].strip()
                elif edu.text.startswith(stop_phrase):
                    edu.text = edu.text[len(stop_phrase):].strip()

            # Capitalize the first character of the rebuilt sentence
            edu.text = edu.text[0].upper() + edu.text[1:]
            s.append(edu.text)

        # Join the chosen EDUs into the raw summary text
        s = ' '.join(s).replace('\t', '').strip()

        # Now do simplification step

        with open(summary_fname, 'w') as f:
            f.write(s)
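Per the comment above, calc_marcu can alternatively be driven by
summary_factor. An illustrative call (the exact scaling is defined inside
calc_marcu itself):

top_scoring = calc_marcu(pred_rst, summary_factor=0.3)  # keep ~30% of the EDUs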
Example #7
def train(config):
    word_mat = np.array(data_helper.load_word_embedding(config.word_emb_file),
                        dtype=np.float32)

    print("Building model...")
    # data_manager = DataManager(config)

    train_graph = tf.Graph()
    dev_graph = tf.Graph()

    parser = data_helper.get_record_parser(config)
    train_dataset = data_helper.get_batch_dataset(config.train_file, parser,
                                                  config, config.batch_size)
    dev_dataset = data_helper.get_batch_dataset(config.dev_file,
                                                parser,
                                                config,
                                                config.eval_batch_size,
                                                is_train=False)

    # initialize train model and dev model separately
    with train_graph.as_default():
        train_iterator_manager = IteratorManager(train_dataset)
        train_model = ParsingModel(config, train_iterator_manager.iterator,
                                   word_mat)
        initializer = tf.global_variables_initializer()

    with dev_graph.as_default():
        dev_iterator_manager = IteratorManager(dev_dataset)
        dev_model = ParsingModel(config,
                                 dev_iterator_manager.iterator,
                                 word_mat,
                                 is_train=False)

    checkpoints_path = os.path.join(config.save_dir, "checkpoints")

    # initialize train and dev session
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    train_sess = tf.Session(graph=train_graph, config=sess_config)
    dev_sess = tf.Session(graph=dev_graph, config=sess_config)

    train_sess.run(initializer)
    train_iterator_manager.get_string_handle(train_sess)
    dev_iterator_manager.get_string_handle(dev_sess)

    summary_writer = SummaryWriter(config.log_dir)

    lr_updater = LearningRateUpdater(patience=3,
                                     init_lr=config.init_lr,
                                     loss_save=100.0)
    lr_updater.assign(train_sess, train_model)

    # checkpoint_path = tf.train.latest_checkpoint(config.save_dir, latest_filename=None)
    # train_model.saver.restore(train_sess, checkpoint_path)

    for _ in xrange(1, config.num_steps + 1):

        global_step = train_sess.run(train_model.global_step) + 1

        loss, accuracy, train_op, grad_summ = train_sess.run(
            [
                train_model.loss, train_model.accuracy, train_model.train_op,
                train_model.grad_summ
            ],
            feed_dict=train_iterator_manager.make_feed_dict())

        if global_step % config.period == 0:
            tf.logging.info("training step: step {} adding loss: {}".format(
                global_step, loss))
            summ = model_summary('model', loss, accuracy)
            summ += [grad_summ]
            summary_writer.write_summaries(summ, global_step)
            summary_writer.flush()

        if global_step % config.checkpoint == 0:
            # lr_updater.setZero(train_sess, train_model)
            tf.logging.info("training step: step {} checking the model".format(
                global_step))
            checkpoint_path = train_model.saver.save(train_sess,
                                                     checkpoints_path,
                                                     global_step=global_step)

            # summ = evaluate_batch(train_model, config.val_num_batches, train_sess, "train", train_iterator_manager)
            # summary_writer.write_summaries(summ, global_step)

            dev_model.saver.restore(dev_sess, checkpoint_path)
            summ = evaluate_batch(dev_model, config.dev_val_num_sentences,
                                  dev_sess, "dev", dev_iterator_manager)
            summary_writer.write_summaries(summ, global_step)

            summary_writer.flush()
    test(config)
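LearningRateUpdater is not defined in this snippet. A minimal sketch of a
patience-based updater consistent with how it is used above; the model-side
attribute (model.lr) is an assumption, not the project's actual API:

import tensorflow as tf

class LearningRateUpdater(object):
    """ Illustrative only: halve the learning rate when the tracked loss
        has not improved for `patience` consecutive checks. """
    def __init__(self, patience, init_lr, loss_save):
        self.patience = patience
        self.lr = init_lr
        self.best_loss = loss_save
        self.bad_checks = 0

    def update(self, sess, model, loss):
        if loss < self.best_loss:
            self.best_loss = loss
            self.bad_checks = 0
        else:
            self.bad_checks += 1
            if self.bad_checks >= self.patience:
                self.lr *= 0.5
                self.bad_checks = 0
                self.assign(sess, model)

    def assign(self, sess, model):
        # model.lr is assumed to be a tf.Variable holding the learning rate
        sess.run(tf.assign(model.lr, self.lr))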