Example #1
    def __init__(self, **kwargs):
        super(SummarizationBaseline, self).__init__(**kwargs)
        self.spot_field = EVENT_SPOT_FIELD

        self.evaluator = SalienceEva(**kwargs)

        lang = 'english'
        stemmer = Stemmer(lang)
        self.summarizer = Summarizer(stemmer)
        self.summarizer.stop_words = get_stop_words(lang)

        self.h_event_id = pickle.load(open(self.event_id_pickle_in))
Example #2
    def evaluate_normal(self, docs, f_predict):
        print("Evaluating predictions [%s] from [%s]." % (f_predict, docs))
        evaluator = SalienceEva()  # evaluator with default values.

        h_e_total_eva = dict()
        e_p = 0
        p = 0
        skip = 0

        for res in self.load_pairs(docs, f_predict):
            p += 1
            if not res:
                # Skip lines that could not be loaded as a prediction pair.
                skip += 1
                continue

            predictions, s_e_label, s_evm_label = res

            l_e_pack = self.get_e_labels(predictions, s_e_label)

            if l_e_pack:
                e_p += 1
                h_e = evaluator.evaluate(l_e_pack[0], l_e_pack[1])
                h_e_total_eva = add_svm_feature(h_e_total_eva, h_e)

            if not e_p == 0:
                h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
                sys.stdout.write('\rEvaluated %d files, %d with entities,'
                                 ' %d lines skipped. P@1: %s.' %
                                 (p, e_p, skip, h_e_mean_eva['p@01']))

        print('')

        h_e_mean_eva = {}
        if not e_p == 0:
            h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
            logging.info('finished predicted [%d] docs on entity, eva %s', e_p,
                         json.dumps(h_e_mean_eva))

        res = {'entity': h_e_mean_eva}

        with open(f_predict + '.entity.eval', 'w') as out:
            json.dump(res, out, indent=1)
Example #3
    def __init__(self, **kwargs):
        super(FeatureBasedBaseline, self).__init__(**kwargs)
        if self.event_model:
            self.spot_field = 'event'

        self.io = EventDataIO(**kwargs)

        self.evaluator = SalienceEva(**kwargs)
        self.feature_names_split = self.feature_names.split(",")
        self.feature_dim = len(self.feature_names_split)

        reverse_f = set(self.reverse_feature.split(","))

        # Mask identifying which features should be ranked in reverse order.
        self.reverse_dim = []
        for i, n in enumerate(self.feature_names_split):
            self.reverse_dim.append(n in reverse_f)

        if self.feature_dim == 0:
            logging.error("You must provide feature names.")
        else:
            logging.info("Number of features to check: %d" % self.feature_dim)
Example #4
    def __init__(self, **kwargs):
        super(SalienceModelCenter, self).__init__(**kwargs)
        self.para = NNPara(**kwargs)
        self.ext_data = ExtData(**kwargs)
        self.ext_data.assert_with_para(self.para)
        self._setup_io(**kwargs)
        h_loss = {
            "hinge": hinge_loss,  # hinge classification loss does not work
            "pairwise": pairwise_loss,
        }
        self.criterion = h_loss[self.loss_func]
        self.class_weight = torch.cuda.FloatTensor(self.l_class_weights)

        # if self.event_model and self.joint_model:
        #     logging.error("Please specify one mode only.")
        #     exit(1)

        self.evaluator = SalienceEva(**kwargs)
        self._init_model()

        self.patient_cnt = 0
        self.best_valid_loss = 0
        self.ll_valid_line = []
Example #5
    def compare(self, docs, f_predict_1, f_predict_2,
                entity_vocab_size, content_field='bodyText'):
        print("Comparing predictions [%s] from [%s]." % (
            f_predict_1, f_predict_2))
        evaluator = SalienceEva()  # evaluator with default values.

        p = 0

        for res in self.load_pairs(docs, f_predict_1, f_predict_2,
                                   content_field, entity_vocab_size):
            p += 1

            doc, (l_e_pack1, l_evm_pack1), (l_e_pack2, l_evm_pack2) = res
            words, entities, events, adjacent = self.get_targets(doc)

            print('Comparing doc %s' % (doc['docno']))

            print('Words are:')
            print(' '.join(words))
            print('Events are:')
            print(', '.join(events))

            if l_evm_pack1 and l_evm_pack2:
                print('comparing event ranking.')
                # h_evm1 = evaluator.evaluate(l_evm_pack1[0], l_evm_pack1[1])
                # h_evm2 = evaluator.evaluate(l_evm_pack2[0], l_evm_pack2[1])
                self.compare_ranking(l_evm_pack1, l_evm_pack2, self.h_event)
                print('showing adjacent list.')
                print([item for item in zip(events, adjacent) if item[1]])

            if l_e_pack1 and l_e_pack2:
                print('comparing entity ranking.')
                # h_e1 = evaluator.evaluate(l_e_pack1[0], l_e_pack1[1])
                # h_e2 = evaluator.evaluate(l_e_pack2[0], l_e_pack2[1])
                self.compare_ranking(l_e_pack1, l_e_pack2, self.h_entity)

            if l_e_pack1 and l_evm_pack1:
                print('Showing graph.')
                evm_adj, e_adj = self.show_graph(l_e_pack1, l_evm_pack1,
                                                 adjacent)
                print('Salient event adjacent:')
                print(evm_adj)
                print('Salient entity adjacent:')
                print(e_adj)

            sys.stdin.readline()
Example #6
class SummarizationBaseline(Configurable):
    # A specific field is reserved to mark the salience answer.
    salience_gold = Unicode(salience_gold)

    corpus_in = Unicode(help='input in text version').tag(config=True)
    test_out = Unicode(help='output').tag(config=True)

    event_id_pickle_in = Unicode(help='pickle of event id').tag(config=True)

    def __init__(self, **kwargs):
        super(SummarizationBaseline, self).__init__(**kwargs)
        self.spot_field = EVENT_SPOT_FIELD

        self.evaluator = SalienceEva(**kwargs)

        lang = 'english'
        stemmer = Stemmer(lang)
        self.summarizer = Summarizer(stemmer)
        self.summarizer.stop_words = get_stop_words(lang)

        self.h_event_id = pickle.load(open(self.event_id_pickle_in))

    def get_event_head(self, event_info):
        for f in event_info['feature']['sparseFeatureArray']:
            if f.startswith('LexicalHead_'):
                return f.split('_')[1]

    def is_empty(self, data):
        l_s = data[self.spot_field].get(body_field, [])
        return not l_s

    def process(self):
        h_total_eva = {}
        with gzip.open(self.corpus_in) as test_in, \
                open(self.test_out, 'w') as out:
            p = 0

            for line in test_in:
                data = json.loads(line)
                if self.is_empty(data):
                    continue

                p += 1

                word2eid = defaultdict(list)

                labels = []
                l_e = []

                for index, event in enumerate(
                        data[self.spot_field][body_field]):
                    word2eid[event['surface']].append(index)
                    labels.append(event['salience'])
                    event_id = self.h_event_id.get(self.get_event_head(event),
                                                   0)
                    l_e.append(event_id)

                text = data[body_field]
                parser = PlaintextParser.from_string(text,
                                                     Tokenizer('english'))

                predicted = {}

                rank = 1
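                # Rank event surface words by the order in which they first
                # appear in the top-10 summary sentences.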
                for sentence in self.summarizer(parser.document, 10):
                    for word in sentence.words:
                        if word in word2eid:
                            eids = word2eid[word]
                            if word not in predicted:
                                predicted[word] = (eids, rank)
                                rank += 1

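                # Score each event by the reciprocal rank of its surface word;
                # events never selected by the summarizer keep a score of 0.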
                prediction = [0] * len(labels)
                for w, (eids, rank) in predicted.items():
                    for eid in eids:
                        prediction[eid] = 1.0 / rank

                eva = self.evaluator.evaluate(prediction, labels)

                h_out = {
                    'docno': data['docno'],
                    body_field: {
                        'predict': zip(l_e, prediction),
                    },
                    'eval': eva,
                }

                h_total_eva = add_svm_feature(h_total_eva, eva)
                h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)

                print >> out, json.dumps(h_out)

                if not p % 1000:
                    logging.info('predicted [%d] docs, eva %s', p,
                                 json.dumps(h_mean_eva))
Example #7
    def evaluate_json_joint(self, docs, f_predict):
        print("Evaluating joint predictions [%s] from [%s]." %
              (f_predict, docs))

        evaluator = SalienceEva()  # evaluator with default values.

        h_e_total_eva = dict()
        h_e_mean_eva = dict()

        h_evm_total_eva = dict()
        h_evm_mean_eva = dict()

        h_all_total_eva = dict()
        h_all_mean_eva = dict()

        e_p = 0
        evm_p = 0
        all_p = 0
        p = 0

        for res in self.load_pairs(docs, f_predict):
            p += 1

            if not res:
                continue

            predictions, s_e_label, s_evm_label = res

            l_e_pack = self.get_e_labels(predictions, s_e_label)
            l_evm_pack = self.get_evm_labels(predictions, s_evm_label)
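            # Transpose, concatenate the entity and event rows, then transpose
            # back to get combined (score, label, ...) columns (Python 2 zip
            # returns lists).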
            all_pack = zip(*zip(*l_e_pack) + zip(*l_evm_pack))

            if l_e_pack:
                h_e = evaluator.evaluate(l_e_pack[0], l_e_pack[1])
                e_p += 1
                h_e_total_eva = add_svm_feature(h_e_total_eva, h_e)

            if l_evm_pack:
                h_evm = evaluator.evaluate(l_evm_pack[0], l_evm_pack[1])
                evm_p += 1
                h_evm_total_eva = add_svm_feature(h_evm_total_eva, h_evm)

            if all_pack:
                h_all = evaluator.evaluate(all_pack[0], all_pack[1])
                all_p += 1
                h_all_total_eva = add_svm_feature(h_all_total_eva, h_all)

            if not e_p == 0:
                h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
            if not evm_p == 0:
                h_evm_mean_eva = mutiply_svm_feature(h_evm_total_eva,
                                                     1.0 / evm_p)
            if not all_p == 0:
                h_all_mean_eva = mutiply_svm_feature(h_all_total_eva,
                                                     1.0 / all_p)

            ep1 = '%.4f' % h_e_mean_eva[
                'p@01'] if 'p@01' in h_e_mean_eva else 'N/A'
            evmp1 = '%.4f' % h_evm_mean_eva[
                'p@01'] if 'p@01' in h_evm_mean_eva else 'N/A'
            all1 = '%.4f' % h_all_mean_eva[
                'p@01'] if 'p@01' in h_all_mean_eva else 'N/A'

            sys.stdout.write('\rEvaluated %d files, %d with entities and %d '
                             'with events, En P@1: %s, Evm P@1: %s, '
                             'All P@1: %s.' %
                             (p, e_p, evm_p, ep1, evmp1, all1))

        print('')

        h_e_mean_eva = {}
        if not e_p == 0:
            h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
            logging.info('finished predicted [%d] docs on entity, eva %s', e_p,
                         json.dumps(h_e_mean_eva))

        h_evm_mean_eva = {}
        if not evm_p == 0:
            h_evm_mean_eva = mutiply_svm_feature(h_evm_total_eva, 1.0 / evm_p)
            logging.info('finished predicted [%d] docs on event, eva %s',
                         evm_p, json.dumps(h_evm_mean_eva))

        logging.info("Results to copy:")
        line1 = ["p@01", "p@05", "p@10", "p@20", "auc"]
        line2 = ["r@01", "r@05", "r@10", "r@20"]

        line1_evm_scores = ["%.4f" % h_evm_mean_eva[k] for k in line1]
        line1_ent_scores = ["%.4f" % h_e_mean_eva[k] for k in line1]
        line1_all_scores = ["%.4f" % h_all_mean_eva[k] for k in line1]

        line2_evm_scores = ["%.4f" % h_evm_mean_eva[k] for k in line2]
        line2_ent_scores = ["%.4f" % h_e_mean_eva[k] for k in line2]
        line2_all_scores = ["%.4f" % h_all_mean_eva[k] for k in line2]

        print "\t-\t".join(line1_evm_scores) + "\t-\t-\t" + \
              "\t".join(line1_all_scores) + "\t-\t" + \
              "\t".join(line1_ent_scores)

        print "\t-\t".join(line2_evm_scores) + "\t-\t-\t-\t-\t" + \
              "\t".join(line2_all_scores) + "\t-\t-\t" + \
              "\t".join(line2_ent_scores)

        res = {'entity': h_e_mean_eva, 'event': h_evm_mean_eva}

        with open(f_predict + '.joint.eval', 'w') as out:
            json.dump(res, out, indent=1)
Example #8
class FeatureBasedBaseline(Configurable):
    event_model = Bool(False, help='Run event model').tag(config=True)
    feature_names = Unicode(
        "", help="Comma seperated name of features").tag(config=True)
    reverse_feature = Unicode("",
                              help="List of features that should be "
                              "ranked in reverse order").tag(config=True)
    corpus_in = Unicode(help='input').tag(config=True)
    test_out = Unicode(help='output').tag(config=True)

    in_field = Unicode(body_field)
    salience_field = Unicode(abstract_field)
    spot_field = Unicode('spot')

    # A specific field is reserved to mark the salience answer.
    salience_gold = Unicode(salience_gold)

    def __init__(self, **kwargs):
        super(FeatureBasedBaseline, self).__init__(**kwargs)
        if self.event_model:
            self.spot_field = 'event'

        self.io = EventDataIO(**kwargs)

        self.evaluator = SalienceEva(**kwargs)
        self.feature_names_split = self.feature_names.split(",")
        self.feature_dim = len(self.feature_names_split)

        reverse_f = set(self.reverse_feature.split(","))

        # Mask identifying which features should be ranked in reverse order.
        self.reverse_dim = []
        for i, n in enumerate(self.feature_names_split):
            self.reverse_dim.append(n in reverse_f)

        if self.feature_dim == 0:
            logging.error("You must provide feature names.")
        else:
            logging.info("Number of features to check: %d" % self.feature_dim)

    def eval_per_dim(self, h_packed_data, m_label, reverse_dim, key_name,
                     docno):
        if use_cuda:
            feature_data = h_packed_data['ts_feature'].data.cpu()
            label_data = m_label.data.cpu()
        else:
            feature_data = h_packed_data['ts_feature'].data
            label_data = m_label.data

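        # The batch dimension is 1 (one document per call), so squeeze it away.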
        features = np.squeeze(feature_data.numpy(), axis=0)
        labels = np.squeeze(label_data.numpy(), axis=0).tolist()

        num_features = features.shape[1]
        l_h_out = [dict() for _ in range(num_features)]

        for f_dim in range(num_features):
            values = features[:, f_dim].tolist()
            l_h_out[f_dim][key_name] = docno
            mtx_e = h_packed_data['mtx_e']

            if use_cuda:
                l_e = mtx_e[0].cpu().data.numpy().tolist()
            else:
                l_e = mtx_e[0].data.numpy().tolist()

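            # Negate features that should be ranked in reverse order so that a
            # larger value still means more salient.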
            if reverse_dim[f_dim]:
                values = [0 - v for v in values]
            l_h_out[f_dim][body_field] = {'predict': zip(l_e, values)}
            l_h_out[f_dim]['eval'] = self.evaluator.evaluate(values, labels)

        return l_h_out

    def process(self):
        open_func = gzip.open if self.corpus_in.endswith("gz") else open

        outs = []
        for name in self.feature_names_split:
            out_path = self.test_out + "_" + name.replace(" ", "_") + '.json'
            outs.append(open(out_path, 'w'))
            logging.info("Feature output will be stored at [%s]" % out_path)

        with open_func(self.corpus_in) as in_f:
            l_h_total_eva = [{} for _ in range(self.feature_dim)]
            p = 0
            for line in in_f:
                if self.io.is_empty_line(line):
                    continue

                # Instead of providing batch, we just give one by one.
                h_packed_data, m_label = self.io.parse_data([line])

                h_info = json.loads(line)
                key_name = 'docno'
                docno = h_info[key_name]

                p += 1
                l_h_out = self.eval_per_dim(h_packed_data, m_label,
                                            self.reverse_dim, key_name, docno)

                for (dim, h_out), out in zip(enumerate(l_h_out), outs):
                    h_this_eva = h_out['eval']
                    l_h_total_eva[dim] = add_svm_feature(
                        l_h_total_eva[dim], h_this_eva)
                    h_mean_eva = mutiply_svm_feature(l_h_total_eva[dim],
                                                     1.0 / p)

                    print >> out, json.dumps(h_out)

                    if not p % 1000:
                        logging.info('predicted [%d] docs, eva %s for [%s]', p,
                                     json.dumps(h_mean_eva),
                                     self.feature_names_split[dim])

            for dim, h_total_eva in enumerate(l_h_total_eva):
                h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
                logging.info('finished predicted [%d] docs, eva %s for [%s]',
                             p, json.dumps(h_mean_eva),
                             self.feature_names_split[dim])

        for (dim, h_total_eva), name in zip(enumerate(l_h_total_eva),
                                            self.feature_names_split):
            h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
            l_mean_eva = sorted(h_mean_eva.items(), key=lambda item: item[0])

            logging.info('finished predicted [%d] docs, eva %s', p,
                         json.dumps(l_mean_eva))

            with open(self.test_out + "_" + name.replace(" ", "_") + '.eval',
                      'w') as o:
                json.dump(l_mean_eva, o, indent=1)

        for out in outs:
            out.close()
Example #9
    def split_and_eval(self, docs, f_predict):
        print("Split and evaluating joint predictions [%s]." % f_predict)

        evaluator = SalienceEva()  # evaluator with default values.

        h_e_total_eva = dict()
        h_e_mean_eva = dict()

        h_evm_total_eva = dict()
        h_evm_mean_eva = dict()

        e_p = 0
        evm_p = 0
        p = 0

        with open(f_predict + '.entity.json', 'w') as entity_out, \
                open(f_predict + '.event.json', 'w') as event_out:
            for res in self.load_pairs(docs, f_predict):
                p += 1

                if not res:
                    continue

                doc, predictions, s_e_label, s_evm_label = res

                l_e_pack = self.get_e_labels(predictions, s_e_label)
                l_evm_pack = self.get_evm_labels(predictions, s_evm_label)

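                # Per-document prediction records, written separately to the
                # entity and event output files.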
                pred_event = {'bodyText': {}}
                pred_entity = {'bodyText': {}}

                if l_e_pack:
                    h_e = evaluator.evaluate(l_e_pack[0], l_e_pack[1])
                    e_p += 1
                    h_e_total_eva = add_svm_feature(h_e_total_eva, h_e)

                    pred_entity['bodyText']['predict'] = [[
                        eid, score
                    ] for eid, score in zip(l_e_pack[2], l_e_pack[0])]
                    pred_entity['docno'] = doc['docno']
                    pred_entity['eval'] = h_e

                    entity_out.write(json.dumps(pred_entity))
                    entity_out.write('\n')

                if l_evm_pack:
                    h_evm = evaluator.evaluate(l_evm_pack[0], l_evm_pack[1])
                    evm_p += 1
                    h_evm_total_eva = add_svm_feature(h_evm_total_eva, h_evm)

                    pred_event['bodyText']['predict'] = [[
                        eid, score
                    ] for eid, score in zip(l_evm_pack[2], l_evm_pack[0])]
                    pred_event['docno'] = doc['docno']
                    pred_event['eval'] = h_evm

                    event_out.write(json.dumps(pred_event))
                    event_out.write('\n')

                if not e_p == 0:
                    h_e_mean_eva = mutiply_svm_feature(h_e_total_eva,
                                                       1.0 / e_p)
                if not evm_p == 0:
                    h_evm_mean_eva = mutiply_svm_feature(
                        h_evm_total_eva, 1.0 / evm_p)

                ep1 = '%.4f' % h_e_mean_eva[
                    'p@01'] if 'p@01' in h_e_mean_eva else 'N/A'
                evmp1 = '%.4f' % h_evm_mean_eva[
                    'p@01'] if 'p@01' in h_evm_mean_eva else 'N/A'

                sys.stdout.write(
                    '\rEvaluated %d files, %d with entities and %d '
                    'with events, En P@1: %s, Evm P@1: %s, ' %
                    (p, e_p, evm_p, ep1, evmp1))

            print('')

        h_e_mean_eva = {}
        if not e_p == 0:
            h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
            logging.info('finished predicted [%d] docs on entity, eva %s', e_p,
                         json.dumps(h_e_mean_eva))

        h_evm_mean_eva = {}
        if not evm_p == 0:
            h_evm_mean_eva = mutiply_svm_feature(h_evm_total_eva, 1.0 / evm_p)
            logging.info('finished predicted [%d] docs on event, eva %s',
                         evm_p, json.dumps(h_evm_mean_eva))

        with open(f_predict + '.entity.eval', 'w') as out:
            json.dump([[k, v] for k, v in h_e_mean_eva.items()], out, indent=1)

        with open(f_predict + '.event.eval', 'w') as out:
            json.dump([[k, v] for k, v in h_evm_mean_eva.items()],
                      out,
                      indent=1)
Example #10
class SalienceModelCenter(Configurable):
    learning_rate = Float(1e-3, help='learning rate').tag(config=True)
    model_name = Unicode(help="model name: trans").tag(config=True)
    nb_epochs = Int(2, help='nb of epochs').tag(config=True)
    l_class_weights = List(Float, default_value=[1, 10]).tag(config=True)
    batch_size = Int(128,
                     help='number of documents per batch').tag(config=True)
    loss_func = Unicode(
        'hinge', help='loss function to use: hinge, pairwise').tag(config=True)
    early_stopping_patient = Int(
        5, help='epochs before early stopping').tag(config=True)
    early_stopping_frequency = Int(
        100000000,
        help='the nb of data points to check dev loss').tag(config=True)
    max_e_per_doc = Int(200, help='max e per doc')

    # The following 3 configs should be deprecated with the old io.
    # event_model = Bool(False, help='Run event model').tag(config=True)
    joint_model = Bool(False, help='Run joint model').tag(config=True)
    # input_format = Unicode(help='overwrite input format: raw | featured').tag(
    #     config=True)
    # The above 3 configs should be deprecated with the old io.

    use_new_io = Bool(True,
                      help='whether to use the new IO format').tag(config=True)
    predict_with_intermediate_res = Bool(
        False, help='whether to keep intermediate results').tag(config=True)

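    # Registered model classes, keyed by model_name.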
    h_model = {
        'frequency': FrequencySalience,
        'feature_lr': FeatureLR,
        "trans": EmbPageRank,
        'knrm': KNRM,
        'linear_kcrf': LinearKernelCRF,
        'gloss_cnn': GlossCNNKNRM,
        'nlss_cnn': NlssCnnKnrm,
        'duet_knrm': DuetKNRM,
        'duet_gloss': DuetGlossCNN,
        'gloss_enriched_duet': GlossCNNEmbDuet,
        'adj_knrm': AdjKNRM,
        'kcrf_event_average': AverageEventKernelCRF,
        'kcrf_args_average': AverageArgumentKernelCRF,
        "avg_local_vote": LocalAvgWordVotes,  # not working
        'local_rnn': LocalRNNVotes,  # not working
        'local_max_rnn': LocalRNNMaxSim,  # not working
        'EdgeCNN': EdgeCNN,  # not working
        'lr': EmbeddingLR,  # not working
        'kcrf': KernelCRF,  # not working
    }

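    # Legacy per-model IO parsers, used only when use_new_io is False.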
    h_model_io = {
        'frequency': raw_io,
        'feature_lr': feature_io,
        'knrm': raw_io,
        'linear_kcrf': feature_io,
        'gloss_cnn': raw_io,
        'nlss_cnn': raw_io,
        'word_knrm': duet_io,
        'duet_knrm': duet_io,
        'duet_gloss': duet_io,
        'gloss_enriched_duet': duet_io,
        'adj_knrm': adj_edge_io,
        "avg_local_vote": uw_io,  # not working
        'local_rnn': uw_io,  # not working
        'local_max_rnn': uw_io,  # not working
        'lr': feature_io,  # not working
    }

    # in_field = Unicode(body_field)
    spot_field = Unicode('spot')
    event_spot_field = Unicode('event')
    abstract_field = Unicode('abstract')
    # A specific field is reserved to mark the salience answer.
    salience_gold = Unicode(salience_gold)

    def __init__(self, **kwargs):
        super(SalienceModelCenter, self).__init__(**kwargs)
        self.para = NNPara(**kwargs)
        self.ext_data = ExtData(**kwargs)
        self.ext_data.assert_with_para(self.para)
        self._setup_io(**kwargs)
        h_loss = {
            "hinge": hinge_loss,  # hinge classification loss does not work
            "pairwise": pairwise_loss,
        }
        self.criterion = h_loss[self.loss_func]
        self.class_weight = torch.cuda.FloatTensor(self.l_class_weights)

        # if self.event_model and self.joint_model:
        #     logging.error("Please specify one mode only.")
        #     exit(1)

        self.evaluator = SalienceEva(**kwargs)
        self._init_model()

        self.patient_cnt = 0
        self.best_valid_loss = 0
        self.ll_valid_line = []

    def _setup_io(self, **kwargs):
        self.io_parser = DataIO(**kwargs)

    @classmethod
    def class_print_help(cls, inst=None):
        super(SalienceModelCenter, cls).class_print_help(inst)
        NNPara.class_print_help(inst)
        ExtData.class_print_help(inst)
        SalienceEva.class_print_help(inst)
        DataIO.class_print_help(inst)

    def _init_model(self):
        if self.model_name:
            if self.joint_model:
                self._merge_para()
            self.model = self.h_model[self.model_name](self.para,
                                                       self.ext_data)
            logging.info('use model [%s]', self.model_name)

    def _merge_para(self):
        """
        Merge the parameters of the entity and event embeddings, including
        the vocabulary sizes.
        :return:
        """
        self.ext_data.entity_emb = np.concatenate(
            (self.ext_data.entity_emb, self.ext_data.event_emb))
        self.para.entity_vocab_size = self.para.entity_vocab_size + \
                                      self.para.event_vocab_size

        assert self.para.node_feature_dim == self.io_parser.e_feature_dim + \
               self.io_parser.evm_feature_dim

        logging.info("Embedding matrix merged into shape [%d,%d]" %
                     (self.ext_data.entity_emb.shape[0],
                      self.ext_data.entity_emb.shape[1]))

    def train(self,
              train_in_name,
              validation_in_name=None,
              model_out_name=None):
        """
        Train on the given data, grouping batch_size docs per mini-batch.
        :param train_in_name: training data
        :param validation_in_name: validation data
        :param model_out_name: name to dump the model
        :return: keep the model
        """
        logging.info('training with data in [%s]', train_in_name)
        self.model.train()

        if not model_out_name:
            model_out_name = train_in_name + '.model_%s' % self.model_name

        logging.info('Model out name is [%s]', model_out_name)

        model_dir = os.path.dirname(model_out_name)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        if validation_in_name:
            self._init_early_stopper(validation_in_name)

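        # Only parameters that require gradients are passed to the optimizer.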
        optimizer = torch.optim.Adam(filter(
            lambda model_para: model_para.requires_grad,
            self.model.parameters()),
                                     lr=self.learning_rate)
        l_epoch_loss = []
        for epoch in xrange(self.nb_epochs):
            self._epoch_start()

            p = 0
            total_loss = 0
            data_cnt = 0
            logging.info('start epoch [%d]', epoch)
            l_this_batch_line = []
            es_cnt = 0
            es_flag = False
            for line in open(train_in_name):
                if self.io_parser.is_empty_line(line):
                    continue
                data_cnt += 1
                es_cnt += 1
                l_this_batch_line.append(line)
                if len(l_this_batch_line) >= self.batch_size:
                    this_loss = self._batch_train(l_this_batch_line,
                                                  self.criterion, optimizer)
                    p += 1
                    total_loss += this_loss
                    logging.debug('[%d] batch [%f] loss', p, this_loss)
                    assert not math.isnan(this_loss)
                    if not p % 100:
                        logging.info('batch [%d] [%d] data, average loss [%f]',
                                     p, data_cnt, total_loss / p)
                        self._train_info()
                    l_this_batch_line = []
                    if es_cnt >= self.early_stopping_frequency:
                        logging.info(
                            'checking dev loss at [%d]-[%d] vs frequency [%d]',
                            epoch, es_cnt, self.early_stopping_frequency)
                        es_cnt = 0
                        if validation_in_name:
                            self.model.eval()
                            if self._early_stop(model_out_name):
                                logging.info(
                                    'early stopped at [%d] epoch [%d] data',
                                    epoch, data_cnt)
                                es_flag = True
                                break
                            self.model.train()
            if es_flag:
                break

            if l_this_batch_line:
                this_loss = self._batch_train(l_this_batch_line,
                                              self.criterion, optimizer)
                p += 1
                total_loss += this_loss
                logging.debug('[%d] batch [%f] loss', p, this_loss)
                assert not math.isnan(this_loss)
                l_this_batch_line = []

            logging.info(
                'epoch [%d] finished with loss [%f] on [%d] batch [%d] doc',
                epoch, total_loss / p, p, data_cnt)
            l_epoch_loss.append(total_loss / p)

            self._train_info()

            # validation
            if validation_in_name:
                self.model.eval()
                if self._early_stop(model_out_name):
                    logging.info('early stopped at [%d] epoch', epoch)
                    break
                self.model.train()

        logging.info('[%d] epoch done with loss %s', self.nb_epochs,
                     json.dumps(l_epoch_loss))

        if model_out_name:
            # self.model.save_model(model_out_name)
            logging.info('Torch saving model to [%s]', model_out_name)
            torch.save(self.model, model_out_name)
        return

    def _train_info(self):
        pass

    def _epoch_start(self):
        pass

    def _epoch_end(self):
        pass

    def _init_early_stopper(self, validation_in_name):
        self.patient_cnt = 0
        self.best_valid_loss = None
        self.ll_valid_line = []
        logging.info('loading validation data from [%s]', validation_in_name)
        l_valid_lines = [
            l for l in open(validation_in_name).read().splitlines()
            if not self.io_parser.is_empty_line(l)
        ]
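        # Group the validation lines into batches of batch_size.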
        self.ll_valid_line = [
            l_valid_lines[i:i + self.batch_size]
            for i in xrange(0, len(l_valid_lines), self.batch_size)
        ]
        logging.info('validation with [%d] doc', len(l_valid_lines))
        self.best_valid_loss = sum([
            self._batch_test(l_one_batch) for l_one_batch in self.ll_valid_line
        ]) / float(len(self.ll_valid_line))
        logging.info('initial validation loss [%.4f]', self.best_valid_loss)

    def _early_stop(self, model_out_name):
        this_valid_loss = sum([
            self._batch_test(l_one_batch) for l_one_batch in self.ll_valid_line
        ]) / float(len(self.ll_valid_line))
        logging.info('valid loss [%f]', this_valid_loss)
        if self.best_valid_loss is None:
            self.best_valid_loss = this_valid_loss
            logging.info('init valid loss with [%f]', this_valid_loss)
            if model_out_name:
                logging.info('save init model to [%s]', model_out_name)
                torch.save(self.model, model_out_name)
                logging.info('model kept')
        elif this_valid_loss > self.best_valid_loss:
            self.patient_cnt += 1
            logging.info('valid loss increased [%.4f -> %.4f][%d]',
                         self.best_valid_loss, this_valid_loss,
                         self.patient_cnt)
            if self.patient_cnt >= self.early_stopping_patient:
                logging.info('early stopped after patient [%d]',
                             self.patient_cnt)
                logging.info('loading best model [%s] with loss [%f]',
                             model_out_name, self.best_valid_loss)
                self.model = torch.load(model_out_name)
                return True
        else:
            self.patient_cnt = 0
            logging.info('valid loss decreased [%.4f -> %.4f][%d]',
                         self.best_valid_loss, this_valid_loss,
                         self.patient_cnt)
            if model_out_name:
                logging.info('update best model at [%s]', model_out_name)
                torch.save(self.model, model_out_name)
                logging.info('model kept')

            self.best_valid_loss = this_valid_loss
        return False

    def load_model(self, model_out_name):
        logging.info('loading trained model from [%s]', model_out_name)
        self.model = torch.load(model_out_name)

    def _batch_train(self, l_line, criterion, optimizer):
        h_packed_data, m_label = self._data_io(l_line)
        optimizer.zero_grad()
        output = self.model(h_packed_data)
        loss = criterion(output, m_label)
        loss.backward()
        optimizer.step()
        assert not math.isnan(loss.data[0])
        return loss.data[0]

    def predict(self, test_in_name, label_out_name, debug=False):
        """
        Predict for the data in test_in_name and dump the predicted
        labels to label_out_name.
        :param test_in_name:
        :param label_out_name:
        :param debug:
        :return:
        """
        res_dir = os.path.dirname(label_out_name)
        if not os.path.exists(res_dir):
            os.makedirs(res_dir)

        self.model.debug_mode(debug)
        self.model.eval()

        out = open(label_out_name, 'w')
        logging.info('start predicting for [%s]', test_in_name)
        p = 0
        h_total_eva = dict()
        for line in open(test_in_name):
            if self.io_parser.is_empty_line(line):
                continue
            h_out, h_this_eva = self._per_doc_predict(line)
            if h_out is None:
                continue
            h_total_eva = add_svm_feature(h_total_eva, h_this_eva)
            print >> out, json.dumps(h_out)
            p += 1
            h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
            if not p % 1000:
                logging.info('predicted [%d] docs, eva %s', p,
                             json.dumps(h_mean_eva))
        h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / max(p, 1.0))
        l_mean_eva = h_mean_eva.items()
        l_mean_eva.sort(key=lambda item: item[0])
        logging.info('finished predicted [%d] docs, eva %s', p,
                     json.dumps(l_mean_eva))
        json.dump(l_mean_eva, open(label_out_name + '.eval', 'w'), indent=1)
        out.close()
        return

    def _per_doc_predict(self, line):
        h_info = json.loads(line)
        key_name = 'docno'
        if key_name not in h_info:
            key_name = 'qid'
            assert key_name in h_info
        docno = h_info[key_name]
        h_packed_data, v_label = self._data_io([line])
        v_e = h_packed_data['mtx_e']
        # v_w = h_packed_data['mtx_score']
        if (not v_e[0].size()) or (not v_label[0].size()):
            return None, None
        output = self.model(h_packed_data).cpu()[0]
        v_e = v_e[0].cpu()

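        # The sign of the raw score gives the hard label; the raw scores are
        # kept for ranking.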
        pre_label = output.data.sign().type(torch.LongTensor)
        l_score = output.data.numpy().tolist()
        h_out = dict()
        h_out[key_name] = docno
        l_e = v_e.data.numpy().tolist()
        h_out[self.io_parser.content_field] = {'predict': zip(l_e, l_score)}

        if self.predict_with_intermediate_res:
            middle_output = \
                self.model.forward_intermediate(h_packed_data).cpu()[0]
            l_middle_features = middle_output.data.numpy().tolist()
            h_out[self.io_parser.content_field]['predict_features'] = zip(
                l_e, l_middle_features)

        v_label = v_label[0].cpu()
        y = v_label.data.view_as(pre_label)
        l_label = y.numpy().tolist()
        h_this_eva = self.evaluator.evaluate(l_score, l_label)
        h_out['eval'] = h_this_eva
        return h_out, h_this_eva

    def _batch_test(self, l_lines):
        h_packed_data, m_label = self._data_io(l_lines)
        output = self.model(h_packed_data)
        loss = self.criterion(output, m_label)
        return loss.data[0]

    def _data_io(self, l_line):
        if self.use_new_io:
            return self.model.data_io(l_line, self.io_parser)
        else:
            return self._old_io(l_line)

    def _old_io(self, l_line):
        return self.h_model_io[self.model_name](
            l_line, self.para.node_feature_dim, self.spot_field, self.in_field,
            self.abstract_field, self.salience_gold, self.max_e_per_doc)
Example #11
    @classmethod
    def class_print_help(cls, inst=None):
        super(SalienceModelCenter, cls).class_print_help(inst)
        NNPara.class_print_help(inst)
        ExtData.class_print_help(inst)
        SalienceEva.class_print_help(inst)
        DataIO.class_print_help(inst)