Example #1
def get_feature_avg(svm_in):
    """Average the exponentiated feature values of relevant vs. irrelevant data."""
    l_svm_data = load_svm_feature(svm_in)
    h_rel_feature = {}
    h_irrel_feature = {}
    rel_cnt = 0
    irrel_cnt = 0

    for data in l_svm_data:
        label = data['score']
        h_feature = data['feature']
        for key, score in h_feature.items():
            # Exponentiate the raw score; clip very negative scores to zero
            # to avoid underflow in math.exp.
            if score < -20:
                score = 0
            else:
                score = math.exp(score)
            h_feature[key] = score
        if label > 0:
            h_rel_feature = add_svm_feature(h_rel_feature, h_feature)
            rel_cnt += 1
        else:
            h_irrel_feature = add_svm_feature(h_irrel_feature, h_feature)
            irrel_cnt += 1

    rel_cnt = float(rel_cnt)
    irrel_cnt = float(irrel_cnt)
    for key in h_rel_feature:
        h_rel_feature[key] /= rel_cnt
    for key in h_irrel_feature:
        h_irrel_feature[key] /= irrel_cnt
    return h_rel_feature, h_irrel_feature
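
All of these examples rely on two small dict helpers, add_svm_feature and mutiply_svm_feature (the spelling is the project's own), whose definitions are not part of this listing. The sketch below is an assumption inferred from how the calls are used, not the project's actual implementation:

def add_svm_feature(h_a, h_b):
    # Assumed behavior: element-wise sum of two {feature: value} dicts,
    # treating missing keys as zero.
    h_sum = dict(h_a)
    for key, value in h_b.items():
        h_sum[key] = h_sum.get(key, 0) + value
    return h_sum


def mutiply_svm_feature(h_feature, scale):
    # Assumed behavior: return a new dict with every value multiplied by scale.
    return {key: value * scale for key, value in h_feature.items()}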
Example #2
    def evaluate_normal(self, docs, f_predict):
        print("Evaluating predictions [%s] from [%s]." % (f_predict, docs))
        evaluator = SalienceEva()  # evaluator with default values.

        h_e_total_eva = dict()
        e_p = 0
        p = 0
        skip = 0

        for res in self.load_pairs(docs, f_predict):
            p += 1
            if not res:
                # Nothing was loaded for this pair; count it as skipped.
                skip += 1
                continue

            predictions, s_e_label, s_evm_label = res

            l_e_pack = self.get_e_labels(predictions, s_e_label)

            if l_e_pack:
                e_p += 1
                h_e = evaluator.evaluate(l_e_pack[0], l_e_pack[1])
                h_e_total_eva = add_svm_feature(h_e_total_eva, h_e)

            if not e_p == 0:
                h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
                sys.stdout.write('\rEvaluated %d files, %d with entities,'
                                 ' %d line skipped. P@1: %s.' %
                                 (p, e_p, skip, h_e_mean_eva['p@01']))

        print('')

        h_e_mean_eva = {}
        if not e_p == 0:
            h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
            logging.info('finished predicted [%d] docs on entity, eva %s', e_p,
                         json.dumps(h_e_mean_eva))

        res = {'entity': h_e_mean_eva}

        with open(f_predict + '.entity.eval', 'w') as out:
            json.dump(res, out, indent=1)
Example #3
    def predict(self, test_in_name, label_out_name, debug=False):
        """
        predict the data in test_in,
        dump predict labels in label_out_name
        :param test_in_name:
        :param label_out_name:
        :param debug:
        :return:
        """
        res_dir = os.path.dirname(label_out_name)
        if not os.path.exists(res_dir):
            os.makedirs(res_dir)

        self.model.debug_mode(debug)
        self.model.eval()

        out = open(label_out_name, 'w')
        logging.info('start predicting for [%s]', test_in_name)
        p = 0
        h_total_eva = dict()
        for line in open(test_in_name):
            if self.io_parser.is_empty_line(line):
                continue
            h_out, h_this_eva = self._per_doc_predict(line)
            if h_out is None:
                continue
            h_total_eva = add_svm_feature(h_total_eva, h_this_eva)
            out.write(json.dumps(h_out) + '\n')
            p += 1
            h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
            if not p % 1000:
                logging.info('predicted [%d] docs, eva %s', p,
                             json.dumps(h_mean_eva))
        h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / max(p, 1.0))
        l_mean_eva = sorted(h_mean_eva.items(), key=lambda item: item[0])
        logging.info('finished predicted [%d] docs, eva %s', p,
                     json.dumps(l_mean_eva))
        json.dump(l_mean_eva, open(label_out_name + '.eval', 'w'), indent=1)
        out.close()
        return
Example #4
    def process(self):
        h_total_eva = {}
        with gzip.open(self.corpus_in) as test_in, \
                open(self.test_out, 'w') as out:
            p = 0

            for line in test_in:
                data = json.loads(line)
                if self.is_empty(data):
                    continue

                p += 1

                word2eid = defaultdict(list)

                labels = []
                l_e = []

                index = 0
                for event in data[self.spot_field][body_field]:
                    word2eid[event['surface']].append(index)
                    labels.append(event['salience'])
                    event_id = self.h_event_id.get(self.get_event_head(event),
                                                   0)
                    l_e.append(event_id)

                text = data[body_field]
                parser = PlaintextParser.from_string(text,
                                                     Tokenizer('english'))

                predicted = {}

                rank = 1
                for sentence in self.summarizer(parser.document, 10):
                    for word in sentence.words:
                        if word in word2eid:
                            eids = word2eid[word]
                            if word not in predicted:
                                predicted[word] = (eids, rank)
                                rank += 1

                prediction = [0] * len(labels)
                # Each event indexed by this surface gets the reciprocal of
                # the order in which the surface first appeared in the summary.
                for w, (eids, r) in predicted.items():
                    for eid in eids:
                        prediction[eid] = 1.0 / r

                eva = self.evaluator.evaluate(prediction, labels)

                h_out = {
                    'docno': data['docno'],
                    body_field: {
                        'predict': list(zip(l_e, prediction)),
                    },
                    'eval': eva,
                }

                h_total_eva = add_svm_feature(h_total_eva, eva)
                h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)

                out.write(json.dumps(h_out) + '\n')

                if not p % 1000:
                    logging.info('predicted [%d] docs, eva %s', p,
                                 json.dumps(h_mean_eva))
Example #5
    def evaluate_json_joint(self, docs, f_predict):
        print("Evaluating joint predictions [%s] from [%s]." %
              (f_predict, docs))

        evaluator = SalienceEva()  # evaluator with default values.

        h_e_total_eva = dict()
        h_e_mean_eva = dict()

        h_evm_total_eva = dict()
        h_evm_mean_eva = dict()

        h_all_total_eva = dict()
        h_all_mean_eva = dict()

        e_p = 0
        evm_p = 0
        all_p = 0
        p = 0

        for res in self.load_pairs(docs, f_predict):
            p += 1

            if not res:
                continue

            predictions, s_e_label, s_evm_label = res

            l_e_pack = self.get_e_labels(predictions, s_e_label)
            l_evm_pack = self.get_evm_labels(predictions, s_evm_label)
            # Combine the entity and event packs position-wise
            # (scores with scores, labels with labels, ...).
            l_pairs = list(zip(*l_e_pack)) + list(zip(*l_evm_pack))
            all_pack = list(zip(*l_pairs))

            if l_e_pack:
                h_e = evaluator.evaluate(l_e_pack[0], l_e_pack[1])
                e_p += 1
                h_e_total_eva = add_svm_feature(h_e_total_eva, h_e)

            if l_evm_pack:
                h_evm = evaluator.evaluate(l_evm_pack[0], l_evm_pack[1])
                evm_p += 1
                h_evm_total_eva = add_svm_feature(h_evm_total_eva, h_evm)

            if all_pack:
                h_all = evaluator.evaluate(all_pack[0], all_pack[1])
                all_p += 1
                h_all_total_eva = add_svm_feature(h_all_total_eva, h_all)

            if not e_p == 0:
                h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
            if not evm_p == 0:
                h_evm_mean_eva = mutiply_svm_feature(h_evm_total_eva,
                                                     1.0 / evm_p)
            if not all_p == 0:
                h_all_mean_eva = mutiply_svm_feature(h_all_total_eva,
                                                     1.0 / all_p)

            ep1 = '%.4f' % h_e_mean_eva[
                'p@01'] if 'p@01' in h_e_mean_eva else 'N/A'
            evmp1 = '%.4f' % h_evm_mean_eva[
                'p@01'] if 'p@01' in h_evm_mean_eva else 'N/A'
            all1 = '%.4f' % h_all_mean_eva[
                'p@01'] if 'p@01' in h_all_mean_eva else 'N/A'

            sys.stdout.write('\rEvaluated %d files, %d with entities and %d '
                             'with events, En P@1: %s, Evm P@1: %s, '
                             'All P@1: %s.' %
                             (p, e_p, evm_p, ep1, evmp1, all1))

        print('')

        h_e_mean_eva = {}
        if not e_p == 0:
            h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
            logging.info('finished predicted [%d] docs on entity, eva %s', e_p,
                         json.dumps(h_e_mean_eva))

        h_evm_mean_eva = {}
        if not evm_p == 0:
            h_evm_mean_eva = mutiply_svm_feature(h_evm_total_eva, 1.0 / evm_p)
            logging.info('finished predicted [%d] docs on event, eva %s',
                         evm_p, json.dumps(h_evm_mean_eva))

        logging.info("Results to copy:")
        line1 = ["p@01", "p@05", "p@10", "p@20", "auc"]
        line2 = ["r@01", "r@05", "r@10", "r@20"]

        line1_evm_scores = ["%.4f" % h_evm_mean_eva[k] for k in line1]
        line1_ent_scores = ["%.4f" % h_e_mean_eva[k] for k in line1]
        line1_all_scores = ["%.4f" % h_all_mean_eva[k] for k in line1]

        line2_evm_scores = ["%.4f" % h_evm_mean_eva[k] for k in line2]
        line2_ent_scores = ["%.4f" % h_e_mean_eva[k] for k in line2]
        line2_all_scores = ["%.4f" % h_all_mean_eva[k] for k in line2]

        print "\t-\t".join(line1_evm_scores) + "\t-\t-\t" + \
              "\t".join(line1_all_scores) + "\t-\t" + \
              "\t".join(line1_ent_scores)

        print "\t-\t".join(line2_evm_scores) + "\t-\t-\t-\t-\t" + \
              "\t".join(line2_all_scores) + "\t-\t-\t" + \
              "\t".join(line2_ent_scores)

        res = {'entity': h_e_mean_eva, 'event': h_evm_mean_eva}

        with open(f_predict + '.joint.eval', 'w') as out:
            json.dump(res, out, indent=1)
Example #6
    def process(self):
        open_func = gzip.open if self.corpus_in.endswith("gz") else open

        outs = []
        for name in self.feature_names_split:
            out_path = self.test_out + "_" + name.replace(" ", "_") + '.json'
            outs.append(open(out_path, 'w'))
            logging.info("Feature output will be stored at [%s]" % out_path)

        with open_func(self.corpus_in) as in_f:
            l_h_total_eva = [{} for _ in range(self.feature_dim)]
            p = 0
            for line in in_f:
                if self.io.is_empty_line(line):
                    continue

                # Instead of providing batch, we just give one by one.
                h_packed_data, m_label = self.io.parse_data([line])

                h_info = json.loads(line)
                key_name = 'docno'
                docno = h_info[key_name]

                p += 1
                l_h_out = self.eval_per_dim(h_packed_data, m_label,
                                            self.reverse_dim, key_name, docno)

                for (dim, h_out), out in zip(enumerate(l_h_out), outs):
                    h_this_eva = h_out['eval']
                    l_h_total_eva[dim] = add_svm_feature(
                        l_h_total_eva[dim], h_this_eva)
                    h_mean_eva = mutiply_svm_feature(l_h_total_eva[dim],
                                                     1.0 / p)

                    out.write(json.dumps(h_out) + '\n')

                    if not p % 1000:
                        logging.info('predicted [%d] docs, eva %s for [%s]', p,
                                     json.dumps(h_mean_eva),
                                     self.feature_names_split[dim])

            for dim, h_total_eva in enumerate(l_h_total_eva):
                h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
                logging.info('finished predicted [%d] docs, eva %s for [%s]',
                             p, json.dumps(h_mean_eva),
                             self.feature_names_split[dim])

        for (dim, h_total_eva), name in zip(enumerate(l_h_total_eva),
                                            self.feature_names_split):
            h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
            l_mean_eva = sorted(h_mean_eva.items(), key=lambda item: item[0])

            logging.info('finished predicted [%d] docs, eva %s', p,
                         json.dumps(l_mean_eva))

            with open(self.test_out + "_" + name.replace(" ", "_") + '.eval',
                      'w') as o:
                json.dump(l_mean_eva, o, indent=1)

        for out in outs:
            out.close()
Example #7
    def predict(self,
                test_in_name,
                label_out_name,
                debug=False,
                timestamp=True):
        """
        predict the data in test_in,
        dump predict labels in label_out_name
        :param test_in_name:
        :param label_out_name:
        :param debug:
        :return:
        """
        res_dir = os.path.dirname(label_out_name)
        if not os.path.exists(res_dir):
            os.makedirs(res_dir)

        self.model.debug_mode(debug)
        self.model.eval()

        name, ext = os.path.splitext(label_out_name)
        if timestamp:
            ent_label_out_name = name + "_entity_" + self.init_time + ext
            evm_label_out_name = name + "_event_" + self.init_time + ext
        else:
            ent_label_out_name = name + "_entity" + ext
            evm_label_out_name = name + "_event" + ext

        ent_out = open(ent_label_out_name, 'w')
        evm_out = open(evm_label_out_name, 'w')

        outs = [ent_out, evm_out]

        logging.info('start predicting for [%s]', test_in_name)
        logging.info('Test output will be at [%s] and [%s]',
                     ent_label_out_name, evm_label_out_name)

        p = 0
        ent_p = 0
        evm_p = 0

        h_total_ent_eva = dict()
        h_total_evm_eva = dict()
        for line in open(test_in_name):
            if self.io_parser.is_empty_line(line):
                continue
            l_h_out = self._per_doc_predict(line)

            if not l_h_out:
                continue

            for h_out, out_name, out in zip(l_h_out, self.output_names, outs):
                if not h_out:
                    continue

                out.write(json.dumps(h_out) + '\n')

                eva = h_out['eval']
                if out_name == 'entity':
                    ent_p += 1
                    h_total_ent_eva = add_svm_feature(h_total_ent_eva, eva)
                if out_name == 'event':
                    evm_p += 1
                    h_total_evm_eva = add_svm_feature(h_total_evm_eva, eva)

            p += 1

            if not p % 1000:
                h_mean_ent_eva = mutiply_svm_feature(h_total_ent_eva,
                                                     1.0 / max(ent_p, 1.0))

                h_mean_evm_eva = mutiply_svm_feature(h_total_evm_eva,
                                                     1.0 / max(evm_p, 1.0))

                logging.info(
                    'predicted [%d] docs: [%d] with entities, eva %s;'
                    '[%d] with events, eva %s',
                    p,
                    ent_p,
                    json.dumps(h_mean_ent_eva),
                    evm_p,
                    json.dumps(h_mean_evm_eva),
                )

        h_mean_ent_eva = mutiply_svm_feature(h_total_ent_eva,
                                             1.0 / max(ent_p, 1.0))
        h_mean_evm_eva = mutiply_svm_feature(h_total_evm_eva,
                                             1.0 / max(evm_p, 1.0))

        l_mean_ent_eva = sorted(h_mean_ent_eva.items(),
                                key=lambda item: item[0])
        l_mean_evm_eva = sorted(h_mean_evm_eva.items(),
                                key=lambda item: item[0])

        logging.info(
            'finished predicted [%d] docs, [%d] with entities, eva %s; '
            '[%d] with events, eva %s', p, ent_p, json.dumps(l_mean_ent_eva),
            evm_p, json.dumps(l_mean_evm_eva))

        self.tab_scores(h_mean_ent_eva, h_mean_evm_eva)

        json.dump(l_mean_ent_eva,
                  open(ent_label_out_name + '.eval', 'w'),
                  indent=1)

        json.dump(l_mean_evm_eva,
                  open(evm_label_out_name + '.eval', 'w'),
                  indent=1)

        ent_out.close()
        evm_out.close()
        return
Example #8
    def split_and_eval(self, docs, f_predict):
        print("Split and evaluating joint predictions [%s]." % f_predict)

        evaluator = SalienceEva()  # evaluator with default values.

        h_e_total_eva = dict()
        h_e_mean_eva = dict()

        h_evm_total_eva = dict()
        h_evm_mean_eva = dict()

        e_p = 0
        evm_p = 0
        p = 0

        with open(f_predict + '.entity.json', 'w') as entity_out, \
                open(f_predict + '.event.json', 'w') as event_out:
            for res in self.load_pairs(docs, f_predict):
                p += 1

                if not res:
                    continue

                doc, predictions, s_e_label, s_evm_label = res

                l_e_pack = self.get_e_labels(predictions, s_e_label)
                l_evm_pack = self.get_evm_labels(predictions, s_evm_label)

                pred_event = {'bodyText': {}}
                pred_entity = {'bodyText': {}}

                if l_e_pack:
                    h_e = evaluator.evaluate(l_e_pack[0], l_e_pack[1])
                    e_p += 1
                    h_e_total_eva = add_svm_feature(h_e_total_eva, h_e)

                    pred_entity['bodyText']['predict'] = [[
                        eid, score
                    ] for eid, score in zip(l_e_pack[2], l_e_pack[0])]
                    pred_entity['docno'] = doc['docno']
                    pred_entity['eval'] = h_e

                    entity_out.write(json.dumps(pred_entity))
                    entity_out.write('\n')

                if l_evm_pack:
                    h_evm = evaluator.evaluate(l_evm_pack[0], l_evm_pack[1])
                    evm_p += 1
                    h_evm_total_eva = add_svm_feature(h_evm_total_eva, h_evm)

                    pred_event['bodyText']['predict'] = [[
                        eid, score
                    ] for eid, score in zip(l_evm_pack[2], l_evm_pack[0])]
                    pred_event['docno'] = doc['docno']
                    pred_event['eval'] = h_evm

                    event_out.write(json.dumps(pred_event))
                    event_out.write('\n')

                if not e_p == 0:
                    h_e_mean_eva = mutiply_svm_feature(h_e_total_eva,
                                                       1.0 / e_p)
                if not evm_p == 0:
                    h_evm_mean_eva = mutiply_svm_feature(
                        h_evm_total_eva, 1.0 / evm_p)

                ep1 = '%.4f' % h_e_mean_eva[
                    'p@01'] if 'p@01' in h_e_mean_eva else 'N/A'
                evmp1 = '%.4f' % h_evm_mean_eva[
                    'p@01'] if 'p@01' in h_evm_mean_eva else 'N/A'

                sys.stdout.write(
                    '\rEvaluated %d files, %d with entities and %d '
                    'with events, En P@1: %s, Evm P@1: %s, ' %
                    (p, e_p, evm_p, ep1, evmp1))

            print('')

        h_e_mean_eva = {}
        if not e_p == 0:
            h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
            logging.info('finished predicted [%d] docs on entity, eva %s', e_p,
                         json.dumps(h_e_mean_eva))

        h_evm_mean_eva = {}
        if not evm_p == 0:
            h_evm_mean_eva = mutiply_svm_feature(h_evm_total_eva, 1.0 / evm_p)
            logging.info('finished predicted [%d] docs on event, eva %s',
                         evm_p, json.dumps(h_evm_mean_eva))

        with open(f_predict + '.entity.eval', 'w') as out:
            json.dump([[k, v] for k, v in h_e_mean_eva.items()], out, indent=1)

        with open(f_predict + '.event.eval', 'w') as out:
            json.dump([[k, v] for k, v in h_evm_mean_eva.items()],
                      out,
                      indent=1)
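
Every example above follows the same accumulate-then-average pattern: per-document evaluation dicts are summed with add_svm_feature, and the running total is scaled by 1/p with mutiply_svm_feature to get mean metrics. A toy illustration, using the helper sketch shown after Example #1 and made-up metric values:

# Made-up per-document evaluation dicts; in the examples above these come
# from SalienceEva.evaluate().
l_doc_evals = [
    {'p@01': 1.0, 'auc': 0.90},
    {'p@01': 0.0, 'auc': 0.75},
    {'p@01': 1.0, 'auc': 0.80},
]

h_total_eva = {}
p = 0
for h_this_eva in l_doc_evals:
    p += 1
    h_total_eva = add_svm_feature(h_total_eva, h_this_eva)

# Mean over the three documents: roughly {'p@01': 0.667, 'auc': 0.817}.
h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
print(h_mean_eva)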