Example #1
    def extract_test_data(self, test_file: str):
        """
        Note: This function is deprecated; it was used for the old setting (train on 2018-train and test on 2018-test).
        It extracts only the X data for testing (labels are assumed to be unavailable to us).
        It also returns several auxiliary structures that are useful during prediction.
        :param test_file:
        :return:
        """
        tweetid_list = []
        miss_tweetid = []
        tweetid2idx = dict()
        tweetid2incident = dict()
        test_x = []

        with open(test_file, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip().split('\t')
                incident_id, tweetid = line[0], line[2]
                tweetid_list.append(tweetid)
                tweetid2incident[tweetid] = incident_id

        tweetid_list = list(set(tweetid_list))  # Remove some duplicate tweets
        for tweetid in tweetid_list:
            if tweetid in self.tweetid2feature:
                tweetid2idx[tweetid] = len(test_x)
                test_x.append(self.tweetid2feature[tweetid])
            else:
                miss_tweetid.append(tweetid)

        utils.print_to_log("There are {0}/{1} tweets cannot find for {2}".format(
            len(miss_tweetid), len(tweetid_list), test_file))
        test_x = np.asarray(test_x)
        utils.print_to_log("The shape of test_x is {0}".format(test_x.shape))

        return test_x, tweetid_list, tweetid2idx, tweetid2incident
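
The heart of the function above is the tweetid-to-row bookkeeping. A minimal, self-contained sketch of that logic with made-up toy data (the feature dictionary and incident/tweet pairs are purely illustrative):

import numpy as np

# Toy stand-ins for self.tweetid2feature and the lines of the test file.
tweetid2feature = {"t1": [0.1, 0.2], "t2": [0.3, 0.4]}
pairs = [("quake2018", "t1"), ("quake2018", "t2"), ("quake2018", "t1"), ("flood2018", "t3")]

tweetid2incident = {tid: incident for incident, tid in pairs}
tweetid_list = list(dict.fromkeys(tid for _, tid in pairs))  # de-duplicate, keep order

test_x, tweetid2idx, miss_tweetid = [], {}, []
for tid in tweetid_list:
    if tid in tweetid2feature:
        tweetid2idx[tid] = len(test_x)  # row index of this tweet in test_x
        test_x.append(tweetid2feature[tid])
    else:
        miss_tweetid.append(tid)

test_x = np.asarray(test_x)
print(test_x.shape, tweetid2idx, miss_tweetid)  # (2, 2) {'t1': 0, 't2': 1} ['t3']
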
Example #2
 def __init__(self, args, tweetid_list: List[str], tweet_content_list: List[dict], label2id: dict,
              tweet_id_out_file: str, test=False):
     """
     Use feature_used to control which features are used for sentence-level feature extraction.
         Currently available features:
             ['hand_crafted', 'fasttext-avg', 'fasttext-tfidf', 'glove-avg', 'glove-tfidf', 'cbnu_user_feature',
             'skip-thought', 'bert-avg/CLS-1/4/8', 'fasttext-crawl', 'fasttext-1_2M-balanced-event', 'hashtag']
     :param args:
     :param tweetid_list:
     :param tweet_content_list:
     :param label2id:
     """
     self.args = args
     self.tweetid_list = tweetid_list
     self.tweet_content_list = tweet_content_list
     self.annotated_user_type = None
     self.label2id = label2id
     self.tweet_id_out_file = tweet_id_out_file
     self.test = test
     self.train_tweet = []
     self.train_label = []
     self.tweetid2feature = dict()
     self.feature_len = None
     self.feature_collection = []
     self.feature_used = ['hand_crafted', 'fasttext-avg', 'skip-thought', 'bert-avg-1', 'bert-CLS-1',
                          'glove-tfidf', 'fasttext-crawl']
     # Convert each priority label to a score, which will be used to train the regression model.
     self.priority2score = {'Low': 0.25, 'Medium': 0.5, 'High': 0.75, 'Critical': 1.0, 'Unknown': 0.5}
     utils.print_to_log("The feature used is {}".format(self.feature_used))
Example #3
    def _extract_data_from_formalized_file_single_label(self, filename: str):
        """
        Note: This function is deprecated because we now focus on the multi-label model; to stay consistent
            with the official evaluation file, we need the ground truth labels in multi-label form.
        Notice that each tweet may have several labels, and each of them is used to construct a training instance.
        :param filename: The filename of the formalized file, where each line is "{tweetid}\t{labels}\t{priority}"
        :return:
        """
        count_miss = 0
        count_total = 0
        data_x, data_y = [], []
        with open(filename, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip().split('\t')
                tweetid = line[0]
                categories = line[1].split(',')
                count_total += 1
                if tweetid in self.tweetid2feature:
                    feature = self.tweetid2feature[tweetid]
                    for tweet_label in categories:
                        if tweet_label not in self.label2id:
                            continue
                        data_x.append(feature)
                        data_y.append(self.label2id[tweet_label])
                else:
                    count_miss += 1

        utils.print_to_log("There are {0}/{1} tweets cannot find for {2}".format(count_miss, count_total, filename))
        data_x, data_y = np.asarray(data_x), np.asarray(data_y, dtype=np.int32)
        print("The shape of data_x is {0}, shape of data_y is {1}".format(data_x.shape, data_y.shape))
        return data_x, data_y
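
A self-contained sketch of the single-label expansion performed above: a tweet annotated with k known labels contributes k (feature, label-id) pairs, and labels outside label2id are skipped (toy data, assumed label set):

import numpy as np

label2id = {'Advice': 0, 'Hashtags': 1}            # assumed label set
tweetid2feature = {'t1': [0.1, 0.2]}               # assumed features
lines = ['t1\tAdvice,Hashtags,UnknownLabel\tMedium']

data_x, data_y = [], []
for line in lines:
    tweetid, labels, _priority = line.split('\t')
    if tweetid not in tweetid2feature:
        continue
    for label in labels.split(','):
        if label not in label2id:                  # unknown labels are skipped
            continue
        data_x.append(tweetid2feature[tweetid])
        data_y.append(label2id[label])

print(np.asarray(data_x).shape, np.asarray(data_y))  # (2, 2) [0 1]
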
Example #4
File: train.py, Project: berniebear/trec_is
    def _simple_cross_validate(self):
        """
        Use a simple, fixed NB model to double-check the correctness of the sklearn random search and our own random search.
        It confirms that our late-fusion-compatible API is correct.
        :return:
        """
        kf = KFold(n_splits=self.args.cv_num,
                   random_state=self.args.random_seed)
        metric_values = {metric_name: [] for metric_name in self.metric_names}
        clf = BernoulliNB(alpha=0.8490, binarize=0.3086, fit_prior=True)
        clf = OneVsRestClassifier(clf, n_jobs=self.args.n_jobs)
        for train_idx_list, test_idx_list in kf.split(self.data_x,
                                                      self.data_y):
            X_train = self.data_x[train_idx_list]
            y_train = self.data_y[train_idx_list]
            X_test = self.data_x[test_idx_list]
            y_test = self.data_y[test_idx_list]
            clf.fit(X_train, y_train)
            y_predict_score = clf.predict_proba(X_test)
            y_predict = np.argmax(y_predict_score, axis=-1)
            metric_results = utils.evaluate_any_type(y_test, y_predict,
                                                     self.id2label)
            for metric_name in self.metric_names:
                metric_values[metric_name].append(
                    [metric_results[metric_name],
                     len(y_test)])

        metric_weighted_avg = self._get_weighted_avg(metric_values)
        for metric_name in ['f1']:
            print_to_log('The {0} score in cross validation is {1}'.format(
                metric_name, metric_values[metric_name]))
            print_to_log('The average {0} score is {1}'.format(
                metric_name, metric_weighted_avg[metric_name]))
        quit()
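
_get_weighted_avg is not shown in this excerpt. Since each metric entry stored above is a [value, len(y_test)] pair, a plausible implementation would weight each fold's score by its test-set size, as in this sketch (an assumption, not the repository's actual code):

def get_weighted_avg(metric_values):
    """Weight each fold's metric by the number of test instances in that fold."""
    weighted = {}
    for name, values_and_counts in metric_values.items():
        total = sum(count for _, count in values_and_counts)
        weighted[name] = sum(value * count for value, count in values_and_counts) / total
    return weighted

metric_values = {'f1': [[0.60, 100], [0.70, 50]]}
print(get_weighted_avg(metric_values))  # {'f1': 0.6333...}
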
Example #5
    def _extract_data_from_formalized_file_v1(self, filename: str):
        """
        This function is deprecated
        :param filename:
        :return:
        """
        count_miss = 0
        count_total = 0

        if self.args.event_wise:
            data_x = {event_type: [] for event_type in utils.idx2event_type}
            data_y = {event_type: [] for event_type in utils.idx2event_type}
            event2idx_list = {event_type: [] for event_type in utils.idx2event_type}
        else:
            data_x, data_y = [], []

        with open(filename, 'r', encoding='utf8') as f:
            for idx, line in enumerate(f):
                line = line.strip().split('\t')
                tweetid = line[0]
                event_type = line[3]
                # The 2018train + 2018test data will not filter out any label
                categories = [self.label2id[label] for label in line[1].split(',')]
                count_total += 1
                if tweetid in self.tweetid2feature:
                    feature = self.tweetid2feature[tweetid]
                    if self.args.event_wise:
                        data_x[event_type].append(feature)
                        data_y[event_type].append(categories)
                        event2idx_list[event_type].append(idx)
                    else:
                        data_x.append(feature)
                        data_y.append(categories)
                else:
                    count_miss += 1

        utils.print_to_log("There are {0}/{1} tweets cannot find for {2}".format(count_miss, count_total, filename))
        if self.args.event_wise:
            for event_type in utils.idx2event_type:
                data_x[event_type] = np.asarray(data_x[event_type])
                data_y[event_type] = np.asarray(data_y[event_type])
            return data_x, data_y, event2idx_list, idx + 1
        else:
            return np.asarray(data_x), np.asarray(data_y)
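
When event_wise is set, the method returns per-event arrays plus event2idx_list and the total line count, which suggests the per-event predictions are later stitched back into the original line order. A hedged sketch of that reassembly step with toy arrays:

import numpy as np

n_lines = 5                                           # would be idx + 1 in the code above
event2idx_list = {'earthquake': [0, 2, 4], 'flood': [1, 3]}
event2pred = {'earthquake': np.array([1, 1, 0]), 'flood': np.array([0, 1])}

full_pred = np.zeros(n_lines, dtype=int)
for event, idx_list in event2idx_list.items():
    full_pred[idx_list] = event2pred[event]           # scatter per-event predictions back
print(full_pred)  # [1 0 1 1 0]
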
Example #6
    def _extract_score_from_formalized_file(self, filename: str):
        """
        For extracting scores, we currently do not support event_wise.
        All other things are similar to `_extract_data_from_formalized_file_v2`.

        :param filename: File to extract information.
        :return:
        """
        data_x, data_y = [], []
        count_unk = 0
        with open(filename, 'r', encoding='utf8') as f:
            for idx, line in enumerate(f):
                line = line.strip().split('\t')
                tweetid = line[0]
                priority_label = line[2]
                if priority_label == 'Unknown':
                    count_unk += 1
                score = self.priority2score[line[2]]
                feature = self.tweetid2feature[tweetid]
                data_x.append(feature)
                data_y.append(score)
        utils.print_to_log("There are {} Unknown priority labels.".format(count_unk))
        return np.asarray(data_x), np.asarray(data_y)
Example #7
File: train.py, Project: berniebear/trec_is
 def train_on_all(self):
     """
     A wrapper that trains on all data, used to prepare for prediction on the test data.
     Notice that we don't use cross-validation here, because CV is only used for parameter selection.
     Once the parameters are determined, we train on all the data we have (self.data_x and self.data_y).
     :return:
     """
     custom_postfix = '_{}'.format(
         self.event_type) if self.event_type is not None else ''
     model_save_name = '{0}_{1}.pkl'.format(self.args.model, custom_postfix)
     ckpt_file = os.path.join(self.args.model_dir, model_save_name)
     if os.path.isfile(ckpt_file) and not self.args.force_retrain:
         print_to_log(
             "The ckpt file exists, and force_retrain is not set, so load model from {}"
             .format(ckpt_file))
         with open(ckpt_file, 'rb') as f:
             self.clf = pickle.load(f)
     else:
         self._create_model()
         self._binarize_data_y()
         self._fit_data(self.data_x, self.data_y)
         with open(ckpt_file, 'wb') as f:
             pickle.dump(self.clf, f)
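
The checkpointing pattern above (load the pickled classifier if it exists, otherwise fit and dump it) can be reproduced in isolation with a stand-alone sklearn model; paths and data below are illustrative only:

import os
import pickle

from sklearn.naive_bayes import BernoulliNB

X, y = [[0, 1], [1, 0], [1, 1], [0, 0]], [1, 0, 1, 0]
ckpt_file = "model_checkpoint.pkl"                 # illustrative path

if os.path.isfile(ckpt_file):
    with open(ckpt_file, 'rb') as f:               # reuse the cached model
        clf = pickle.load(f)
else:
    clf = BernoulliNB().fit(X, y)                  # train and cache it
    with open(ckpt_file, 'wb') as f:
        pickle.dump(clf, f)

print(clf.predict([[1, 1]]))
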
Example #8
File: train.py, Project: berniebear/trec_is
    def _cross_validate(self, target_metric=None):
        """
        If we are performing event-wise training, we need to return the metrics for each run (event).
        Note: If you want to get more balanced k-fold split, you can refer to `proba_mass_split` in utils.py,
            or the `stratify_split` in utils.py which is implemented based on Sechidis et. al paper.

        For 2018 task, which uses any-type evaluation, you can use
            metric_results = utils.evaluate_any_type(y_test, y_predict, self.id2label)

        :param
            target_metric: If specified, it is the target metric that we care about during hyper-parameter tuning.
        :return:
        """
        print_to_log('Use {} fold cross validation'.format(self.args.cv_num))
        metric_values = {metric_name: [] for metric_name in self.metric_names}
        dev_predict = np.zeros_like(self.data_y, dtype=float)  # np.float was removed in NumPy 1.24+

        index_list = self._get_k_fold_index_list()
        for train_idx_list, test_idx_list in index_list:
            X_train = self.data_x[train_idx_list]
            y_train = self.data_y[train_idx_list]
            X_test = self.data_x[test_idx_list]
            y_test = self.data_y[test_idx_list]
            self._fit_data(X_train, y_train)
            predict_score = self._get_predict_score(X_test)
            dev_predict[test_idx_list] = predict_score

            metric_results = utils.evaluate_2019B(y_test, predict_score,
                                                  self.informative_label_idx,
                                                  self.args)
            for metric_name in self.metric_names:
                metric_values[metric_name].append(
                    [metric_results[metric_name],
                     len(y_test)])

        metric_weighted_avg = self._get_weighted_avg(metric_values)
        for metric_name in self.metric_names:
            print_to_log('The {0} score in cross validation is {1}'.format(
                metric_name, metric_values[metric_name]))
            print_to_log('The average {0} score is {1}'.format(
                metric_name, metric_weighted_avg[metric_name]))

        if self.args.search_best_parameters:
            return metric_weighted_avg[target_metric]

        return {
            metric_name: metric_weighted_avg[metric_name]
            for metric_name in self.metric_names
        }, dev_predict
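
dev_predict collects out-of-fold prediction scores, which is exactly what a simple late-fusion or stacking step would consume. A toy sketch of averaging two such matrices and thresholding (the shapes and the 0.5 threshold are assumptions):

import numpy as np

# Out-of-fold score matrices from two hypothetical models, shape (n_samples, n_labels).
dev_predict_model_a = np.array([[0.9, 0.2], [0.3, 0.7]])
dev_predict_model_b = np.array([[0.7, 0.4], [0.5, 0.5]])

fused = (dev_predict_model_a + dev_predict_model_b) / 2.0
predicted_labels = (fused >= 0.5).astype(int)      # assumed 0.5 decision threshold
print(fused)
print(predicted_labels)
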
Example #9
File: train.py, Project: berniebear/trec_is
 def cross_validate(self):
     regressor = self._get_regressor()
     param_dist = {
         'alpha': [0.1, 0.5, 0.7, 1.0, 2.0, 5.0],
         'fit_intercept': [True, False]
     }
     search = GridSearchCV(regressor,
                           param_grid=param_dist,
                           cv=5,
                           verbose=10,
                           scoring='neg_mean_squared_error')
     search.fit(self.data_x, self.data_y)
     print_to_log("Grid Search for Regression finished!")
     print_to_log("best_score_:\n{}".format(search.best_score_))
     print_to_log("best_params_:\n{}".format(search.best_params_))
Example #10
File: train.py, Project: berniebear/trec_is
    def predict(self, data_x: np.ndarray, tweetid_list: list,
                tweetid2idx: dict, tweetid2incident: dict, id2label: list,
                short2long_label: dict, majority_label: str, out_file: str):
        """
        For the missed tweetids (those that cannot be found via the Twitter API), we use the majority label as the prediction result.
        As we can see in the evaluation script, the rank field doesn't matter.

        :param data_x: Feature of data
        :param tweetid_list:
        :param tweetid2idx: Maps a tweetid to its actual index in data_x
        :param tweetid2incident:
        :param id2label:
        :param short2long_label: the output format needs the long label in the form A-B
        :param majority_label:
        :param out_file:
        :return:
        """
        fout = open(out_file, 'w', encoding='utf8')
        predict_res = self._predict_data(data_x)
        count_label = []
        for tweetid in tweetid_list:
            incident = tweetid2incident[tweetid]
            label = id2label[predict_res[tweetid2idx[
                tweetid]]] if tweetid in tweetid2idx else majority_label
            label = short2long_label[label]
            fout.write("{0}\tQ0\t{1}\t1\t1.0\t{2}\tmyrun\n".format(
                incident, tweetid, label))
            count_label.append({"tweet_id": tweetid, "label": label})
        fout.close()
        df = pd.DataFrame(count_label)
        print_to_log(
            "{} rows have been replaced due to missing tweetids".format(
                len(tweetid_list) - len(tweetid2idx)))
        print_to_log(
            "The count of different labels in prediction results:\n{}".format(
                df.groupby("label").count()))
        print_to_log(
            "The prediction file has been written to {}".format(out_file))
Example #11
 def _collect_feature(self, feature, feat_name):
     self.feature_collection.append(feature)
     utils.print_to_log("The shape of {0}_feature is {1}".format(feat_name, feature.shape))
Example #12
File: train.py, Project: berniebear/trec_is
    def _search_by_our_own(self, n_iter):
        """
        Call our own class method to perform the random search.
        The drawback is that the runs cannot be parallelized.
        :param n_iter:
        :return:
        """
        if self.args.model == 'rf':
            param_dist = {
                "max_depth": [2, 4, 8, 16, 32, 64, 128, None],
                "max_features": scipy.stats.randint(1, 512),
                "min_samples_split": scipy.stats.randint(2, 512),
                "min_samples_leaf": scipy.stats.randint(2, 512),
                "criterion": ["gini", "entropy"],
                "n_estimators": [128],
                "class_weight": [self.class_weight],
                "n_jobs": [1 if self.args.class_weight_scheme == 'balanced' else 4],
            }
        elif self.args.model == 'bernoulli_nb':
            param_dist = {
                "alpha": scipy.stats.uniform(),
                "binarize": scipy.stats.uniform(),
                "fit_prior": [True, False],
            }
        elif self.args.model == 'svm_linear':
            param_dist = {
                "penalty": ['l1', 'l2'],
                "C": [0.1, 1, 10, 100, 1000],
                "class_weight": [self.class_weight],
                "dual": [False],
            }
        elif self.args.model == 'xgboost':
            param_dist = {
                "max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
                "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.1],
                "n_estimators": [100, 300, 500],
                "subsample": [0.8, 0.9, 1.0],
                "colsample_bytree": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                "gamma": [0, 1, 5],
                "n_jobs": [1],
            }
        else:
            raise ValueError(
                "The model {} doesn't support parameter search in current stage"
                .format(self.args.model))

        if self.args.model == 'svm_linear':
            param_list = list(ParameterGrid(param_dist))
        else:
            param_list = list(ParameterSampler(param_dist, n_iter=n_iter))

        metric_name = 'high_prior_f1'
        best_metric = float("-inf")
        best_param = dict()
        for i, param in enumerate(param_list):
            if i < self.args.search_skip:
                continue
            print_to_log("Using the parameter set: {}".format(param))
            self._create_model(param)
            current_metric = self._cross_validate(target_metric=metric_name)
            if current_metric > best_metric:
                best_metric = current_metric
                best_param = param
            if (i + 1) % self.args.search_print_interval == 0:
                print_to_log(
                    "After searching {0} sets of parameters, current best is {1}, best {3} is {2}"
                    .format(i + 1, best_param, best_metric, metric_name))

        print_to_log("The Random search finished!")
        print_to_log("The best {0} is {1}".format(metric_name, best_metric))
        print_to_log("The best parameter is {}".format(best_param))
        quit()
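
The two samplers used above behave differently: ParameterGrid enumerates every combination, while ParameterSampler draws n_iter random configurations (and accepts scipy distributions). A tiny sketch:

import scipy.stats
from sklearn.model_selection import ParameterGrid, ParameterSampler

grid = {"penalty": ['l1', 'l2'], "C": [0.1, 1, 10]}
dist = {"alpha": scipy.stats.uniform(), "fit_prior": [True, False]}

print(len(list(ParameterGrid(grid))))              # 6 combinations, enumerated exhaustively
for param in ParameterSampler(dist, n_iter=2, random_state=0):
    print(param)                                   # dicts with randomly sampled values
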
Example #13
File: train.py, Project: berniebear/trec_is
    def _search_by_sklearn(self, n_iter):
        """ Use the RandomizedSearchCV API of sklearn, but need to customize the scoring function.

        The advantage is that it parallelizes well (however, according to the warning
            "Multiprocessing-backed parallel loops cannot be nested", if the model is parallelized,
            the random search will be serialized automatically). Because parallel jobs cannot be nested,
            we can set the model to be parallel and the search to be sequential, or the model to be sequential
            but the search to be parallel.
        Note that as the model clf is stored as an attribute named estimator inside the OneVsRestClassifier model,
            we should add "estimator__" as prefix for setting their parameters in the OneVsRestClassifier wrapper.

        WARNING: this function has been deprecated because it is not compatible with the new contest in 2019.

        :param n_iter: The number of iterations for searching parameters.
        :return:
        """
        if self.args.model == 'rf':
            clf = RandomForestClassifier(n_estimators=128,
                                         class_weight=self.class_weight,
                                         n_jobs=1)
            param_dist = {
                "max_depth": [2, 4, 8, 16, 32, 64, 128, None],
                "max_features": scipy.stats.randint(1, 512),
                "min_samples_split": scipy.stats.randint(2, 512),
                "min_samples_leaf": scipy.stats.randint(2, 512),
                "criterion": ["gini", "entropy"],
            }
        elif self.args.model == 'bernoulli_nb':
            clf = BernoulliNB()
            param_dist = {
                "alpha": scipy.stats.uniform(),
                "binarize": scipy.stats.uniform(),
                "fit_prior": [True, False],
            }
        elif self.args.model == 'svm_linear':
            clf = CalibratedClassifierCV(LinearSVC())
            param_dist = {
                "penalty": ['l1', 'l2'],
                "C": [0.1, 1, 10, 100, 1000],
                "class_weight": [self.class_weight],
                "dual": [False],
            }
        elif self.args.model == 'xgboost':
            clf = XGBClassifier()
            param_dist = {
                "max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
                "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.1],
                "n_estimators": [100, 300, 500],
                "subsample": [0.8, 0.9, 1.0],
                "colsample_bytree": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                "gamma": [0, 1, 5],
                "n_jobs": [1],
            }
        else:
            raise ValueError(
                "The model {} doesn't support parameter search in current stage"
                .format(self.args.model))

        param_dist = {
            "estimator__{}".format(k): v
            for k, v in param_dist.items()
        }
        kf = KFold(n_splits=self.args.cv_num,
                   random_state=self.args.random_seed)
        # Notice that as we use clf.predict_proba in our cross-validation, we need to set needs_proba=True here
        scorer = make_scorer(anytype_f1_scorer,
                             greater_is_better=True,
                             needs_proba=True,
                             id2label=self.id2label)
        if self.args.model == 'svm_linear':
            search = GridSearchCV(clf,
                                  param_grid=param_dist,
                                  cv=kf,
                                  scoring=scorer,
                                  n_jobs=1,
                                  verbose=10)
        else:
            search = RandomizedSearchCV(clf,
                                        param_distributions=param_dist,
                                        n_iter=n_iter,
                                        cv=kf,
                                        scoring=scorer,
                                        n_jobs=1,
                                        verbose=10)

        search.fit(self.data_x, self.data_y)

        print_to_log("Random Search finished!")
        print_to_log("best_score_:\n{}".format(search.best_score_))
        print_to_log("best_params_:\n{}".format(search.best_params_))
        quit()
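
A self-contained sketch of the custom-scorer wiring above, with a toy probability-based scoring function standing in for anytype_f1_scorer (needs_proba matches the older sklearn API used in this snippet; newer releases use response_method='predict_proba'):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import BernoulliNB

def argmax_f1(y_true, y_score):
    # y_score has shape (n_samples, n_classes) because the scorer requests probabilities
    return f1_score(y_true, np.argmax(y_score, axis=-1), average='macro')

X, y = make_classification(n_samples=100, n_classes=3, n_informative=5, random_state=0)
scorer = make_scorer(argmax_f1, greater_is_better=True, needs_proba=True)
search = RandomizedSearchCV(BernoulliNB(), {"alpha": [0.1, 0.5, 1.0]}, n_iter=2,
                            cv=3, scoring=scorer, random_state=0)
search.fit(X, y)
print(search.best_params_, round(search.best_score_, 3))
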
Example #14
File: train.py, Project: berniebear/trec_is
    def _create_single_model(self, param=None):
        """
        All parameters are searched by cross-validation.

        Here are some parameters we used in the 2018 setting, which have been deprecated:
            [bernoulli_nb]:
                if self.args.class_weight_scheme == 'balanced':
                    param = {'alpha': 0.8490, 'binarize': 0.3086, 'fit_prior': True}
                else:
                    param = {'alpha': 0.4974, 'binarize': 0.7751, 'fit_prior': True}

        :param param:
        :return:
        """
        model_name = self.args.model
        print_to_log("The model used here is {0}".format(model_name))
        if model_name == 'sgd_svm':
            clf = SGDClassifier(max_iter=1000,
                                tol=1e-3,
                                loss='hinge',
                                class_weight=self.class_weight)
        elif model_name == 'svm_linear':
            if not param:
                param = {
                    'class_weight': self.class_weight,
                    "C": 0.1,
                    "dual": False,
                    "penalty": "l2"
                }
            clf = CalibratedClassifierCV(LinearSVC(
                **param))  # Set dual=False when training num >> feature num
        elif model_name == 'svm_rbf':
            clf = SVC(kernel='rbf',
                      class_weight=self.class_weight,
                      gamma='auto',
                      probability=True)
        elif model_name == 'svm_rbf_scale':
            clf = SVC(kernel='rbf',
                      class_weight=self.class_weight,
                      gamma='scale',
                      probability=True)
        elif model_name == 'svm_chi2':
            clf = SVC(kernel=chi2_kernel,
                      class_weight=self.class_weight,
                      probability=True)
        elif model_name == 'gs_nb':
            clf = GaussianNB()
        elif model_name == 'bernoulli_nb':
            if not param:
                param = {
                    'alpha': 0.9916,
                    'binarize': 0.05695,
                    'fit_prior': True
                }
            clf = BernoulliNB(**param)
        elif model_name == 'rf':
            if not param:
                if self.args.class_weight_scheme == 'balanced':
                    param = {
                        'n_estimators': 128,
                        "n_jobs": self.args.n_jobs,
                        'class_weight': self.class_weight,
                        'criterion': 'gini',
                        'max_depth': 64,
                        'max_features': 213,
                        'min_samples_leaf': 5,
                        'min_samples_split': 43,
                    }
                else:
                    param = {
                        'n_estimators': 128,
                        "n_jobs": self.args.n_jobs,
                        'class_weight': self.class_weight,
                        'criterion': 'gini',
                        'max_depth': 64,
                        'max_features': 494,
                        'min_samples_leaf': 24,
                        'min_samples_split': 207,
                    }
            clf = RandomForestClassifier(**param)
        elif model_name == 'xgboost':
            if not param:
                param = {
                    'subsample': 0.9,
                    'n_jobs': 1,
                    'n_estimators': 500,
                    'max_depth': 8,
                    'learning_rate': 0.05,
                    'gamma': 0,
                    'colsample_bytree': 0.9,
                }
            clf = XGBClassifier(**param)
        else:
            raise NotImplementedError

        # The current version of sklearn (0.21) doesn't support OneVsRestClassifier with a customized class weight.
        if self.args.class_weight_scheme == 'balanced':
            return OneVsRestClassifier(clf, n_jobs=self.args.n_jobs)
        else:
            return clf
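
For the svm_linear branch, LinearSVC is wrapped in CalibratedClassifierCV so that predict_proba becomes available, and OneVsRestClassifier then handles the multi-label setting. A toy sketch of that composition (random multi-label data, assumed shapes):

import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
X = rng.rand(40, 8)
Y = rng.randint(0, 2, size=(40, 3))                # binarized multi-label targets

clf = OneVsRestClassifier(CalibratedClassifierCV(LinearSVC(dual=False)))
clf.fit(X, Y)
print(clf.predict_proba(X[:2]))                    # shape (2, 3): one score per label
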
Example #15
def evaluate(classificationLabelFiles, runFile, ontologyFile, out_dir):
    # --------------------------------------------------
    # TREC IS 2018 Evaluation Script
    # Used to evaluate TREC-IS runs
    # --------------------------------------------------
    version = 1.0  # Notebook Version Number
    print_to_log("Start evaluation ...")

    # Configuration (Change this to match your setting)
    # System output file to evaluate:
    # runFile = "../out/predict.txt.gz"
    runName = "myrun"

    # The location of the ground truth data against which to compare the run
    # classificationLabelFiles = [
    #     "assr1.test",
    #     "assr2.test",
    #     "assr3.test",
    #     "assr4.test",
    #     "assr5.test",
    #     "assr6.test"
    # ]
    # classificationLabelFiles = ['../data/TRECIS-2018-TestEvents-Labels/' + filename for filename in
    #                             classificationLabelFiles]

    # The location of the ontology file
    # ontologyFile = "../data/" + "ITR-H.types.v2.json"

    # --------------------------------------------------
    # Static data for the 2018 edition
    # --------------------------------------------------
    # Identifiers for the test events
    eventIdentifiers = [
        "albertaFloods2013", "australiaBushfire2013", "bostonBombings2013",
        "chileEarthquake2014", "flSchoolShooting2018",
        "guatemalaEarthquake2012", "italyEarthquakes2012", "joplinTornado2011",
        "manilaFloods2013", "nepalEarthquake2015", "parisAttacks2015",
        "philipinnesFloods2012", "queenslandFloods2013", "typhoonHagupit2014",
        "typhoonYolanda2013"
    ]

    # Mapping of priority labels (by assessors) into numerical values [0-1]
    # We use this to calculate error against the participant priority scores
    priorityMapping = {
        "Critical": 1.0,
        "High": 0.75,
        "Medium": 0.5,
        "Low": 0.25
    }

    resultsFile = open(os.path.join(out_dir, runName + ".results.overall.txt"),
                       "w+")
    resultsFile.write("TREC-IS 2018 Notebook Evaluator v" + str(version) +
                      "\n")
    resultsFile.write("Run: " + runName + " (" + runFile + ")" + "\n")
    resultsFile.write("" + "\n")

    perTopicFile = open(
        os.path.join(out_dir, runName + ".results.pertopic.txt"), "w+")
    perTopicFile.write("TREC-IS 2018 Notebook Evaluator v" + str(version) +
                       "\n")
    perTopicFile.write("Run: " + runName + " (" + runFile + ")" + "\n")
    perTopicFile.write("" + "\n")

    perEventFile = open(
        os.path.join(out_dir, runName + ".results.perevent.txt"), "w+")
    perEventFile.write("TREC-IS 2018 Notebook Evaluator v" + str(version) +
                       "\n")
    perEventFile.write("Run: " + runName + " (" + runFile + ")" + "\n")
    perEventFile.write("" + "\n")

    # --------------------------------------------------
    # Processing Starts Here
    # --------------------------------------------------
    import json
    from pprint import pprint
    import gzip

    # --------------------------------------------------
    # Stage 1: Load the ground truth dataset
    # --------------------------------------------------

    groundtruthJSON = []
    for groundtruthFile in classificationLabelFiles:
        print_to_log("Reading " + groundtruthFile)
        with open(groundtruthFile, encoding='utf-8') as groundtruthJSONFile:
            groundtruthJSON.append(json.load(groundtruthJSONFile))
    # pprint(groundtruthJSON["events"])

    # --------------------------------------------------
    # Stage 2: Load run file (assumes gzip)
    # --------------------------------------------------
    with gzip.open(runFile, 'rb') as openRunFile:
        runContents = openRunFile.readlines()  # lines not yet decoded
    # pprint(runContents[0])

    # --------------------------------------------------
    # Stage 3: Load the categories
    # --------------------------------------------------
    with open(ontologyFile, encoding='utf-8') as ontologyJSONFile:
        ontologyJSON = json.load(ontologyJSONFile)

    informationTypes2Index = {}  # category -> numerical index
    informationTypesShort2Index = {
    }  # category short form (e.g. Report-EmergingThreats vs. EmergingThreats) -> numerical index
    informationTypeIndex = 0
    for informationTypeJSON in ontologyJSON["informationTypes"]:
        informationTypeId = informationTypeJSON["id"]
        informationTypes2Index[informationTypeId] = informationTypeIndex
        informationTypesShort2Index[informationTypeId.split("-")
                                    [1]] = informationTypeIndex
        informationTypeIndex = informationTypeIndex + 1

    # -----------------------------------------------------------
    # Stage 4: Produce ground truth maps between tweetIds and categories
    # -----------------------------------------------------------
    # Notes: Ground truth is used as a base, if a run includes tweets
    #        not in the ground truth they will be ignored
    # Assumptions: A tweet will not be returned for multiple events

    tweetId2InfoCategories = {
    }  # tweet id -> Array of categories selected by assessors
    tweetId2PriorityCategory = {
    }  # tweet id -> priority label (Critical,High,Medium,Low)
    index2TweetId = {}  # ordered tweets
    event2tweetIds = {}  # event -> tweet ids for tweets within that event

    tweetIndex = 0
    for groundtruth in groundtruthJSON:
        for eventJSON in groundtruth["events"]:
            eventid = eventJSON["eventid"]

            # two events were split and assessed in parts, re-name these so they are correctly read
            if eventid.endswith("S1") | eventid.endswith(
                    "S2") | eventid.endswith("S3") | eventid.endswith("S4"):
                eventid = eventid[:-2]

            if not event2tweetIds.get(eventid):
                event2tweetIds[eventid] = []

            if any(eventid in s for s in eventIdentifiers):
                # iterate over tweets in the event
                for tweetJSON in eventJSON["tweets"]:
                    tweetid = tweetJSON["postID"]
                    categories = tweetJSON["categories"]
                    priority = tweetJSON["priority"]

                    event2tweetIds[eventid].append(tweetid)

                    # check categories for name issues and correct if possible
                    for categoryId in categories:
                        if not any(
                                categoryId in s
                                for s in informationTypesShort2Index.keys()):
                            print_to_log("Found unknown category " +
                                         categoryId)

                    tweetId2InfoCategories[tweetid] = categories
                    tweetId2PriorityCategory[tweetid] = priority
                    index2TweetId[tweetIndex] = tweetid
                    tweetIndex = tweetIndex + 1
            else:
                print_to_log(
                    "WARN: Found ground truth data for event not in the 2018 topic set "
                    + eventid + ", ignoring...", 'warning')

    # -----------------------------------------------------------
    # Stage 5: Produce run predicted maps between tweetIds and categories
    # -----------------------------------------------------------
    tweetId2RunInfoCategory = {
    }  # tweet id -> predicted category by participant system
    tweetId2RunPriorityCategory = {
    }  # tweet id -> importance score from participant system

    maxPrediction = -999999
    minPrediction = 999999

    for runLine in runContents:
        predictionParts = runLine.decode("utf-8").replace("\t", " ").split(" ")
        if (len(predictionParts) < 6):
            continue
        else:
            tweetId = predictionParts[2]
            category = predictionParts[5]
            priority = predictionParts[4]

            priorityNum = float(priority)
            if (maxPrediction < priorityNum):
                maxPrediction = priorityNum
            if (minPrediction > priorityNum):
                minPrediction = priorityNum

            tweetId2RunInfoCategory[tweetId] = category
            tweetId2RunPriorityCategory[tweetId] = priority

    # --------------------------------------------------
    # Stage 6: Create ground truth vectors per category
    # --------------------------------------------------

    category2GroundTruth = {
    }  # category -> tweet vector with binary 1 vs all ground truth category labels

    for categoryId in informationTypes2Index.keys():
        categoryIdShort = categoryId.split("-")[1]
        categoryVector = []
        for i in range(len(index2TweetId)):
            tweetId = index2TweetId[i]
            categories = tweetId2InfoCategories.get(tweetId)
            # pprint(categories)
            if any(categoryIdShort in s for s in categories):
                categoryVector.append(1)
            else:
                categoryVector.append(0)
        category2GroundTruth[categoryId] = categoryVector

    # pprint(category2GroundTruth)

    # --------------------------------------------------
    # Stage 7: Create run vectors per category
    # --------------------------------------------------
    # Assumptions: If the run misses a tweet, we assume it has
    #              no categories
    category2Predicted = {
    }  # category -> tweet vector with binary 1 vs all predicted by system labels

    for categoryId in informationTypes2Index.keys():
        categoryVector = []
        for i in range(len(index2TweetId)):
            tweetId = index2TweetId[i]

            if tweetId2RunInfoCategory.get(tweetId):
                category = tweetId2RunInfoCategory.get(tweetId)
                if category == categoryId:
                    categoryVector.append(1)
                else:
                    categoryVector.append(0)
            else:
                categoryVector.append(0)

        category2Predicted[categoryId] = categoryVector

    # pprint(category2Predicted)

    # --------------------------------------------------
    # Stage 8: Make event category vectors
    # --------------------------------------------------

    event2groundtruth = {
    }  # event -> category -> tweet vector with binary 1 vs all ground truth category labels
    for eventId in eventIdentifiers:
        eventCategories = {}
        for categoryId in informationTypes2Index.keys():
            categoryIdShort = categoryId.split("-")[1]
            categoryVector = []
            for tweetId in event2tweetIds.get(eventId):
                # ground truth: positive if any assessor-selected category matches
                categories = tweetId2InfoCategories.get(tweetId)
                if any(categoryIdShort in s for s in categories):
                    categoryVector.append(1)
                else:
                    categoryVector.append(0)

            eventCategories[categoryId] = categoryVector
        event2groundtruth[eventId] = eventCategories

    event2prediction = {
    }  # event -> category -> tweet vector with binary 1 vs all predicted by system labels
    for eventId in eventIdentifiers:
        eventCategories = {}
        for categoryId in informationTypes2Index.keys():
            categoryIdShort = categoryId.split("-")[1]
            categoryVector = []
            for tweetId in event2tweetIds.get(eventId):
                # run prediction: positive if the system predicted exactly this category
                category = tweetId2RunInfoCategory.get(tweetId)
                if category == categoryId:
                    categoryVector.append(1)
                else:
                    categoryVector.append(0)

            eventCategories[categoryId] = categoryVector
        event2prediction[eventId] = eventCategories

    # -----------------------------------------------------------
    # Stage 9: Make priority classification vectors
    # -----------------------------------------------------------

    category2GroundTruthPriority = {
    }  # category -> tweet vector with binary 1 vs all ground truth priority labels

    for categoryId in informationTypes2Index.keys():
        categoryIdShort = categoryId.split("-")[1]
        priorityVector = []
        for i in range(len(index2TweetId)):
            tweetId = index2TweetId[i]
            categories = tweetId2InfoCategories.get(tweetId)
            if any(categoryIdShort in s for s in categories):
                priority = tweetId2PriorityCategory.get(tweetId)
                priorityAsNumber = priorityMapping[priority]
                priorityVector.append(priorityAsNumber)
        category2GroundTruthPriority[categoryId] = priorityVector

    category2PredictedPriority = {
    }  # category -> tweet vector with binary 1 vs all predicted by system labels

    for categoryId in informationTypes2Index.keys():
        categoryIdShort = categoryId.split("-")[1]
        categoryVector = []
        for i in range(len(index2TweetId)):
            tweetId = index2TweetId[i]
            categories = tweetId2InfoCategories.get(tweetId)
            if any(categoryIdShort in s for s in categories):
                if tweetId2RunPriorityCategory.get(tweetId):
                    priority = float(tweetId2RunPriorityCategory.get(tweetId))

                    # scale to between 0 and 1
                    if (maxPrediction - minPrediction) > 0:
                        normPriority = (priority - minPrediction) / (
                            maxPrediction - minPrediction)
                    else:
                        normPriority = 0.0
                    # bound by min and max on ground truth
                    if (normPriority < priorityMapping["Low"]):
                        normPriority = priorityMapping["Low"]
                    if (normPriority > priorityMapping["Critical"]):
                        normPriority = priorityMapping["Critical"]

                    categoryVector.append(normPriority)
                else:
                    categoryVector.append(
                        priorityMapping["Low"])  # default to low priority

        category2PredictedPriority[categoryId] = categoryVector

    # --------------------------------------------------
    # EVALUATION 1: Information Type Categorization (Multi-type)
    # Overall performance
    # --------------------------------------------------
    # Multi-type (1 vs All): Tweets have multiple information types, aim: predict all of them
    # Average performance over information types
    # Macro averaged (information types have equal weight)
    # Positive class is the target class

    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.metrics import f1_score
    from sklearn.metrics import accuracy_score

    avgPrecision = 0.0
    avgRecall = 0.0
    avgF1 = 0.0
    avgAccuracy = 0.0

    for categoryId in informationTypes2Index.keys():
        avgPrecision = avgPrecision + precision_score(
            category2GroundTruth[categoryId],
            category2Predicted[categoryId],
            average='binary')
        avgRecall = avgRecall + recall_score(category2GroundTruth[categoryId],
                                             category2Predicted[categoryId],
                                             average='binary')
        avgF1 = avgF1 + f1_score(category2GroundTruth[categoryId],
                                 category2Predicted[categoryId],
                                 average='binary')
        avgAccuracy = avgAccuracy + accuracy_score(
            category2GroundTruth[categoryId], category2Predicted[categoryId])

    print_to_log(
        "Information Type Precision (positive class, multi-type, macro): " +
        str(avgPrecision / len(informationTypes2Index)))
    print_to_log(
        "Information Type Recall (positive class, multi-type, macro): " +
        str(avgRecall / len(informationTypes2Index)))
    print_to_log("Information Type F1 (positive class, multi-type, macro): " +
                 str(avgF1 / len(informationTypes2Index)))
    print_to_log("Information Type Accuracy (overall, multi-type, macro): " +
                 str(avgAccuracy / len(informationTypes2Index)))

    resultsFile.write("--------------------------------------------------" +
                      "\n")
    resultsFile.write(
        "EVALUATON: Information Type Categorization (Multi-type)" + "\n")
    resultsFile.write("Overall performance" + "\n")
    resultsFile.write("--------------------------------------------------" +
                      "\n")
    resultsFile.write(
        "> Information Type Precision (positive class, multi-type, macro):" +
        "\t" + str(avgPrecision / len(informationTypes2Index)) + "\n")
    resultsFile.write(
        "> Information Type Recall (positive class, multi-type, macro):" +
        "\t" + str(avgRecall / len(informationTypes2Index)) + "\n")
    resultsFile.write(
        "> Information Type F1 (positive class, multi-type, macro):" + "\t" +
        str(avgF1 / len(informationTypes2Index)) + "\n")
    resultsFile.write(
        "> Information Type Accuracy (overall, multi-type, macro):" + "\t" +
        str(avgAccuracy / len(informationTypes2Index)) + "\n")
    resultsFile.write("" + "\n")

    # --------------------------------------------------
    # EVALUATION 6: Information Priority Level
    # Overall Performance
    # --------------------------------------------------
    # Average error over information types, lower is better
    # Macro average (categories have equal weight)

    from sklearn.metrics import mean_squared_error

    avgMSE = 0.0
    for categoryId in informationTypes2Index.keys():
        groundTruthPriorities = category2GroundTruthPriority[categoryId]
        predictedPriorities = category2PredictedPriority[categoryId]

        error = mean_squared_error(groundTruthPriorities, predictedPriorities)
        avgMSE = avgMSE + error

    print_to_log("Priority Estimation Error (mean squared error, macro): " +
                 str(avgMSE / len(informationTypes2Index)))

    resultsFile.write("--------------------------------------------------" +
                      "\n")
    resultsFile.write("EVALUATON: Information Priority Level" + "\n")
    resultsFile.write("Overall Performance" + "\n")
    resultsFile.write("--------------------------------------------------" +
                      "\n")
    resultsFile.write(
        "> Priority Estimation Error (mean squared error, macro): " +
        str(avgMSE / len(informationTypes2Index)) + "\n")
    resultsFile.write("\n")

    # --------------------------------------------------
    # EVALUATION 8: Information Type Categorization (Any-type)
    # Overall Performance (Micro Average)
    # --------------------------------------------------
    # Any-type: Tweets have multiple information types, aim: predict any one of them
    # Categorization performance where a system gets full
    # score if it picked any of the information types that
    # the human assessor selected.
    # Micro Average (more common information types have higher weight)

    truePositive = 0  # system predicted any of the categories selected by the human assessor
    trueNegative = 0  # system and human assessor both selected either Other-Irrelevant or Other-Unknown
    falsePositive = 0  # system predicted a substantive category that the human assessor did not select
    falseNegative = 0  # system predicted Other-Irrelevant or Other-Unknown but the human assessor selected something different

    for i in range(len(index2TweetId)):
        tweetId = index2TweetId[i]

        groundTruthCategories = tweetId2InfoCategories.get(tweetId)
        predictedCategory = tweetId2RunInfoCategory.get(tweetId).split("-")[1]

        categoryMatchFound = False

        isNegativeExample = False
        if (predictedCategory == "Irrelevant") | (predictedCategory
                                                  == "Unknown"):
            isNegativeExample = True

        for groundTruthCategory in groundTruthCategories:
            if (groundTruthCategory == predictedCategory):
                categoryMatchFound = True

        if categoryMatchFound & isNegativeExample:
            trueNegative = trueNegative + 1
        if categoryMatchFound & (not isNegativeExample):
            truePositive = truePositive + 1
        if (not categoryMatchFound) & isNegativeExample:
            falseNegative = falseNegative + 1
        if (not categoryMatchFound) & (not isNegativeExample):
            falsePositive = falsePositive + 1

    # print_to_log (str(truePositive)+" "+str(trueNegative)+" "+str(falsePositive)+" "+str(falseNegative))

    precision = truePositive / (truePositive + falsePositive)
    recall = truePositive / (truePositive + falseNegative)

    print_to_log("Information Type Precision (any valid type, micro): " +
                 str(precision))
    print_to_log("Information Type Recall (any valid type, micro): " +
                 str(recall))

    f1 = 2 * ((precision * recall) / (precision + recall))
    accuracy = (truePositive + trueNegative) / (truePositive + trueNegative +
                                                falsePositive + falseNegative)

    print_to_log("Information Type F1 (any valid type, micro): " + str(f1))
    print_to_log("Information Type Accuracy (any valid type, micro): " +
                 str(accuracy))

    resultsFile.write("--------------------------------------------------" +
                      "\n")
    resultsFile.write("EVALUATON: Information Type Categorization (Any-type)" +
                      "\n")
    resultsFile.write("Overall Performance (Micro Average)" + "\n")
    resultsFile.write("--------------------------------------------------" +
                      "\n")
    resultsFile.write(
        "> Information Type Precision (any valid type, micro): " +
        str(precision) + "\n")
    resultsFile.write("> Information Type Recall (any valid type, micro): " +
                      str(recall) + "\n")
    resultsFile.write("> Information Type F1 (any valid type, micro): " +
                      str(f1) + "\n")
    resultsFile.write("> Information Type Accuracy (any valid type, micro): " +
                      str(accuracy) + "\n")

    resultsFile.close()
    perTopicFile.close()
    perEventFile.close()
    print_to_log("All evaluation steps finished")