def extract_test_data(self, test_file: str):
    """
    Note: This function is deprecated; it was used for the old setting (train on 2018-train and test on 2018-test).
    It extracts only the X data for testing (labels are assumed to be invisible to us).
    It also returns several auxiliary structures which are useful during prediction.
    :param test_file:
    :return:
    """
    tweetid_list = []
    miss_tweetid = []
    tweetid2idx = dict()
    tweetid2incident = dict()
    test_x = []

    with open(test_file, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip().split('\t')
            incident_id, tweetid = line[0], line[2]
            tweetid_list.append(tweetid)
            tweetid2incident[tweetid] = incident_id

    tweetid_list = list(set(tweetid_list))  # Remove duplicate tweets
    for tweetid in tweetid_list:
        if tweetid in self.tweetid2feature:
            tweetid2idx[tweetid] = len(test_x)
            test_x.append(self.tweetid2feature[tweetid])
        else:
            miss_tweetid.append(tweetid)

    utils.print_to_log("There are {0}/{1} tweets that cannot be found for {2}".format(
        len(miss_tweetid), len(tweetid_list), test_file))
    test_x = np.asarray(test_x)
    utils.print_to_log("The shape of test_x is {0}".format(test_x.shape))

    return test_x, tweetid_list, tweetid2idx, tweetid2incident
def __init__(self, args, tweetid_list: List[str], tweet_content_list: List[dict], label2id: dict,
             tweet_id_out_file: str, test=False):
    """
    Use `feature_used` to control which features are used for sentence-level feature extraction.
    Currently available features:
        ['hand_crafted', 'fasttext-avg', 'fasttext-tfidf', 'glove-avg', 'glove-tfidf',
         'cbnu_user_feature', 'skip-thought', 'bert-avg/CLS-1/4/8', 'fasttext-crawl',
         'fasttext-1_2M-balanced-event', 'hashtag']
    :param args:
    :param tweetid_list:
    :param tweet_content_list:
    :param label2id:
    :param tweet_id_out_file:
    :param test:
    """
    self.args = args
    self.tweetid_list = tweetid_list
    self.tweet_content_list = tweet_content_list
    self.annotated_user_type = None
    self.label2id = label2id
    self.tweet_id_out_file = tweet_id_out_file
    self.test = test
    self.train_tweet = []
    self.train_label = []
    self.tweetid2feature = dict()
    self.feature_len = None
    self.feature_collection = []
    self.feature_used = ['hand_crafted', 'fasttext-avg', 'skip-thought', 'bert-avg-1',
                         'bert-CLS-1', 'glove-tfidf', 'fasttext-crawl']
    # Convert the priority label to a score, which will be used to train the regression model.
    self.priority2score = {'Low': 0.25, 'Medium': 0.5, 'High': 0.75, 'Critical': 1.0, 'Unknown': 0.5}
    utils.print_to_log("The features used are {}".format(self.feature_used))
def _extract_data_from_formalized_file_single_label(self, filename: str):
    """
    Note: This function is deprecated, because we now focus on the multi-label model, and to stay
    consistent with the official evaluation file, we need the ground-truth labels in multi-label form.
    Notice that each tweet may have several labels, and we use each of them to construct a training instance.
    :param filename: The filename of the formalized file, where each line is "{tweetid}\t{labels}\t{priority}"
    :return:
    """
    count_miss = 0
    count_total = 0
    data_x, data_y = [], []

    with open(filename, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip().split('\t')
            tweetid = line[0]
            categories = line[1].split(',')
            count_total += 1
            if tweetid in self.tweetid2feature:
                feature = self.tweetid2feature[tweetid]
                for tweet_label in categories:
                    if tweet_label not in self.label2id:
                        continue
                    data_x.append(feature)
                    data_y.append(self.label2id[tweet_label])
            else:
                count_miss += 1

    utils.print_to_log("There are {0}/{1} tweets that cannot be found for {2}".format(
        count_miss, count_total, filename))
    data_x, data_y = np.asarray(data_x), np.asarray(data_y, dtype=np.int32)
    print("The shape of data_x is {0}, shape of data_y is {1}".format(data_x.shape, data_y.shape))
    return data_x, data_y
def _simple_cross_validate(self):
    """
    Use a simple fixed NB model to double-check the correctness of the sklearn random search and of
    our own random search. It also confirms that our API, which is compatible with late fusion, is correct.
    :return:
    """
    kf = KFold(n_splits=self.args.cv_num, random_state=self.args.random_seed)
    metric_values = {metric_name: [] for metric_name in self.metric_names}
    clf = BernoulliNB(alpha=0.8490, binarize=0.3086, fit_prior=True)
    clf = OneVsRestClassifier(clf, n_jobs=self.args.n_jobs)

    for train_idx_list, test_idx_list in kf.split(self.data_x, self.data_y):
        X_train = self.data_x[train_idx_list]
        y_train = self.data_y[train_idx_list]
        X_test = self.data_x[test_idx_list]
        y_test = self.data_y[test_idx_list]
        clf.fit(X_train, y_train)
        y_predict_score = clf.predict_proba(X_test)
        y_predict = np.argmax(y_predict_score, axis=-1)
        metric_results = utils.evaluate_any_type(y_test, y_predict, self.id2label)
        for metric_name in self.metric_names:
            metric_values[metric_name].append([metric_results[metric_name], len(y_test)])

    metric_weighted_avg = self._get_weighted_avg(metric_values)
    for metric_name in ['f1']:
        print_to_log('The {0} score in cross validation is {1}'.format(metric_name, metric_values[metric_name]))
        print_to_log('The average {0} score is {1}'.format(metric_name, metric_weighted_avg[metric_name]))
    quit()
def _extract_data_from_formalized_file_v1(self, filename: str):
    """
    Note: This function is deprecated.
    :param filename:
    :return:
    """
    count_miss = 0
    count_total = 0

    if self.args.event_wise:
        data_x = {event_type: [] for event_type in utils.idx2event_type}
        data_y = {event_type: [] for event_type in utils.idx2event_type}
        event2idx_list = {event_type: [] for event_type in utils.idx2event_type}
    else:
        data_x, data_y = [], []

    with open(filename, 'r', encoding='utf8') as f:
        for idx, line in enumerate(f):
            line = line.strip().split('\t')
            tweetid = line[0]
            event_type = line[3]
            # The 2018train + 2018test data will not filter out any label.
            categories = [self.label2id[label] for label in line[1].split(',')]
            count_total += 1
            if tweetid in self.tweetid2feature:
                feature = self.tweetid2feature[tweetid]
                if self.args.event_wise:
                    data_x[event_type].append(feature)
                    data_y[event_type].append(categories)
                    event2idx_list[event_type].append(idx)
                else:
                    data_x.append(feature)
                    data_y.append(categories)
            else:
                count_miss += 1

    utils.print_to_log("There are {0}/{1} tweets that cannot be found for {2}".format(
        count_miss, count_total, filename))

    if self.args.event_wise:
        for event_type in utils.idx2event_type:
            data_x[event_type] = np.asarray(data_x[event_type])
            data_y[event_type] = np.asarray(data_y[event_type])
        return data_x, data_y, event2idx_list, idx + 1
    else:
        return np.asarray(data_x), np.asarray(data_y)
def _extract_score_from_formalized_file(self, filename: str):
    """
    For extracting scores, we currently don't support event_wise mode.
    Everything else is similar to `_extract_data_from_formalized_file_v2`.
    :param filename: File to extract information from.
    :return:
    """
    data_x, data_y = [], []
    count_unk = 0

    with open(filename, 'r', encoding='utf8') as f:
        for idx, line in enumerate(f):
            line = line.strip().split('\t')
            tweetid = line[0]
            priority_label = line[2]
            if priority_label == 'Unknown':
                count_unk += 1
            score = self.priority2score[priority_label]
            feature = self.tweetid2feature[tweetid]
            data_x.append(feature)
            data_y.append(score)

    utils.print_to_log("There are {} Unknown priority labels.".format(count_unk))
    return np.asarray(data_x), np.asarray(data_y)
def train_on_all(self):
    """
    A wrapper for training on all data, which prepares for the prediction on test data.
    Notice that we don't use cross-validation here, because cv is only used for parameter choosing.
    Now that the parameters have been determined, we want to train on all the data we have
    (self.data_x and self.data_y).
    :return:
    """
    custom_postfix = '_{}'.format(self.event_type) if self.event_type is not None else ''
    model_save_name = '{0}{1}.pkl'.format(self.args.model, custom_postfix)
    ckpt_file = os.path.join(self.args.model_dir, model_save_name)

    if os.path.isfile(ckpt_file) and not self.args.force_retrain:
        print_to_log("The ckpt file exists, and force_retrain is not set, so load the model from {}".format(ckpt_file))
        with open(ckpt_file, 'rb') as f:
            self.clf = pickle.load(f)
    else:
        self._create_model()
        self._binarize_data_y()
        self._fit_data(self.data_x, self.data_y)
        with open(ckpt_file, 'wb') as f:
            pickle.dump(self.clf, f)
def _cross_validate(self, target_metric=None):
    """
    If we are performing event-wise training, we need to return the metrics for each run (event).
    Note: If you want a more balanced k-fold split, you can refer to `proba_mass_split` in utils.py,
        or to `stratify_split` in utils.py, which is implemented based on the Sechidis et al. paper.
    For the 2018 task, which uses any-type evaluation, you can use
        metric_results = utils.evaluate_any_type(y_test, y_predict, self.id2label)
    :param target_metric: If specified, it is the target metric we care about during hyper-parameter tuning.
    :return:
    """
    print_to_log('Use {} fold cross validation'.format(self.args.cv_num))
    metric_values = {metric_name: [] for metric_name in self.metric_names}
    dev_predict = np.zeros_like(self.data_y, dtype=np.float64)
    index_list = self._get_k_fold_index_list()

    for train_idx_list, test_idx_list in index_list:
        X_train = self.data_x[train_idx_list]
        y_train = self.data_y[train_idx_list]
        X_test = self.data_x[test_idx_list]
        y_test = self.data_y[test_idx_list]
        self._fit_data(X_train, y_train)
        predict_score = self._get_predict_score(X_test)
        dev_predict[test_idx_list] = predict_score
        metric_results = utils.evaluate_2019B(y_test, predict_score, self.informative_label_idx, self.args)
        for metric_name in self.metric_names:
            metric_values[metric_name].append([metric_results[metric_name], len(y_test)])

    metric_weighted_avg = self._get_weighted_avg(metric_values)
    for metric_name in self.metric_names:
        print_to_log('The {0} score in cross validation is {1}'.format(metric_name, metric_values[metric_name]))
        print_to_log('The average {0} score is {1}'.format(metric_name, metric_weighted_avg[metric_name]))

    if self.args.search_best_parameters:
        return metric_weighted_avg[target_metric]

    return {metric_name: metric_weighted_avg[metric_name] for metric_name in self.metric_names}, dev_predict
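# The docstring above points to `proba_mass_split` / `stratify_split` (after Sechidis et al.) for
# more balanced multi-label folds. As an illustrative sketch only -- not the helper used in this
# repo -- the same idea can be expressed with scikit-multilearn's IterativeStratification, assuming
# that package is installed and the labels form a binary indicator matrix of shape (n_samples, n_labels).
def balanced_kfold_index_list(data_x, data_y, n_splits=5):
    """Yield (train_idx, test_idx) pairs whose folds keep the label distribution roughly balanced."""
    from skmultilearn.model_selection import IterativeStratification
    k_fold = IterativeStratification(n_splits=n_splits, order=1)
    for train_idx, test_idx in k_fold.split(data_x, data_y):
        yield train_idx, test_idx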
def cross_validate(self):
    regressor = self._get_regressor()
    param_dist = {
        'alpha': [0.1, 0.5, 0.7, 1.0, 2.0, 5.0],
        'fit_intercept': [True, False],
    }
    search = GridSearchCV(regressor, param_grid=param_dist, cv=5, verbose=10,
                          scoring='neg_mean_squared_error')
    search.fit(self.data_x, self.data_y)
    print_to_log("Grid Search for Regression finished!")
    print_to_log("best_score_:\n{}".format(search.best_score_))
    print_to_log("best_params_:\n{}".format(search.best_params_))
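# `_get_regressor` is defined elsewhere in this class; the searched parameters (`alpha`,
# `fit_intercept`) together with MSE scoring suggest an L2-regularized linear model. A minimal
# sketch of what such a factory could return, assuming sklearn's Ridge is an acceptable stand-in
# (illustration only, not necessarily the regressor used in this repo):
def _example_get_regressor():
    from sklearn.linear_model import Ridge
    # Ridge exposes both `alpha` and `fit_intercept`, matching the grid searched above.
    return Ridge()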
def predict(self, data_x: np.ndarray, tweetid_list: list, tweetid2idx: list, tweetid2incident: dict,
            id2label: list, short2long_label: dict, majority_label: str, out_file: str):
    """
    For those missed tweetids (which cannot be found via the Twitter API), we use the majority label
    as the prediction result. As we can see in the evaluation script, the rank field doesn't matter.
    :param data_x: Features of the data.
    :param tweetid_list:
    :param tweetid2idx: Can find the actual idx of this tweetid in data_x.
    :param tweetid2incident:
    :param id2label:
    :param short2long_label: The output format needs the long label in the form of A-B.
    :param majority_label:
    :param out_file:
    :return:
    """
    fout = open(out_file, 'w', encoding='utf8')
    predict_res = self._predict_data(data_x)
    count_label = []

    for tweetid in tweetid_list:
        incident = tweetid2incident[tweetid]
        label = id2label[predict_res[tweetid2idx[tweetid]]] if tweetid in tweetid2idx else majority_label
        label = short2long_label[label]
        fout.write("{0}\tQ0\t{1}\t1\t1.0\t{2}\tmyrun\n".format(incident, tweetid, label))
        count_label.append({"tweet_id": tweetid, "label": label})
    fout.close()

    df = pd.DataFrame(count_label)
    print_to_log("{} rows have been replaced due to missing tweetids".format(
        len(tweetid_list) - len(tweetid2idx)))
    print_to_log("The count of different labels in prediction results:\n{}".format(
        df.groupby("label").count()))
    print_to_log("The prediction file has been written to {}".format(out_file))
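# Each line written by `predict` follows the run format consumed by the evaluation script below,
# i.e. "{incident}\tQ0\t{tweetid}\t{rank}\t{score}\t{long_label}\t{runtag}". A minimal sketch that
# formats a single hypothetical prediction the same way (the rank and score are placeholders, since
# the evaluation script ignores the rank field):
def format_run_line(incident_id, tweet_id, long_label, rank=1, score=1.0, runtag="myrun"):
    return "{0}\tQ0\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(incident_id, tweet_id, rank, score, long_label, runtag)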
def _collect_feature(self, feature, feat_name):
    self.feature_collection.append(feature)
    utils.print_to_log("The shape of {0}_feature is {1}".format(feat_name, feature.shape))
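# `_collect_feature` accumulates one (num_tweets, dim_i) matrix per feature family in
# self.feature_collection. A minimal sketch of how such a collection is typically merged into a
# single feature matrix per tweet (illustration only; the repo's own merging code may differ):
def merge_feature_collection(feature_collection):
    import numpy as np
    # Concatenate along the feature axis so each tweet ends up with one long feature vector.
    return np.concatenate(feature_collection, axis=1)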
def _search_by_our_own(self, n_iter):
    """
    Call our own class method to perform the random search.
    The drawback is that the parameter settings cannot be evaluated in parallel.
    :param n_iter:
    :return:
    """
    if self.args.model == 'rf':
        param_dist = {
            "max_depth": [2, 4, 8, 16, 32, 64, 128, None],
            "max_features": scipy.stats.randint(1, 512),
            "min_samples_split": scipy.stats.randint(2, 512),
            "min_samples_leaf": scipy.stats.randint(2, 512),
            "criterion": ["gini", "entropy"],
            "n_estimators": [128],
            "class_weight": [self.class_weight],
            "n_jobs": [1 if self.args.class_weight_scheme == 'balanced' else 4],
        }
    elif self.args.model == 'bernoulli_nb':
        param_dist = {
            "alpha": scipy.stats.uniform(),
            "binarize": scipy.stats.uniform(),
            "fit_prior": [True, False],
        }
    elif self.args.model == 'svm_linear':
        param_dist = {
            "penalty": ['l1', 'l2'],
            "C": [0.1, 1, 10, 100, 1000],
            "class_weight": [self.class_weight],
            "dual": [False],
        }
    elif self.args.model == 'xgboost':
        param_dist = {
            "max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
            "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.1],
            "n_estimators": [100, 300, 500],
            "subsample": [0.8, 0.9, 1.0],
            "colsample_bytree": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            "gamma": [0, 1, 5],
            "n_jobs": [1],
        }
    else:
        raise ValueError("The model {} doesn't support parameter search in the current stage".format(self.args.model))

    if self.args.model == 'svm_linear':
        param_list = list(ParameterGrid(param_dist))
    else:
        param_list = list(ParameterSampler(param_dist, n_iter=n_iter))

    metric_name = 'high_prior_f1'
    best_metric = float("-inf")
    best_param = dict()
    for i, param in enumerate(param_list):
        if i < self.args.search_skip:
            continue
        print_to_log("Using the parameter set: {}".format(param))
        self._create_model(param)
        current_metric = self._cross_validate(target_metric=metric_name)
        if current_metric > best_metric:
            best_metric = current_metric
            best_param = param
        if (i + 1) % self.args.search_print_interval == 0:
            print_to_log("After searching {0} sets of parameters, current best is {1}, best {3} is {2}".format(
                i + 1, best_param, best_metric, metric_name))

    print_to_log("The Random search finished!")
    print_to_log("The best {0} is {1}".format(metric_name, best_metric))
    print_to_log("The best parameter is {}".format(best_param))
    quit()
def _search_by_sklearn(self, n_iter):
    """
    Use the RandomizedSearchCV API of sklearn, but with a customized scoring function.
    The advantage is that it parallelizes well (however, according to the warning
    "Multiprocessing-backed parallel loops cannot be nested", if the model itself is parallelized,
    the random search will be serialized automatically). Because parallel jobs cannot be nested,
    we can set the model to be parallel and the search to be sequential, or the model to be
    sequential and the search to be parallel.
    Note that as the model clf is stored as an attribute named `estimator` inside the
    OneVsRestClassifier model, we should add the "estimator__" prefix when setting its parameters
    through the OneVsRestClassifier wrapper.
    WARNING: this function has been deprecated because it is not compatible with the new contest in 2019.
    :param n_iter: The number of iterations for searching parameters.
    :return:
    """
    if self.args.model == 'rf':
        clf = RandomForestClassifier(n_estimators=128, class_weight=self.class_weight, n_jobs=1)
        param_dist = {
            "max_depth": [2, 4, 8, 16, 32, 64, 128, None],
            "max_features": scipy.stats.randint(1, 512),
            "min_samples_split": scipy.stats.randint(2, 512),
            "min_samples_leaf": scipy.stats.randint(2, 512),
            "criterion": ["gini", "entropy"],
        }
    elif self.args.model == 'bernoulli_nb':
        clf = BernoulliNB()
        param_dist = {
            "alpha": scipy.stats.uniform(),
            "binarize": scipy.stats.uniform(),
            "fit_prior": [True, False],
        }
    elif self.args.model == 'svm_linear':
        clf = CalibratedClassifierCV(LinearSVC())
        param_dist = {
            "penalty": ['l1', 'l2'],
            "C": [0.1, 1, 10, 100, 1000],
            "class_weight": [self.class_weight],
            "dual": [False],
        }
    elif self.args.model == 'xgboost':
        clf = XGBClassifier()
        param_dist = {
            "max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
            "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.1],
            "n_estimators": [100, 300, 500],
            "subsample": [0.8, 0.9, 1.0],
            "colsample_bytree": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            "gamma": [0, 1, 5],
            "n_jobs": [1],
        }
    else:
        raise ValueError("The model {} doesn't support parameter search in the current stage".format(self.args.model))

    param_dist = {"estimator__{}".format(k): v for k, v in param_dist.items()}
    kf = KFold(n_splits=self.args.cv_num, random_state=self.args.random_seed)
    # Notice that as we use clf.predict_proba in our cross-validation, we need to set needs_proba=True here.
    scorer = make_scorer(anytype_f1_scorer, greater_is_better=True, needs_proba=True, id2label=self.id2label)
    if self.args.model == 'svm_linear':
        search = GridSearchCV(clf, param_grid=param_dist, cv=kf, scoring=scorer, n_jobs=1, verbose=10)
    else:
        search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter, cv=kf,
                                    scoring=scorer, n_jobs=1, verbose=10)
    search.fit(self.data_x, self.data_y)

    print_to_log("Random Search finished!")
    print_to_log("best_score_:\n{}".format(search.best_score_))
    print_to_log("best_params_:\n{}".format(search.best_params_))
    quit()
def _create_single_model(self, param=None):
    """
    All parameters are searched by cross-validation. Here are some parameters we used in the 2018
    setting, which has been deprecated:
        [bernoulli_nb]:
            if self.args.class_weight_scheme == 'balanced':
                param = {'alpha': 0.8490, 'binarize': 0.3086, 'fit_prior': True}
            else:
                param = {'alpha': 0.4974, 'binarize': 0.7751, 'fit_prior': True}
    :param param:
    :return:
    """
    model_name = self.args.model
    print_to_log("The model used here is {0}".format(model_name))
    if model_name == 'sgd_svm':
        clf = SGDClassifier(max_iter=1000, tol=1e-3, loss='hinge', class_weight=self.class_weight)
    elif model_name == 'svm_linear':
        if not param:
            param = {'class_weight': self.class_weight, "C": 0.1, "dual": False, "penalty": "l2"}
        # Set dual=False when the number of training samples >> the number of features.
        clf = CalibratedClassifierCV(LinearSVC(**param))
    elif model_name == 'svm_rbf':
        clf = SVC(kernel='rbf', class_weight=self.class_weight, gamma='auto', probability=True)
    elif model_name == 'svm_rbf_scale':
        clf = SVC(kernel='rbf', class_weight=self.class_weight, gamma='scale', probability=True)
    elif model_name == 'svm_chi2':
        clf = SVC(kernel=chi2_kernel, class_weight=self.class_weight, probability=True)
    elif model_name == 'gs_nb':
        clf = GaussianNB()
    elif model_name == 'bernoulli_nb':
        if not param:
            param = {'alpha': 0.9916, 'binarize': 0.05695, 'fit_prior': True}
        clf = BernoulliNB(**param)
    elif model_name == 'rf':
        if not param:
            if self.args.class_weight_scheme == 'balanced':
                param = {
                    'n_estimators': 128,
                    'n_jobs': self.args.n_jobs,
                    'class_weight': self.class_weight,
                    'criterion': 'gini',
                    'max_depth': 64,
                    'max_features': 213,
                    'min_samples_leaf': 5,
                    'min_samples_split': 43,
                }
            else:
                param = {
                    'n_estimators': 128,
                    'n_jobs': self.args.n_jobs,
                    'class_weight': self.class_weight,
                    'criterion': 'gini',
                    'max_depth': 64,
                    'max_features': 494,
                    'min_samples_leaf': 24,
                    'min_samples_split': 207,
                }
        clf = RandomForestClassifier(**param)
    elif model_name == 'xgboost':
        if not param:
            param = {
                'subsample': 0.9,
                'n_jobs': 1,
                'n_estimators': 500,
                'max_depth': 8,
                'learning_rate': 0.05,
                'gamma': 0,
                'colsample_bytree': 0.9,
            }
        clf = XGBClassifier(**param)
    else:
        raise NotImplementedError

    # In the current version of sklearn (0.21), OneVsRestClassifier doesn't support a customized class weight.
    if self.args.class_weight_scheme == 'balanced':
        return OneVsRestClassifier(clf, n_jobs=self.args.n_jobs)
    else:
        return clf
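# `self.class_weight` is prepared elsewhere according to `args.class_weight_scheme`. For the
# 'balanced' scheme, sklearn can derive per-class weights directly from label frequencies; a
# minimal sketch assuming single-label integer targets `y` (illustration only, not the repo's
# own weighting code):
def example_balanced_class_weight(y):
    import numpy as np
    from sklearn.utils.class_weight import compute_class_weight
    classes = np.unique(y)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
    return dict(zip(classes, weights))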
def evaluate(classificationLabelFiles, runFile, ontologyFile, out_dir):
    # --------------------------------------------------
    # TREC IS 2018 Evaluation Script
    # Used to evaluate TREC-IS runs
    # --------------------------------------------------
    version = 1.0  # Notebook Version Number
    print_to_log("Start evaluation ...")

    # Configuration (Change this to match your setting)
    # System output file to evaluate:
    # runFile = "../out/predict.txt.gz"
    runName = "myrun"

    # The location of the ground truth data against which to compare the run
    # classificationLabelFiles = [
    #     "assr1.test",
    #     "assr2.test",
    #     "assr3.test",
    #     "assr4.test",
    #     "assr5.test",
    #     "assr6.test"
    # ]
    # classificationLabelFiles = ['../data/TRECIS-2018-TestEvents-Labels/' + filename
    #                             for filename in classificationLabelFiles]

    # The location of the ontology file
    # ontologyFile = "../data/" + "ITR-H.types.v2.json"

    # --------------------------------------------------
    # Static data for the 2018 edition
    # --------------------------------------------------
    # Identifiers for the test events
    eventIdentifiers = [
        "albertaFloods2013",
        "australiaBushfire2013",
        "bostonBombings2013",
        "chileEarthquake2014",
        "flSchoolShooting2018",
        "guatemalaEarthquake2012",
        "italyEarthquakes2012",
        "joplinTornado2011",
        "manilaFloods2013",
        "nepalEarthquake2015",
        "parisAttacks2015",
        "philipinnesFloods2012",
        "queenslandFloods2013",
        "typhoonHagupit2014",
        "typhoonYolanda2013"
    ]

    # Mapping of priority labels (by assessors) into numerical values [0-1]
    # We use this to calculate error against the participant priority scores
    priorityMapping = {
        "Critical": 1.0,
        "High": 0.75,
        "Medium": 0.5,
        "Low": 0.25
    }

    resultsFile = open(os.path.join(out_dir, runName + ".results.overall.txt"), "w+")
    resultsFile.write("TREC-IS 2018 Notebook Evaluator v" + str(version) + "\n")
    resultsFile.write("Run: " + runName + " (" + runFile + ")" + "\n")
    resultsFile.write("" + "\n")

    perTopicFile = open(os.path.join(out_dir, runName + ".results.pertopic.txt"), "w+")
    perTopicFile.write("TREC-IS 2018 Notebook Evaluator v" + str(version) + "\n")
    perTopicFile.write("Run: " + runName + " (" + runFile + ")" + "\n")
    perTopicFile.write("" + "\n")

    perEventFile = open(os.path.join(out_dir, runName + ".results.perevent.txt"), "w+")
    perEventFile.write("TREC-IS 2018 Notebook Evaluator v" + str(version) + "\n")
    perEventFile.write("Run: " + runName + " (" + runFile + ")" + "\n")
    perEventFile.write("" + "\n")

    # --------------------------------------------------
    # Processing Starts Here
    # --------------------------------------------------
    import json
    import gzip
    from pprint import pprint

    # --------------------------------------------------
    # Stage 1: Load the ground truth dataset
    # --------------------------------------------------
    groundtruthJSON = []
    for groundtruthFile in classificationLabelFiles:
        print_to_log("Reading " + groundtruthFile)
        with open(groundtruthFile, encoding='utf-8') as groundtruthJSONFile:
            groundtruthJSON.append(json.load(groundtruthJSONFile))
    # pprint(groundtruthJSON["events"])

    # --------------------------------------------------
    # Stage 2: Load run file (assumes gzip)
    # --------------------------------------------------
    with gzip.open(runFile, 'rb') as openRunFile:
        runContents = openRunFile.readlines()  # lines not yet decoded
    # pprint(runContents[0])

    # --------------------------------------------------
    # Stage 3: Load the categories
    # --------------------------------------------------
    with open(ontologyFile, encoding='utf-8') as ontologyJSONFile:
        ontologyJSON = json.load(ontologyJSONFile)

    informationTypes2Index = {}  # category -> numerical index
    informationTypesShort2Index = {}  # category short form (e.g. Report-EmergingThreats vs. EmergingThreats) -> numerical index
    informationTypeIndex = 0
    for informationTypeJSON in ontologyJSON["informationTypes"]:
        informationTypeId = informationTypeJSON["id"]
        informationTypes2Index[informationTypeId] = informationTypeIndex
        informationTypesShort2Index[informationTypeId.split("-")[1]] = informationTypeIndex
        informationTypeIndex = informationTypeIndex + 1

    # -----------------------------------------------------------
    # Stage 4: Produce ground truth maps between tweetIds and categories
    # -----------------------------------------------------------
    # Notes: Ground truth is used as a base; if a run includes tweets
    #        not in the ground truth, they will be ignored
    # Assumptions: A tweet will not be returned for multiple events

    tweetId2InfoCategories = {}  # tweet id -> array of categories selected by assessors
    tweetId2PriorityCategory = {}  # tweet id -> priority label (Critical, High, Medium, Low)
    index2TweetId = {}  # ordered tweets
    event2tweetIds = {}  # event -> tweet ids for tweets within that event

    tweetIndex = 0
    for groundtruth in groundtruthJSON:
        for eventJSON in groundtruth["events"]:
            eventid = eventJSON["eventid"]

            # Two events were split and assessed in parts; rename these so they are correctly read
            if eventid.endswith("S1") | eventid.endswith("S2") | eventid.endswith("S3") | eventid.endswith("S4"):
                eventid = eventid[:-2]

            if not event2tweetIds.get(eventid):
                event2tweetIds[eventid] = []

            if any(eventid in s for s in eventIdentifiers):
                # Iterate over tweets in the event
                for tweetJSON in eventJSON["tweets"]:
                    tweetid = tweetJSON["postID"]
                    categories = tweetJSON["categories"]
                    priority = tweetJSON["priority"]

                    event2tweetIds[eventid].append(tweetid)

                    # Check categories for name issues and correct if possible
                    for categoryId in categories:
                        if not any(categoryId in s for s in informationTypesShort2Index.keys()):
                            print_to_log("Found unknown category " + categoryId)

                    tweetId2InfoCategories[tweetid] = categories
                    tweetId2PriorityCategory[tweetid] = priority
                    index2TweetId[tweetIndex] = tweetid
                    tweetIndex = tweetIndex + 1
            else:
                print_to_log("WARN: Found ground truth data for event not in the 2018 topic set "
                             + eventid + ", ignoring...", 'warning')

    # -----------------------------------------------------------
    # Stage 5: Produce run predicted maps between tweetIds and categories
    # -----------------------------------------------------------
    tweetId2RunInfoCategory = {}  # tweet id -> category predicted by participant system
    tweetId2RunPriorityCategory = {}  # tweet id -> importance score from participant system

    maxPrediction = -999999
    minPrediction = 999999

    for runLine in runContents:
        predictionParts = runLine.decode("utf-8").replace("\t", " ").split(" ")
        if len(predictionParts) < 6:
            continue
        else:
            tweetId = predictionParts[2]
            category = predictionParts[5]
            priority = predictionParts[4]
            priorityNum = float(priority)
            if maxPrediction < priorityNum:
                maxPrediction = priorityNum
            if minPrediction > priorityNum:
                minPrediction = priorityNum
            tweetId2RunInfoCategory[tweetId] = category
            tweetId2RunPriorityCategory[tweetId] = priority

    # --------------------------------------------------
    # Stage 6: Create ground truth vectors per category
    # --------------------------------------------------
    category2GroundTruth = {}  # category -> tweet vector with binary 1-vs-all ground truth category labels

    for categoryId in informationTypes2Index.keys():
        categoryIdShort = categoryId.split("-")[1]
        categoryVector = []
        for i in range(len(index2TweetId)):
            tweetId = index2TweetId[i]
            categories = tweetId2InfoCategories.get(tweetId)
            # pprint(categories)
            if any(categoryIdShort in s for s in categories):
                categoryVector.append(1)
            else:
                categoryVector.append(0)
        category2GroundTruth[categoryId] = categoryVector
    # pprint(category2GroundTruth)

    # --------------------------------------------------
    # Stage 7: Create run vectors per category
    # --------------------------------------------------
    # Assumptions: If a run misses a tweet, we assume it has no categories
    category2Predicted = {}  # category -> tweet vector with binary 1-vs-all labels predicted by the system

    for categoryId in informationTypes2Index.keys():
        categoryVector = []
        for i in range(len(index2TweetId)):
            tweetId = index2TweetId[i]
            if tweetId2RunInfoCategory.get(tweetId):
                category = tweetId2RunInfoCategory.get(tweetId)
                if category == categoryId:
                    categoryVector.append(1)
                else:
                    categoryVector.append(0)
            else:
                categoryVector.append(0)
        category2Predicted[categoryId] = categoryVector
    # pprint(category2Predicted)

    # --------------------------------------------------
    # Stage 8: Make event category vectors
    # --------------------------------------------------
    event2groundtruth = {}  # event -> category -> tweet vector with binary 1-vs-all ground truth category labels
    for eventId in eventIdentifiers:
        eventCategories = {}
        for categoryId in informationTypes2Index.keys():
            categoryIdShort = categoryId.split("-")[1]
            categoryVector = []
            for tweetId in event2tweetIds.get(eventId):
                categories = tweetId2InfoCategories.get(tweetId)
                if any(categoryIdShort in s for s in categories):
                    categoryVector.append(1)
                else:
                    categoryVector.append(0)
            eventCategories[categoryId] = categoryVector
        event2groundtruth[eventId] = eventCategories

    event2prediction = {}  # event -> category -> tweet vector with binary 1-vs-all labels predicted by the system
    for eventId in eventIdentifiers:
        eventCategories = {}
        for categoryId in informationTypes2Index.keys():
            categoryIdShort = categoryId.split("-")[1]
            categoryVector = []
            for tweetId in event2tweetIds.get(eventId):
                category = tweetId2RunInfoCategory.get(tweetId)
                if category == categoryId:
                    categoryVector.append(1)
                else:
                    categoryVector.append(0)
            eventCategories[categoryId] = categoryVector
        event2prediction[eventId] = eventCategories

    # -----------------------------------------------------------
    # Stage 9: Make priority classification vectors
    # -----------------------------------------------------------
    category2GroundTruthPriority = {}  # category -> tweet vector with ground truth priority values
    for categoryId in informationTypes2Index.keys():
        categoryIdShort = categoryId.split("-")[1]
        priorityVector = []
        for i in range(len(index2TweetId)):
            tweetId = index2TweetId[i]
            categories = tweetId2InfoCategories.get(tweetId)
            if any(categoryIdShort in s for s in categories):
                priority = tweetId2PriorityCategory.get(tweetId)
                priorityAsNumber = priorityMapping[priority]
                priorityVector.append(priorityAsNumber)
        category2GroundTruthPriority[categoryId] = priorityVector

    category2PredictedPriority = {}  # category -> tweet vector with priority values predicted by the system
    for categoryId in informationTypes2Index.keys():
        categoryIdShort = categoryId.split("-")[1]
        categoryVector = []
        for i in range(len(index2TweetId)):
            tweetId = index2TweetId[i]
            categories = tweetId2InfoCategories.get(tweetId)
            if any(categoryIdShort in s for s in categories):
                if tweetId2RunPriorityCategory.get(tweetId):
                    priority = float(tweetId2RunPriorityCategory.get(tweetId))
                    # Scale to between 0 and 1
                    if (maxPrediction - minPrediction) > 0:
                        normPriority = (priority - minPrediction) / (maxPrediction - minPrediction)
                    else:
                        normPriority = 0.0
                    # Bound by min and max on ground truth
                    if normPriority < priorityMapping["Low"]:
                        normPriority = priorityMapping["Low"]
                    if normPriority > priorityMapping["Critical"]:
                        normPriority = priorityMapping["Critical"]
                    categoryVector.append(normPriority)
                else:
                    categoryVector.append(priorityMapping["Low"])  # default to low priority
        category2PredictedPriority[categoryId] = categoryVector

    # --------------------------------------------------
    # EVALUATION 1: Information Type Categorization (Multi-type)
    #               Overall performance
    # --------------------------------------------------
    # Multi-type (1 vs All): Tweets have multiple information types, aim: predict all of them
    # Average performance over information types
    # Macro averaged (information types have equal weight)
    # Positive class is the target class

    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.metrics import f1_score
    from sklearn.metrics import accuracy_score

    avgPrecision = 0.0
    avgRecall = 0.0
    avgF1 = 0.0
    avgAccuracy = 0.0

    for categoryId in informationTypes2Index.keys():
        avgPrecision = avgPrecision + precision_score(
            category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
        avgRecall = avgRecall + recall_score(
            category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
        avgF1 = avgF1 + f1_score(
            category2GroundTruth[categoryId], category2Predicted[categoryId], average='binary')
        avgAccuracy = avgAccuracy + accuracy_score(
            category2GroundTruth[categoryId], category2Predicted[categoryId])

    print_to_log("Information Type Precision (positive class, multi-type, macro): "
                 + str(avgPrecision / len(informationTypes2Index)))
    print_to_log("Information Type Recall (positive class, multi-type, macro): "
                 + str(avgRecall / len(informationTypes2Index)))
    print_to_log("Information Type F1 (positive class, multi-type, macro): "
                 + str(avgF1 / len(informationTypes2Index)))
    print_to_log("Information Type Accuracy (overall, multi-type, macro): "
                 + str(avgAccuracy / len(informationTypes2Index)))

    resultsFile.write("--------------------------------------------------" + "\n")
    resultsFile.write("EVALUATION: Information Type Categorization (Multi-type)" + "\n")
    resultsFile.write("Overall performance" + "\n")
    resultsFile.write("--------------------------------------------------" + "\n")
    resultsFile.write("> Information Type Precision (positive class, multi-type, macro):" + "\t"
                      + str(avgPrecision / len(informationTypes2Index)) + "\n")
    resultsFile.write("> Information Type Recall (positive class, multi-type, macro):" + "\t"
                      + str(avgRecall / len(informationTypes2Index)) + "\n")
    resultsFile.write("> Information Type F1 (positive class, multi-type, macro):" + "\t"
                      + str(avgF1 / len(informationTypes2Index)) + "\n")
    resultsFile.write("> Information Type Accuracy (overall, multi-type, macro):" + "\t"
                      + str(avgAccuracy / len(informationTypes2Index)) + "\n")
    resultsFile.write("" + "\n")

    # --------------------------------------------------
    # EVALUATION 6: Information Priority Level
    #               Overall Performance
    # --------------------------------------------------
    # Average error over information types, lower is better
    # Macro average (categories have equal weight)

    from sklearn.metrics import mean_squared_error

    avgMSE = 0.0
    for categoryId in informationTypes2Index.keys():
        groundTruthPriorities = category2GroundTruthPriority[categoryId]
        predictedPriorities = category2PredictedPriority[categoryId]
        error = mean_squared_error(groundTruthPriorities, predictedPriorities)
        avgMSE = avgMSE + error

    print_to_log("Priority Estimation Error (mean squared error, macro): "
                 + str(avgMSE / len(informationTypes2Index)))

    resultsFile.write("--------------------------------------------------" + "\n")
    resultsFile.write("EVALUATION: Information Priority Level" + "\n")
    resultsFile.write("Overall Performance" + "\n")
    resultsFile.write("--------------------------------------------------" + "\n")
    resultsFile.write("> Priority Estimation Error (mean squared error, macro): "
                      + str(avgMSE / len(informationTypes2Index)) + "\n")
    resultsFile.write("\n")

    # --------------------------------------------------
    # EVALUATION 8: Information Type Categorization (Any-type)
    #               Overall Performance (Micro Average)
    # --------------------------------------------------
    # Any-type: Tweets have multiple information types, aim: predict any one of them
    # Categorization performance where a system gets full score if it picked any of the
    # information types that the human assessor selected.
    # Micro Average (more common information types have higher weight)

    truePositive = 0   # the system predicted one of the informative categories selected by the human assessor
    trueNegative = 0   # the system and the human assessor both selected either Other-Irrelevant or Other-Unknown
    falsePositive = 0  # the system predicted an informative category that the human assessor did not select
    falseNegative = 0  # the system predicted Other-Irrelevant or Other-Unknown but the human assessor selected something different

    for i in range(len(index2TweetId)):
        tweetId = index2TweetId[i]
        groundTruthCategories = tweetId2InfoCategories.get(tweetId)
        predictedCategory = tweetId2RunInfoCategory.get(tweetId).split("-")[1]

        categoryMatchFound = False
        isNegativeExample = False

        if (predictedCategory == "Irrelevant") | (predictedCategory == "Unknown"):
            isNegativeExample = True

        for groundTruthCategory in groundTruthCategories:
            if groundTruthCategory == predictedCategory:
                categoryMatchFound = True

        if categoryMatchFound & isNegativeExample:
            trueNegative = trueNegative + 1
        if categoryMatchFound & (not isNegativeExample):
            truePositive = truePositive + 1
        if (not categoryMatchFound) & isNegativeExample:
            falseNegative = falseNegative + 1
        if (not categoryMatchFound) & (not isNegativeExample):
            falsePositive = falsePositive + 1

    # print_to_log(str(truePositive) + " " + str(trueNegative) + " " + str(falsePositive) + " " + str(falseNegative))

    precision = truePositive / (truePositive + falsePositive)
    recall = truePositive / (truePositive + falseNegative)
    f1 = 2 * ((precision * recall) / (precision + recall))
    accuracy = (truePositive + trueNegative) / (truePositive + trueNegative + falsePositive + falseNegative)

    print_to_log("Information Type Precision (any valid type, micro): " + str(precision))
    print_to_log("Information Type Recall (any valid type, micro): " + str(recall))
    print_to_log("Information Type F1 (any valid type, micro): " + str(f1))
    print_to_log("Information Type Accuracy (any valid type, micro): " + str(accuracy))

    resultsFile.write("--------------------------------------------------" + "\n")
    resultsFile.write("EVALUATION: Information Type Categorization (Any-type)" + "\n")
    resultsFile.write("Overall Performance (Micro Average)" + "\n")
    resultsFile.write("--------------------------------------------------" + "\n")
    resultsFile.write("> Information Type Precision (any valid type, micro): " + str(precision) + "\n")
    resultsFile.write("> Information Type Recall (any valid type, micro): " + str(recall) + "\n")
    resultsFile.write("> Information Type F1 (any valid type, micro): " + str(f1) + "\n")
    resultsFile.write("> Information Type Accuracy (any valid type, micro): " + str(accuracy) + "\n")

    resultsFile.close()
    perTopicFile.close()
    perEventFile.close()
    print_to_log("All evaluation steps finished")
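# A minimal usage sketch for the evaluator above. The paths are placeholders taken from the
# commented configuration inside `evaluate` (adjust them to your own layout); `runFile` must be a
# gzipped run file and `classificationLabelFiles` the 2018 ground-truth JSON files.
if __name__ == '__main__':
    example_label_files = ['../data/TRECIS-2018-TestEvents-Labels/assr{}.test'.format(i) for i in range(1, 7)]
    evaluate(classificationLabelFiles=example_label_files,
             runFile='../out/predict.txt.gz',
             ontologyFile='../data/ITR-H.types.v2.json',
             out_dir='../out')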