def parse_sentence(self, sentence: str, properties: Optional[Dict] = None):
    """
    Annotate a single sentence with CoreNLP, using ``self.cache`` to avoid repeated requests.

    :param sentence: a single sentence
    :param properties: additional properties for CoreNLP; they are merged over the default request properties
    :return: the parsed ``Document`` (protobuf object)
    """
    # Different CoreNLP properties can yield different annotations for the same sentence, so the cache key
    # is derived from the sentence together with the properties.
    cache_key = get_dict_hash({"sentence": sentence, "properties": properties}, shorten=False)

    if cache_key in self.cache:
        serialized = self.cache[cache_key]
    else:
        # The protobuf document returned by the client is convenient to work with but cannot be pickled for
        # the cache. We therefore store its delimited byte representation and rebuild the document from
        # those bytes below (for cache hits and misses alike).
        request_properties = {"outputFormat": "serialized"}
        if properties is not None:
            request_properties.update(properties)
        annotated = self.client.annotate(sentence, properties=request_properties)

        stream = writeToDelimitedString(annotated)
        serialized = stream.getvalue()
        stream.close()

        self.cache[cache_key] = serialized

    doc = Document()
    parseFromDelimitedString(doc, serialized)
    return doc
def transform(self, X: Tuple):
    """
    Compute the feature matrix for the given mention pairs, optionally restricted to the selected features.

    :param X: 4-tuple of (dataset, pairs, labels, unique_mentions); the labels are not used here
    :return: the feature matrix; if ``self.features_to_select`` is set, only the columns of those features
    :raises ValueError: if a feature name to select is not among the names this extractor produces
    """
    dataset, pairs, _labels, unique_mentions = X

    if self.use_cache:
        # Transformation outputs are cached along the lines of what is discussed / proposed here:
        #   (1) https://mail.python.org/pipermail/scikit-learn/2017-August/001828.html
        #   (2) https://gist.github.com/jnothman/019d594d197c98a3d6192fa0cb19c850
        # The gist cannot be adopted 1:1 since constructor parameters of a feature extractor influence the
        # output of transform(), hence each parameter set needs its own cache. The cache key is the hash of
        # the extractor's __dict__ without irrelevant entries: the name, the features to select (read-only
        # filtering applied afterwards) and data-dependent attributes ending with an underscore (see
        # https://scikit-learn.org/stable/developers/develop.html#estimated-attributes).
        ignored_attrs = ("name", "features_to_select")
        attrs = {
            key: value
            for key, value in copy.deepcopy(self.__dict__).items()
            if not (key.endswith("_") or key in ignored_attrs)
        }
        cache_key = get_dict_hash(attrs)
        cache_location = Path(tempfile.gettempdir()) / f"feature_{self.name}_{cache_key}"
        transform_fn = Memory(cache_location, verbose=0).cache(self._transform)
    else:
        transform_fn = self._transform

    pairs_as_tuples = FeatureExtractorMixin.from_np_array_back_to_list_of_tuples(pairs)
    feature_matrix = transform_fn(dataset, pairs_as_tuples, unique_mentions)

    # without a feature selection, the full matrix is the result
    if not self.features_to_select:
        return feature_matrix

    all_feature_names = self._get_plain_names_of_all_features()

    # sanity check: we can only select what we can extract
    for requested_name in self.features_to_select:
        if requested_name not in all_feature_names:
            raise ValueError("Cannot select unknown feature name: " + requested_name)

    # boolean column mask, aligned with the order of all feature names
    column_mask = np.array([name in self.features_to_select for name in all_feature_names])
    return feature_matrix[:, column_mask]
def feature_selection(config_data: Dict, config_global: Dict, logger: Logger):
    """
    Runs feature selection on the EVALUATION split.

    Recursive feature elimination with cross-validation (RFECV) is run with a Random Forest mention pair
    classifier. As implemented, 7 RFECV runs (random seeds 0 to 6) are performed, each with shuffled 6-fold
    cross-validation. A feature is selected if it was part of the RFECV support in more than half of the runs.

    NOTE(review): earlier documentation of this function spoke of "10 runs of 5-fold cross-validation", which
    does not match the values used in the code (7 runs, 6 folds) - confirm which setting is intended.

    The following files are written into the run working directory (``config_global[RUN_WORKING_DIR]``):
    ``selected_features_unaggregated.csv``, ``selected_features.txt``, ``grid_scores.csv`` and ``rfecv_plot.png``.

    :param config_data: needs the entries "eval_data_path", "oracle_mention_pair_generation" and "pairs"
    :param config_global: needs the entries RUN_WORKING_DIR and MAX_CORES
    :param logger:
    :return:
    """
    serialization_dir = config_global[RUN_WORKING_DIR]

    eval_data_path = config_data["eval_data_path"]
    oracle_mention_pair_generation = config_data["oracle_mention_pair_generation"]

    data = load_data(eval_data_path)
    X, y = get_X_and_y_for_pipeline(logger,
                                    data,
                                    doc_partitioning=None,
                                    oracle_mention_pair_generation=oracle_mention_pair_generation)

    config_base = {
        "classifier": {_TYPE: "RandomForest", _KWARGS: {"n_estimators": 100}},
        "features": {
            "extractors": get_feature_extractors_config_with_all_and_defaults(),
            "selected_features": None
        },
        "pairs": config_data["pairs"]
    }

    def run_rfecv_iteration(random_seed: int,
                            n_splits: int = 6) -> Tuple[List[str], np.ndarray, np.ndarray]:
        """
        Run one RFECV pass for the given random seed.

        :param random_seed: seed for the pipeline and for the shuffling of the CV folds
        :param n_splits: number of cross-validation folds
        :return: feature names, RFECV support mask and the mean CV score per number of features
        """
        # RFECV needs X to be matrix-like of shape (n_samples, n_features). This means we cannot use our pipeline
        # as is, because our X's are not matrix-like. So we run our pipeline up to the point where we input the
        # feature matrix + labels into the mention pair classifier, and feed that to RFECV. To do that, we need to
        # chop up the pipeline.
        config = copy.deepcopy(config_base)  # instantiate_pipeline consumes (pops) config entries
        config["random_seed"] = random_seed
        pipeline, scoring = instantiate_pipeline(logger,
                                                 config,
                                                 with_clustering=False,
                                                 scorer_should_return_single_scalar=True,
                                                 serialization_dir=serialization_dir / "pipeline")

        # remove the classifier at the end of the pipeline
        classifier_wrapper = pipeline.steps.pop(-1)[1]  # type: PredictOnTransformClassifierWrapper
        assert type(classifier_wrapper) is PredictOnTransformClassifierWrapper
        random_forest_clf = classifier_wrapper.classifier_

        # obtain feature matrix and labels
        conflated_X = pipeline.fit_transform(X, y)
        actual_X, actual_y = classifier_wrapper._take_apart_X(conflated_X)

        cv = KFold(n_splits=n_splits, random_state=random_seed, shuffle=True)

        # We set min_impurity_decrease depending on the number of instances to obtain a useful feature selection
        # result. min_impurity_decrease was determined based on a series of manual experiments with a varying
        # number of features producing random and zero values. For 1e3 instances, values between 1e-7 and 1e-1
        # were tested, and 0.0015 produced plots closest to the optimal expected result (i.e. significant peak
        # around the number of non-garbage features). Similar experiments were conducted for 1e4 and 1e5
        # instances. We interpolate between these data points (in log-log space).
        num_instances = len(actual_y)
        xp = np.log10([1e3, 1e5])
        fp = np.log10([0.0015, 0.00025])
        min_impurity_decrease = 10**np.interp(np.log10(num_instances), xp, fp)
        random_forest_clf.set_params(min_impurity_decrease=min_impurity_decrease)

        logger.info("Running feature selection...")
        selector = RFECV(estimator=random_forest_clf,
                         n_jobs=config_global[MAX_CORES],
                         cv=cv,
                         scoring="f1_weighted",  # use f1_weighted because we have very imbalanced data
                         verbose=1)
        selector.fit(actual_X, actual_y)
        logger.info("Done.")

        feature_names = get_feature_names_from_pipeline(pipeline)
        support = selector.support_

        # `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2. Prefer the mean test scores from
        # `cv_results_` when available and fall back to `grid_scores_` for older scikit-learn versions.
        cv_results = getattr(selector, "cv_results_", None)
        if cv_results is not None:
            grid_scores = np.asarray(cv_results["mean_test_score"])
        else:
            grid_scores = selector.grid_scores_
        assert len(support) == len(feature_names)

        return feature_names, support, grid_scores

    # When using oracle mention pair generation, a randomly determined subset of all mention pairs is used. This
    # has a big influence on the results. We therefore make sure to run multiple RFECV iterations with different
    # random seeds for the mention pair generation and aggregate those.
    num_rfecv_runs = 7
    results = [run_rfecv_iteration(seed) for seed in range(num_rfecv_runs)]
    feature_names, supports, grid_scores = list(zip(*results))

    # assert that all results are compatible
    assert len(set(len(s) for s in supports)) == 1
    assert len(set(get_dict_hash(fn) for fn in feature_names)) == 1

    # collect selections in DataFrame: one row per feature, one column per run
    selections = pd.DataFrame(np.vstack(supports).transpose(),
                              index=pd.Index(feature_names[0], name="feature-name"))
    # majority vote: keep features which were selected in more than half of the runs
    selected_features = selections.loc[selections.mean(axis=1) > 0.5].index.values

    # write to file(s)
    selections.to_csv(str(serialization_dir / "selected_features_unaggregated.csv"))
    with (serialization_dir / "selected_features.txt").open("w") as f:
        f.write("\n".join(selected_features))
    logger.info("Selected features: " + "\n".join(selected_features))

    # collect scores
    df_grid_scores = []
    for m in grid_scores:
        # number of features and CV-score for that number of features
        x_and_y = np.vstack([np.arange(1, len(m) + 1), m]).transpose()
        df_grid_scores.append(x_and_y)
    df_grid_scores = pd.DataFrame(np.vstack(df_grid_scores))
    df_grid_scores.columns = ["num-features", "weighted-f1"]
    df_grid_scores.to_csv(str(serialization_dir / "grid_scores.csv"))

    # plot feature selection results
    plot_destination = serialization_dir / "rfecv_plot.png"
    ax = sns.lineplot(x="num-features", y="weighted-f1", data=df_grid_scores)
    fig = ax.get_figure()
    fig.savefig(str(plot_destination))
def set_up_dir_structure(config):
    """
    Preparation steps to get a directory structure like the following:

    <working_dir>
    ├── <config_name>
    │   └── <config_hash>
    │       ├── 00__<pipeline_stage_name>
    │       ├── 01__<other_pipeline_stage_name>
    │       ├── ...
    │       └── <run directory: [<slurm job id>_[<task id>_]]<timestamp of run>>
    │           └── log_events.log
    └── global

    Only the global directory and the run directory are created here. The ``config["global"]`` dictionary is
    modified in place: config identifier, timestamp, Slurm job information, the directory paths and the
    logging destination are stored in it.

    :param config: yaml config; must contain a "global" entry with a "working_dir"
    :return: the same config object, with the additional entries in ``config["global"]``
    """
    # the identifier is computed before any run-specific entries (timestamp etc.) are added to the config
    config_id = get_dict_hash(config)

    # Store the config identifier and the current timestamp in the global config
    timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
    config_global = config["global"]
    config_global[ID] = config_id
    config_global[TIMESTAMP] = timestamp

    # use specified working dir, resolved against the working directory of the shell
    working_dir = Path.cwd() / Path(config_global["working_dir"])

    # set up a working dir shared between all runs (for cached datasets etc.)
    global_working_dir = working_dir / "global"

    # set up a config-name (!) specific directory if a name is given - this is just for better grouping of runs
    config_name = config_global.get("config_name", "pipeline")
    config_name_working_dir = working_dir / get_filename_safe_string(config_name)

    # set up a config-specific directory
    config_working_dir = config_name_working_dir / config_id

    # set up a run-specific directory - some conditional changes depending on whether we run inside Slurm or not
    run_working_dir_filename_parts = []
    config_global[TASK_ID] = None
    config_global[JOB_ID] = None
    config_global[JOB_ID_RAW] = None

    slurm_job_id = slurmee.get_job_id()
    job_array_info = slurmee.get_job_array_info()

    # if running on slurm:
    if slurm_job_id is not None:
        # use as many CPU cores as there are available
        config_global[MAX_CORES] = slurmee.get_cpus_on_node()

        # if running inside job array
        if job_array_info is not None:
            root_job_id = job_array_info["array_job_id"]
            task_id = job_array_info["task_id"]
            run_working_dir_filename_parts += [str(root_job_id), f"{task_id:0>2}"]

            config_global[JOB_ID] = root_job_id
            config_global[TASK_ID] = task_id
            # The raw job id of an array task is the task's own job id. Slurm does not guarantee that it equals
            # "array job id + task index" (and for string ids, `+` would concatenate), so it must not be derived
            # arithmetically. Assumes slurmee.get_job_id() reports this task's SLURM_JOB_ID - TODO confirm.
            config_global[JOB_ID_RAW] = slurm_job_id
        else:
            run_working_dir_filename_parts += [str(slurm_job_id)]
            config_global[JOB_ID] = slurm_job_id
            config_global[JOB_ID_RAW] = slurm_job_id
    run_working_dir_filename_parts += [timestamp]
    run_working_dir_filename = "_".join(run_working_dir_filename_parts)
    run_working_dir = config_working_dir / run_working_dir_filename

    # create folders (parents such as the config-name and config-hash directories are created implicitly)
    for directory in [global_working_dir, run_working_dir]:
        directory.mkdir(parents=True, exist_ok=True)
    config_global[GLOBAL_WORKING_DIR] = global_working_dir
    config_global[CONFIG_NAME_WORKING_DIR] = config_name_working_dir
    config_global[CONFIG_WORKING_DIR] = config_working_dir
    config_global[RUN_WORKING_DIR] = run_working_dir

    # ...redirect destination for the logging file handler
    if "logging" not in config_global:
        config_global["logging"] = {}
    config_global["logging"]["path"] = run_working_dir / "log_events.log"

    return config
def instantiate_pipeline(logger: Logger,
                         config: Dict,
                         with_clustering: bool = False,
                         use_caching: bool = False,
                         scorer_should_return_single_scalar: bool = False,
                         serialization_dir: Optional[Path] = None) -> Tuple[Pipeline, Callable]:
    """
    Builds a scikit-learn pipeline from a config dictionary and returns it together with the matching scorer.

    Note that the entries of ``config`` (and of its "pairs" sub-dictionary) are consumed, i.e. popped from the
    passed dictionaries, so that unknown leftover entries can be detected.

    :param logger:
    :param config: config dictionary with the entries "random_seed", "pairs", "features", "classifier" and,
                   when clustering is used, "clustering"
    :param with_clustering: If True, the pipeline ends with a clustering step, if False, only mention pair
                            classification is done. The scoring function depends on this choice.
    :param use_caching: Whether fit() calls for all pipeline steps, and transform() calls for features should be
                        cached.
    :param scorer_should_return_single_scalar: If True, the scoring function will return only a single metric as
                                               a scalar. This is useful for running cross-validation. If False,
                                               more metrics are returned as a pd.Series.
    :param serialization_dir: optional serialization dir, only used for debugging
    :return: sklearn pipeline and the scoring function to use for evaluation
    :raises ValueError: for leftover config entries, or if 'mpg_prediction' is configured together with clustering
    """

    def _debug_subdir(name: str) -> Optional[Path]:
        # sub-directory of the serialization dir, or None if no serialization dir was given
        return serialization_dir / name if serialization_dir is not None else None

    random_state = check_random_state(config.pop("random_seed"))

    pairs_config = config.pop("pairs")
    feature_extractors_config = config.pop("features")
    classifier_config = config.pop("classifier")

    # sklearn pipelines cache via joblib, which only checks whether it has seen the function arguments of a
    # pipeline transformer before. Changed mention pair sampler, feature or classifier config parameters
    # therefore need separate caches, which we achieve by putting config dict hashes into the cache location.
    memory = None
    if use_caching:
        hashed_parts = [get_dict_hash(pairs_config), get_dict_hash(feature_extractors_config)]
        if with_clustering:
            # clustering additionally depends on the classifier config
            hashed_parts.append(get_dict_hash(classifier_config))
        memory = str(Path(tempfile.gettempdir()) / ("pipeline_" + "_".join(hashed_parts)))

    # instantiate some bits
    feature_extractors = instantiate_feature_extractors(feature_extractors_config, use_caching)
    classifier, classifier_fit_params = instantiate_classifier(classifier_config, random_state)

    # the mention pair generator stage shares parameters with the mention pair scorer (if we use that)
    mpg_training_config = pairs_config.pop("mpg_training")
    mpg_prediction_config = pairs_config.pop("mpg_prediction")
    if pairs_config:
        raise ValueError("Leftover 'pairs' config entries: " + pprint.pformat(pairs_config))

    if with_clustering and mpg_prediction_config is not None:
        # Reasoning: The mention pair generation parameters only affect the number and distribution of pairs,
        # the number and distribution of mentions is unchanged. Tweaking the pair generation is therefore only
        # useful when evaluating directly on the pairs, not on the mentions. For clustering, we evaluate based
        # on mentions and need distances between all mention pairs, so tweaked mention pair sampling at
        # prediction time does not make any sense.
        raise ValueError("'mpg_prediction' cannot be used with clustering!")

    pair_generation_stage = MentionPairGeneratorStage(mpg_training_config,
                                                      mpg_prediction_config,
                                                      random_state=random_state,
                                                      serialization_dir=_debug_subdir("mpg"))

    if with_clustering:
        # using only the most discriminating metric (LEA) is faster when running cross-validation
        scorer = CrossDocCorefScoring(only_lea_f1_for_cv=scorer_should_return_single_scalar)
    else:
        scorer = MentionPairScoring(mpg_prediction_config,
                                    return_neg_log_loss_for_cv=scorer_should_return_single_scalar,
                                    serialization_dir=_debug_subdir("scorer"))

    # Pipeline assembly. For reference, this is the sequence of calls on a pipeline with stages [a, b]:
    #   training:
    #       a: fit called
    #       a: transform
    #       b: fit called
    #   estimating:
    #       a: transform called
    #       b: predict called

    # All feature extractors are combined in a feature union, followed by removing the mean and normalizing
    # the variance.
    feature_extraction = Pipeline([
        ("features", FeatureUnion(feature_extractors)),
        ("scaling", StandardScaler())
    ])

    # The boolean label of each mention pair is merged with the feature matrix, the result is passed to the
    # classifier.
    labels_and_features = FeatureUnion([
        ("get_labels_from_X", FunctionTransformer(get_mention_pair_labels_from_X)),
        ("feature_extraction", feature_extraction),
    ])
    mention_pair_distance_pipeline_steps = [
        ("join_labels_and_feature_matrix", labels_and_features),
        (CLASSIFIER_PIPELINE_STEP_NAME, PredictOnTransformClassifierWrapper(classifier, classifier_fit_params)),
    ]

    if with_clustering:
        # create clustering step
        clustering = ScipyClustering.from_params(config.pop("clustering"))

        # For clustering, pairs are generated and classified as above, but the two mention identifiers of each
        # mention pair need to be retained in addition. Mention pair identifiers and their distance (between 0
        # and 1) are merged with a FeatureUnion. This "feature matrix" is pulled apart in the clustering step
        # where mentions are clustered agglomeratively according to their pairwise distances.
        identifiers_and_distances = FeatureUnion([
            ("get_mention_pair_identifiers_from_X", FunctionTransformer(get_mention_pair_identifiers_from_X)),
            ("mention_pair_distance", Pipeline(mention_pair_distance_pipeline_steps))
        ])
        pipeline = Pipeline([
            ("pair_generation", pair_generation_stage),
            ("mention_pair_distance_with_identifiers", identifiers_and_distances),
            (CLUSTERING_PIPELINE_STEP_NAME, clustering)
        ], memory=memory)
    else:
        if "clustering" in config:
            logger.warning("Clustering configuration will not be used.")
            config.pop("clustering")

        # In the simplified case, the generated mention pairs go straight into the classification steps.
        pipeline = Pipeline([("pair_generation", pair_generation_stage)] + mention_pair_distance_pipeline_steps,
                            memory=memory)

    if config:
        raise ValueError("Leftover config entries: " + pprint.pformat(config))

    return pipeline, scorer
def perform_prediction_analysis(dataset: Dataset,
                                outcomes: List[pd.DataFrame],
                                num_samples_per_quadrant: int,
                                serialization_dir: Path) -> None:
    """
    Analyzes mention pair classification outcomes: creates detailed confusion matrices per link type and, for
    the first run, has several instances per quadrant of the 2x2 confusion matrix sampled and printed for manual
    analysis.

    The passed outcome dataframes are modified in place: their mention index columns are dropped and a quadrant
    column is added.

    :param dataset: evaluation dataset
    :param outcomes: list of dataframes containing evaluated pairs with predicted and gold label, one per run
    :param num_samples_per_quadrant: number of instances sampled per confusion matrix quadrant
    :param serialization_dir:
    :return:
    """
    # all passed outcome dataframes need to be compatible, starting with their length
    assert len({len(df) for df in outcomes}) == 1

    # check sameness of a-doc-ids and b-mention-ids, if one of those two mismatches we have a problem anyway
    distinct_a_doc_hashes = {get_dict_hash(df[IDX_A_DOC].values) for df in outcomes}
    distinct_b_mention_hashes = {get_dict_hash(df[IDX_B_MENTION].values) for df in outcomes}
    assert len(distinct_a_doc_hashes) == 1
    assert len(distinct_b_mention_hashes) == 1

    # The mention indices are identical in every run, so one copy is kept separately and the index columns are
    # removed from the outcomes of each run.
    mention_index_columns = [IDX_A_DOC, IDX_A_MENTION, IDX_B_DOC, IDX_B_MENTION]
    index_df = outcomes[0][mention_index_columns].copy()
    for run_df in outcomes:
        run_df.drop(columns=mention_index_columns, inplace=True)

    # Each pair in the mention pair index dataframe gets labeled with its type: cross-topic, cross-subtopic,
    # within-subtopic, within-document. For that, bring the documents into a usable format first: one row per
    # document with topic id, subtopic and document id.
    docs = dataset.documents
    topic_and_subtopic = docs.index.to_frame()[[TOPIC_ID, SUBTOPIC]].reset_index(drop=True)
    document_ids = docs[DOCUMENT_ID].reset_index(drop=True)
    docs = pd.concat([topic_and_subtopic, document_ids], axis=1)

    # Merging resets the index to the default. The original index is restored afterwards, so that index_df and
    # the outcomes can be concatenated again later.
    original_index = index_df.index
    merge_plan = (
        (IDX_A_DOC, "a-topic-id", "a-subtopic"),
        (IDX_B_DOC, "b-topic-id", "b-subtopic"),
    )
    for doc_column, topic_column, subtopic_column in merge_plan:
        merged = index_df.merge(docs, left_on=doc_column, right_on=DOCUMENT_ID, how="left")
        index_df = merged.drop(columns=[DOCUMENT_ID]).rename(columns={
            TOPIC_ID: topic_column,
            SUBTOPIC: subtopic_column
        })
    index_df.index = original_index

    same_topic = index_df["a-topic-id"] == index_df["b-topic-id"]
    same_subtopic = index_df["a-subtopic"] == index_df["b-subtopic"]
    same_document = index_df[IDX_A_DOC] == index_df[IDX_B_DOC]
    pair_type_masks = (
        (~same_topic, CT),
        (same_topic & ~same_subtopic, CS),
        (same_topic & same_subtopic & ~same_document, WS),
        (same_topic & same_subtopic & same_document, WD),
    )
    for mask, pair_type in pair_type_masks:
        index_df.loc[mask, PAIR_TYPE] = pair_type

    # For each run, label each pair with true positive, false negative, false positive or true negative.
    for run_df in outcomes:
        gold = run_df[LABEL]
        predicted = run_df[PREDICTION]
        quadrant_masks = (
            (gold & predicted, TP),
            (gold & ~predicted, FN),
            (~gold & predicted, FP),
            (~gold & ~predicted, TN),
        )
        for mask, quadrant in quadrant_masks:
            run_df.loc[mask, QUADRANT] = quadrant

    _create_confusion_matrices(index_df, outcomes, serialization_dir)
    _print_prediction_pairs(index_df, outcomes[0], dataset, num_samples_per_quadrant, serialization_dir)