def load_integer_features(self, data):
    """Give each POS tag in *data* a number.

    Walks every relation of every text file in *data*, builds the combined
    (target + source) POS-tag array for the relation, and maps it onto the
    corresponding integer encoding.

    Args:
        data: Parsed dataset; must expose ``data.textfiles``.

    Returns:
        list: One integer-encoded POS feature (array of ints) per relation.
    """
    integer_features = []
    # Note: the original code initialized an unused ``pos_feature = np.array([])``
    # here; it was always overwritten inside the loop, so it has been removed.
    for txt in data.textfiles:
        # Select which relation set to use based on the annotation strategy.
        if self.annotations == "union":
            txt.compute_union_relations()
        elif self.annotations == "intersected":
            txt.compute_intersection_relations()
        for rel in txt.relations:
            f = Feature(rel)
            # Standardize because f.get_pos_$x() doesn't have to be of
            # length self.number_tags_per_feature/2
            standardized_pos_target = self.standardize_sub_pos_feature(f.get_pos_target())
            standardized_pos_source = self.standardize_sub_pos_feature(f.get_pos_source())
            # Concatenate the two plain POS tag arrays from target and source event
            pos_feature = np.concatenate((standardized_pos_target, standardized_pos_source))
            # Transform this array into the corresponding array of integers
            integer_feature = self.pos_tags_to_integers(pos_feature)
            integer_features.append(integer_feature)
    return integer_features
def load_integer_features(self, data):
    """Map every POS tag occurring in *data* onto its numeric code.

    Iterates over all relations of all text files and produces one
    integer-encoded POS feature array per relation.
    """
    encoded = []
    for textfile in data.textfiles:
        # Relation selection depends on the configured annotation mode.
        if self.annotations == "union":
            textfile.compute_union_relations()
        elif self.annotations == "intersected":
            textfile.compute_intersection_relations()
        for relation in textfile.relations:
            feat = Feature(relation)
            # The raw tag arrays may differ in length from
            # self.number_tags_per_feature / 2, so standardize them first.
            target_tags = self.standardize_sub_pos_feature(feat.get_pos_target())
            source_tags = self.standardize_sub_pos_feature(feat.get_pos_source())
            # One flat array: target tags followed by source tags.
            combined = np.concatenate((target_tags, source_tags))
            # Encode the tag array as integers and collect it.
            encoded.append(self.pos_tags_to_integers(combined))
    return encoded
def load_pos_tags(self, data):
    """Load all POS tags used in the pos_surrounding area around an event.

    Collects the POS tags of the target and source events of every relation,
    deduplicates them, and appends the blank tag ``'BL'`` used to pad
    features that do not have enough elements.

    Args:
        data: Parsed dataset; must expose ``data.textfiles``.

    Returns:
        numpy.ndarray: Unique POS tags (sorted by ``np.unique``) plus 'BL'.
    """
    # Collect the per-relation tag arrays in a Python list and concatenate
    # once at the end: repeatedly concatenating a growing numpy array inside
    # the loop (as before) copies everything each time and is O(n^2).
    collected = []
    for txt in data.textfiles:
        if self.annotations == "union":
            txt.compute_union_relations()
        elif self.annotations == "intersected":
            txt.compute_intersection_relations()
        for rel in txt.relations:
            f = Feature(rel)
            # Collect all pos tags from the data
            collected.append(f.get_pos_target())
            collected.append(f.get_pos_source())
    if collected:
        pos_tags = np.unique(np.concatenate(collected))
    else:
        # No relations at all: match the original's empty-array starting point.
        pos_tags = np.unique(np.array([]))
    # Append a blank tag which will be used for filling up features which
    # don't have enough elements
    pos_tags = np.append(pos_tags, 'BL')
    return pos_tags
def parse_Features(data, new=False, annotations="union",
                   features=("pos", "stem", "aspect", "tense", "distance",
                             "similarity", "polarity", "modality"),
                   distance=False):
    """Extract the features out of the dataset and return a list of features
    with the corresponding classes.

    Args:
        data (list): The parsed data from fables-100-temporal-dependency.xml.
        new (bool): With new=True a new calculation of Pos() and Stem() can be
            enforced. Otherwise it will be loaded from a file.
        annotations (str): Looking on all relations ("union") or at all
            relations in common between the annotators ("intersected").
        features (sequence): Determines which features should be activated.
            Possible values: "pos", "stem", "aspect", "tense", "distance",
            "similarity", "polarity", "modality". (The default is a tuple
            rather than a mutable list; only membership tests are performed,
            so callers passing lists still work.)
        distance (bool): If set to True parse_Features() will return distance
            information for the data (needed for evaluation).

    Returns:
        tuple: ``(X, y)`` — feature vectors and class labels — or
        ``(X, y, distance_diff)`` when ``distance`` is True.
    """
    # Only compute pos and stem if needed; otherwise reuse the pickled cache.
    if "pos" in features or "stem" in features:
        # Bug fix: the existence check used "set.p" while the cache was written
        # to and read from "save.p", so the cached result was never reused.
        if new or not os.path.isfile("save.p"):
            pos = Pos(data, 6, annotations)
            stem = Stem(data, annotations)
            # Context managers guarantee the file handles are closed.
            with open("save.p", "wb") as cache:
                pickle.dump((pos, stem), cache)
        else:
            with open("save.p", "rb") as cache:
                pos, stem = pickle.load(cache)
    if distance:
        distance_diff = []
    X = []
    y = np.array([], dtype=int)
    for txt in data.textfiles:
        # Union or intersected relations?
        if annotations == "union":
            txt.compute_union_relations()
        elif annotations == "intersected":
            txt.compute_intersection_relations()
        for rel in txt.relations:
            f = Feature(rel)
            feature = []
            # Make polarity feature
            if "polarity" in features:
                feature = np.concatenate((feature, [f.get_polarity()]))
            # Make distance feature
            if "distance" in features:
                feature = np.concatenate((feature, f.get_distance()))
            # Make POS feature
            if "pos" in features:
                pos_feature = pos.transform(f.get_pos_target(), f.get_pos_source())
                pos_feature = pos_feature.toarray()[0]
                feature = np.concatenate((feature, pos_feature))
            # Make Stem feature
            if "stem" in features:
                # NOTE(review): argument order here is (source, target) while
                # the pos transform above uses (target, source) — confirm this
                # asymmetry is intentional before changing it.
                stem_feature = stem.transform(f.get_stem_source(), f.get_stem_target())
                stem_feature = stem_feature[0]
                feature = np.concatenate((feature, stem_feature))
            # Make similarity feature
            if "similarity" in features:
                feature = np.concatenate((feature, [f.get_similarity_of_words()]))
            # Make modality feature
            if "modality" in features:
                feature = np.concatenate((feature, [f.get_modality()]))
            # Make aspect feature
            if "aspect" in features:
                feature = np.concatenate((feature, f.get_aspect()))
            # Make tense feature
            if "tense" in features:
                feature = np.concatenate((feature, f.get_tense()))
            # Append feature to X
            X.append(feature)
            y = np.append(y, [f.get_class()])
            # Append distance information if needed
            if distance:
                distance_diff.append(f.get_distance_diff())
    if distance:
        return (X, y, distance_diff)
    else:
        return (X, y)