def load_stems(self, data):
    """Return the unique word stems of all relations in the parsed data.

    For every text in ``data.textfiles`` the relation set is (re)computed
    according to ``self.annotations`` ("union" or "intersected"; any other
    value leaves ``txt.relations`` as it already is). Each relation is then
    wrapped in a ``Feature`` and both its target and source stems are
    collected.

    Args:
        data: Parsed dataset exposing a ``textfiles`` iterable; every entry
            must provide ``relations`` and the two ``compute_*_relations``
            methods.

    Returns:
        numpy.ndarray: The collected stems with duplicates removed by
        ``np.unique``. An empty (float) array if there are no relations.
    """
    # Gather everything in a plain list first; calling np.append inside the
    # loop would copy the whole array on every single relation.
    collected = []
    for txt in data.textfiles:
        if self.annotations == "union":
            txt.compute_union_relations()
        elif self.annotations == "intersected":
            txt.compute_intersection_relations()

        for rel in txt.relations:
            f = Feature(rel)
            collected.append(f.get_stem_target())
            collected.append(f.get_stem_source())

    # Seed with the same empty array the incremental version started from so
    # the resulting dtype (and the empty-input result) stays unchanged.
    stems = np.array([])
    if collected:
        stems = np.append(stems, collected)

    return np.unique(stems)
def parse_Features(data, new=False, annotations="union",
                   features=("pos", "stem", "aspect", "tense", "distance",
                             "similarity", "polarity", "modality"),
                   distance=False):
    """Extract feature vectors and class labels from the dataset.

    Args:
        data: The parsed data from fables-100-temporal-dependency.xml. Must
            expose a ``textfiles`` iterable.
        new (bool): With new=True a new calculation of Pos() and Stem() is
            enforced. Otherwise they are loaded from the cache file
            "save.p" when it exists (and computed and saved when it does
            not).
        annotations (str): Looking on all relations ("union") or at all
            relations in common between the annotators ("intersected").
        features: Collection of feature names that should be activated.
            Possible values: "pos", "stem", "aspect", "tense", "distance",
            "similarity", "polarity", "modality". Only membership tests are
            performed on it, so any container works.
        distance (bool): If set to True the distance information for the
            data is returned as well (needed for evaluation).

    Returns:
        tuple: ``(X, y)`` where ``X`` is a list with one feature vector per
        relation and ``y`` is a numpy array holding ``f.get_class()`` for
        each relation. With ``distance=True`` the tuple is
        ``(X, y, distance_diff)`` where ``distance_diff`` lists
        ``f.get_distance_diff()`` per relation.
    """
    cache_path = "save.p"

    # Pos and Stem are only needed (and only cached) for these two features.
    if "pos" in features or "stem" in features:
        # The existence check must look at the same file that is written and
        # read below; otherwise the cache is never used.
        if new or not os.path.isfile(cache_path):
            pos = Pos(data, 6, annotations)
            stem = Stem(data, annotations)
            with open(cache_path, "wb") as cache_file:
                pickle.dump((pos, stem), cache_file)
        else:
            # NOTE(review): pickle must only be used on trusted files. The
            # cache is also not keyed by `data`/`annotations`; pass new=True
            # after changing either of them.
            with open(cache_path, "rb") as cache_file:
                pos, stem = pickle.load(cache_file)

    distance_diff = []
    X = []
    labels = []

    for txt in data.textfiles:
        # Union or intersected relations?
        if annotations == "union":
            txt.compute_union_relations()
        elif annotations == "intersected":
            txt.compute_intersection_relations()

        for rel in txt.relations:
            f = Feature(rel)
            feature = []

            # Make polarity feature
            if "polarity" in features:
                feature = np.concatenate((feature, [f.get_polarity()]))

            # Make distance feature
            if "distance" in features:
                feature = np.concatenate((feature, f.get_distance()))

            # Make POS feature
            if "pos" in features:
                # NOTE(review): argument order here is (target, source)
                # while the stem transform below gets (source, target).
                # Confirm this asymmetry is intended.
                pos_feature = pos.transform(f.get_pos_target(),
                                            f.get_pos_source())
                pos_feature = pos_feature.toarray()[0]
                feature = np.concatenate((feature, pos_feature))

            # Make Stem feature
            if "stem" in features:
                stem_feature = stem.transform(f.get_stem_source(),
                                              f.get_stem_target())
                stem_feature = stem_feature[0]
                feature = np.concatenate((feature, stem_feature))

            # Make similarity feature
            if "similarity" in features:
                feature = np.concatenate(
                    (feature, [f.get_similarity_of_words()]))

            # Make modality feature
            if "modality" in features:
                feature = np.concatenate((feature, [f.get_modality()]))

            # Make aspect feature
            if "aspect" in features:
                feature = np.concatenate((feature, f.get_aspect()))

            # Make tense feature
            if "tense" in features:
                feature = np.concatenate((feature, f.get_tense()))

            # Append feature to X
            X.append(feature)
            labels.append(f.get_class())

            # Append distance information if needed
            if distance:
                distance_diff.append(f.get_distance_diff())

    # Build the label array once instead of re-allocating it per relation.
    # The empty case keeps the integer dtype of the seed array.
    y = np.array([], dtype=int)
    if labels:
        y = np.append(y, labels)

    if distance:
        return (X, y, distance_diff)
    return (X, y)