def produce_content(self, field_representation_name: str, field_data) -> FeaturesBagField:
    """
    Produces the field content for this representation: a bag of features whose keys
    are BabelNet synset ids and whose values are the global scores of the synsets

    Args:
        field_representation_name (str): Name of the field representation
        field_data: Text that will be linked to BabelNet

    Returns:
        feature_bag (FeaturesBagField)
    """
    field_data = check_not_tokenized(field_data)
    self.__babel_client.babelfy(field_data)
    feature_bag = FeaturesBagField(field_representation_name)
    try:
        # entities is None (or missing entirely) when Babelfy finds nothing to link
        if self.__babel_client.entities is not None:
            for entity in self.__babel_client.entities:
                feature_bag.append_feature(entity['babelSynsetID'], entity['globalScore'])
    except AttributeError:
        pass
    return feature_bag
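A minimal usage sketch for the method above, assuming a configured instance of the enclosing entity-linking class (here called `linker`, a hypothetical name) and a reachable Babelfy service; the input text and the printed synset id are illustrative:

# `linker` is a hypothetical, already-configured instance of the class above
feature_bag = linker.produce_content("babelfy_0", "New York is a city in the United States")
print(feature_bag.value)  # e.g. {'bn:00041611n': 0.73, ...}  (synset id -> global score)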
def test_get_feature_dict(self):
    feature = FeaturesBagField('repr_name')
    feature.append_feature('synsetID_1', 'global_score_1')
    feature.append_feature('synsetID_2', 'global_score_2')
    self.assertEqual(feature.value, {
        'synsetID_1': 'global_score_1',
        'synsetID_2': 'global_score_2'
    }, "Error in the features_dict")
def test_load_serialize(self):
    content_field_repr = FeaturesBagField("test")
    content_field_repr.append_feature("test_key", "test_value")
    content_field = ContentField("test_field", "0000")
    content_field.append(str(0), content_field_repr)
    content = Content("001")
    content.append("test_field", content_field)
    try:
        content.serialize(".")
    except FileNotFoundError:
        self.fail("Could not create file!")
    # serialize is expected to produce an lzma-compressed pickle named after the content id
    with lzma.open('001.xz', 'r') as file:
        self.assertEqual(content, pickle.load(file))
def produce_content(self, field_representation_name: str, content_id: str, field_name: str):
    """
    Retrieves the tf-idf values for the terms of the document matching content_id
    from the pre-computed word-document matrix.

    Args:
        field_representation_name (str): Name of the field representation
        content_id (str): Id of the content that contains the terms for which to extract the tf-idf
        field_name (str): Name of the field to consider

    Returns:
        FeaturesBagField: bag of <term, tf-idf> features
    """
    doc = self.__matching[content_id]
    # columns of this document's row with a non-zero tf-idf score
    feature_index = self.__tfidf_matrix[doc, :].nonzero()[1]
    tfidf_scores = zip(feature_index, [self.__tfidf_matrix[doc, x] for x in feature_index])

    features = {}
    for word, score in [(self.__feature_names[i], score) for (i, score) in tfidf_scores]:
        features[word] = score

    return FeaturesBagField(field_representation_name, features)
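A standalone sketch of the same lookup, assuming the word-document matrix was built with scikit-learn's TfidfVectorizer (corpus, ids and the matching dict are illustrative):

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["the cat sat on the mat", "the dog barked at the cat"]
matching = {"001": 0, "002": 1}  # content_id -> row of the matrix

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)      # sparse word-document matrix
feature_names = vectorizer.get_feature_names_out()

row = matching["001"]
cols = tfidf_matrix[row, :].nonzero()[1]             # columns with non-zero tf-idf
features = {feature_names[i]: tfidf_matrix[row, i] for i in cols}
print(features)  # <term, tf-idf> dict for content "001"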
def produce_content(self, field_representation_name: str, field_data) -> FeaturesBagField:
    """
    Produces a bag of features whose keys are WordNet synsets and whose values are
    the frequencies of those synsets in the field data text

    Args:
        field_representation_name (str): Name of the field representation
        field_data: Text to disambiguate

    Returns:
        FeaturesBagField: bag of <synset, frequency> features
    """
    field_data = check_not_tokenized(field_data)
    # word sense disambiguation: each word is paired with its most likely synset (or None)
    synsets = disambiguate(field_data)
    synsets = [synset for word, synset in synsets if synset is not None]
    return FeaturesBagField(field_representation_name, Counter(synsets))
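A minimal sketch of what this method computes, assuming `disambiguate` comes from pywsd (which the code appears to rely on) and returns (word, synset) pairs; the sentence is illustrative:

from collections import Counter
from pywsd import disambiguate

pairs = disambiguate("the bank approved the loan")   # [(word, Synset or None), ...]
synsets = [synset for word, synset in pairs if synset is not None]
print(Counter(synsets))                              # synset -> frequency in the text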
def test_append_remove(self):
    content_field_repr = FeaturesBagField("test")
    content_field_repr.append_feature("test_key", "test_value")
    content_field = ContentField("test_field", "0000")
    content_field.append(str(0), content_field_repr)
    content1 = Content("001")
    content1.append("test_field", content_field)
    content2 = Content("002")
    content2.append("test_field", content_field)
    content_field_repr = FeaturesBagField("test")
    content_field_repr.append_feature("test_key", "test_value")
    content_field2 = ContentField("test_field2", "0000")
    content_field2.append(str(0), content_field_repr)
    content2.append("test_field2", content_field2)
    content2.remove("test_field2")
    # assertTrue(a, b) would only check a's truthiness (b is the failure message);
    # assertEqual actually compares the two field lists
    self.assertEqual(content1.get_field_list(), content2.get_field_list())
def __decode_field_data(self, field: ContentField, field_name: str, field_data: str):
    # Decode string into dict or list
    try:
        loaded = json.loads(field_data)
    except json.JSONDecodeError:
        try:
            # in case the dict is {'foo': 1}: json expects {"foo": 1}
            reformatted_field_data = field_data.replace("\'", "\"")
            loaded = json.loads(reformatted_field_data)
        except json.JSONDecodeError:
            # if json still can't decode it, we treat the original data as a plain string
            loaded = field_data

    # if the decoded data is a list, it may be an EmbeddingField representation
    if isinstance(loaded, list):
        arr = np.array(loaded)
        # if the array contains only numbers, we treat it as a dense vector;
        # otherwise we treat the field data as a string
        if issubclass(arr.dtype.type, np.number):
            result = EmbeddingField(field_name, arr)
        else:
            result = StringField(field_name, field_data)
        field.append(field_name, result)
    # if the decoded data is a dict, it may be a FeaturesBagField
    elif isinstance(loaded, dict):
        # if all values of the dict are numbers, we treat it as a bag of features;
        # otherwise we treat it as a string
        if len(loaded.values()) != 0 and \
                all(isinstance(value, (float, int)) for value in loaded.values()):
            result = FeaturesBagField(field_name, loaded)
        else:
            result = StringField(field_name, field_data)
        field.append(field_name, result)
    # if the decoded data is a string, then it is a StringField
    elif isinstance(loaded, str):
        result = StringField(field_name, loaded)
        field.append(field_name, result)
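A standalone sketch of the decoding heuristic above, stripped of the field classes (the function name and the returned labels are illustrative):

import json
import numpy as np

def decode(field_data: str) -> str:
    # try strict JSON first, then retry with single quotes swapped for double quotes
    try:
        loaded = json.loads(field_data)
    except json.JSONDecodeError:
        try:
            loaded = json.loads(field_data.replace("'", '"'))
        except json.JSONDecodeError:
            loaded = field_data  # fall back to the raw string

    if isinstance(loaded, list) and issubclass(np.array(loaded).dtype.type, np.number):
        return "embedding"      # dense numeric vector
    if isinstance(loaded, dict) and loaded and \
            all(isinstance(v, (float, int)) for v in loaded.values()):
        return "features_bag"   # <feature, numeric score> mapping
    return "string"

print(decode("[0.1, 0.2]"))            # embedding
print(decode("{'cat': 3, 'dog': 1}"))  # features_bag
print(decode("plain text"))            # string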
def produce_content(self, field_representation_name: str, content_id: str, field_name: str) -> FeaturesBagField:
    """Retrieves the <term, tf-idf> bag for the given content and field from the index."""
    return FeaturesBagField(
        field_representation_name,
        self.__index.get_tf_idf(field_name, content_id))
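A hedged usage sketch: this variant delegates the tf-idf computation to a prebuilt search index, assuming get_tf_idf returns a <term, score> dict as the FeaturesBagField constructor suggests (the instance name, ids and output shape are assumptions):

# source = IndexTfIdfSource(index)           # hypothetical wrapper holding self.__index
# bag = source.produce_content("tfidf_0", "001", "plot")
# print(bag.value)                           # assumed shape: {'space': 0.41, 'alien': 0.28, ...}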
def test_append_get_feature(self):
    feature = FeaturesBagField('repr_name')
    feature.append_feature('synsetID', 'global_score')
    self.assertEqual(feature.get_feature('synsetID'), 'global_score',
                     "Error in the features_dict")