def execute(self):
    """Run word-embedding feature extraction for every configured target.

    Each entry of ``self._targeted_author_word_embeddings`` is a dict that
    must provide the keys ``"table_name"``, ``"targeted_field_name"`` and
    ``"word_embedding_type"``.  For every entry the method logs its progress,
    loads the embeddings through ``self.load_author_guid_word_embedding_dict``
    (presumably a mapping keyed by author guid, judging by the helper's
    name) and forwards them to
    ``Vector_Operations.create_features_from_word_embedding_dict``.
    """
    logging.info("started extracting word_embeddings feature generator:")

    # Every generated feature is tagged with the concrete class name.
    feature_prefix = self.__class__.__name__ + '_'

    for index, embedding_target in enumerate(self._targeted_author_word_embeddings):
        table_name = embedding_target["table_name"]
        field_name = embedding_target["targeted_field_name"]
        embedding_type = embedding_target["word_embedding_type"]

        combination = "_".join((table_name, field_name, embedding_type))
        total_targets = len(self._targeted_author_word_embeddings)
        logging.info("currently extracting features of " + combination
                     + ": " + str(index + 1)
                     + " out of " + str(total_targets))

        # NOTE: the loader takes the field name first, then the table name.
        embeddings_by_author = self.load_author_guid_word_embedding_dict(
            field_name, table_name, embedding_type)

        Vector_Operations.create_features_from_word_embedding_dict(
            embeddings_by_author,
            table_name,
            field_name,
            embedding_type,
            self._word_embedding_table_name,
            self._window_start,
            self._window_end,
            self._db,
            self._max_objects_without_saving,
            feature_prefix)
def execute(self):
    """Extract word-embedding features for every configured target.

    Each entry of ``self._targeted_author_word_embeddings`` is a dict that
    must provide the keys ``"table_name"``, ``"targeted_field_name"`` and
    ``"word_embedding_type"``.  For every entry the method logs its 1-based
    progress, fetches the embeddings from
    ``self._db.get_author_guid_word_embedding_vector_dict`` and passes them
    to ``Vector_Operations.create_features_from_word_embedding_dict``
    together with the window bounds, the database handle and the batch
    limit.

    Returns:
        None.  Results are handed to ``Vector_Operations`` along with the
        database handle; nothing is returned to the caller.
    """
    logging.info("started extracting word_embbeddings feature generator:")

    targets = self._targeted_author_word_embeddings
    total_targets = len(targets)

    # enumerate(start=1) replaces the hand-maintained progress counter.
    for counter, target in enumerate(targets, start=1):
        targeted_table = target["table_name"]
        targeted_field_name = target["targeted_field_name"]
        targeted_word_embedding_type = target["word_embedding_type"]

        targeted_word_embeddings_combination = (
            targeted_table + "_" + targeted_field_name + "_"
            + targeted_word_embedding_type)
        logging.info("currently extracting features of "
                     + targeted_word_embeddings_combination
                     + ": " + str(counter)
                     + " out of " + str(total_targets))

        author_guid_word_embeding_dict = self._db.get_author_guid_word_embedding_vector_dict(
            targeted_table, targeted_field_name, targeted_word_embedding_type)

        Vector_Operations.create_features_from_word_embedding_dict(
            author_guid_word_embeding_dict,
            targeted_table,
            targeted_field_name,
            targeted_word_embedding_type,
            self._window_start,
            self._window_end,
            self._db,
            self._max_objects_without_saving)
def execute(self):
    """Build similarity features for every configured pair of embedding sources.

    Each element of ``self._connection_types`` is indexed with ``[0]`` and
    ``[1]``; both items are dicts that must provide ``"table_name"``,
    ``"targeted_field_name"`` and ``"word_embedding_type"``.  For every
    connection the two embedding dictionaries are fetched from the database
    and every function named in ``self._similarity_functions`` is applied to
    them.  The features produced by each function are stored through
    ``self.insert_author_features_to_db`` right after they are computed.

    Returns:
        None.
    """
    total_connections = len(self._connection_types)

    # enumerate(start=1) replaces the hand-maintained progress counter.
    for position, connection in enumerate(self._connection_types, start=1):
        first_field = connection[0]
        first_table_name = first_field["table_name"]
        first_targeted_field_name = first_field["targeted_field_name"]
        first_word_embedding_type = first_field["word_embedding_type"]

        second_field = connection[1]
        second_table_name = second_field["table_name"]
        second_targeted_field_name = second_field["targeted_field_name"]
        second_word_embedding_type = second_field["word_embedding_type"]

        # '\r' with end='' rewrites the same console line as a progress display.
        print('\r {0}/{1} Current connection:{2}_{3}_{4}-{5}_{6}_{7}'.format(
            position, total_connections,
            first_table_name, first_targeted_field_name, first_word_embedding_type,
            second_table_name, second_targeted_field_name, second_word_embedding_type),
            end='')

        first_author_guid_word_embedding_vector_dict = self._db.get_author_guid_word_embedding_vector_dict(
            first_table_name, first_targeted_field_name, first_word_embedding_type)
        second_author_guid_word_embedding_vector_dict = self._db.get_author_guid_word_embedding_vector_dict(
            second_table_name, second_targeted_field_name, second_word_embedding_type)

        # Both Vector_Operations entry points take the same trailing
        # arguments, so they are assembled once per connection.
        shared_arguments = (
            first_author_guid_word_embedding_vector_dict,
            second_author_guid_word_embedding_vector_dict,
            first_table_name,
            first_targeted_field_name,
            first_word_embedding_type,
            second_table_name,
            second_targeted_field_name,
            second_word_embedding_type,
            self._window_start,
            self._window_end,
        )

        for function in self._similarity_functions:
            if function == "subtruct_and_split":
                authors_features = Vector_Operations.create_subtruction_dimension_features_from_authors_dict(
                    *shared_arguments)
            else:
                authors_features = Vector_Operations.create_authors_feature_from_two_vectors(
                    function, *shared_arguments)

            # Original author's remark: "create in batches" -- presumably a
            # TODO for batching these inserts; confirm before relying on it.
            self.insert_author_features_to_db(authors_features)
def execute(self, window_start=None):
    """Generate differential word-embedding features for each target pair.

    The word-vector dictionary is loaded once from
    ``self._db.get_word_vector_dictionary(self._table_name)`` and cached on
    ``self._word_vector_dict``.  For every ``(target1, target2)`` pair in
    ``self._pairs_targets`` the records of both targets are fetched, the
    word differences are computed per record id and, for every aggregation
    function, two groups of features are collected:

    * one feature per dimension of the subtraction vector, and
    * the distance features returned by ``self.create_distance_features``.

    The features gathered for a pair are saved with
    ``self._db.add_author_features`` once that pair has been processed.

    Args:
        window_start: Accepted for interface compatibility; this method does
            not read it (``self._window_start`` is used instead).

    Returns:
        None.
    """
    self._word_vector_dict = self._db.get_word_vector_dictionary(self._table_name)
    feature_prefix = self.__class__.__name__ + '_'

    for target1, target2 in self._pairs_targets:
        features = []
        target1_tuples = self._get_records_by_target_dict(target1)
        target2_tuples = self._get_records_by_target_dict(target2)

        # 'author_id' instead of 'id' so the builtin is not shadowed.
        for author_id in target1_tuples:
            dif_set1, dif_set2 = self.get_word_differences(
                author_id, target1_tuples, target2_tuples)

            for aggregation_function in self._aggregation_functions:
                try:
                    dif1_word_embedding, dif2_word_embedding, subtraction_vec = \
                        self._get_differential_vectors(aggregation_function, dif_set1, dif_set2)
                    feature_name = self._get_feature_names(target1, target2, aggregation_function)

                    # extend() appends in place; repeated 'list + list'
                    # copied the whole accumulator on every step.
                    features.extend(
                        Vector_Operations.create_author_feature_for_each_dimention(
                            subtraction_vec, feature_name, author_id,
                            self._window_start, self._window_end, feature_prefix))
                    features.extend(
                        self.create_distance_features(
                            author_id, aggregation_function,
                            dif1_word_embedding, dif2_word_embedding,
                            target1, target2, feature_prefix))
                except Exception as e1:
                    # Best effort: a failure for one record/aggregation is
                    # logged and must not abort the remaining work.
                    logging.info(e1)

        self._db.add_author_features(features)
def create_distance_features(self, author_id, aggregation_function, word_embedding_vector1, dif2_word_embedding,
                             target1, target2, prefix=u''):
    """Create one feature per configured distance function.

    For every name in ``self._distance_functions`` the two vectors are
    combined through ``Vector_Operations.oparate_on_two_vectors`` and the
    result is wrapped by ``BaseFeatureGenerator.create_author_feature``.

    Args:
        author_id: Identifier passed on as the owner of each feature.
        aggregation_function: Its ``str()`` form is embedded twice in the
            feature name.
        word_embedding_vector1: First vector handed to the distance function.
        dif2_word_embedding: Second vector handed to the distance function.
        target1: Dict providing ``'table_name'`` and ``'targeted_field_name'``.
        target2: Dict providing ``'table_name'`` and ``'targeted_field_name'``.
        prefix: Text placed in front of every feature name.

    Returns:
        A list with one created feature per distance function, in the order
        of ``self._distance_functions``.
    """
    collected = []
    for distance_function in self._distance_functions:
        aggregation_label = str(aggregation_function)
        source_part = (target1['table_name'] + "_" + target1['targeted_field_name']
                       + "_" + aggregation_label)
        destination_part = (target2['table_name'] + "_" + target2['targeted_field_name']
                            + "_" + aggregation_label)
        feature_name = (prefix + u'differential_' + u"distance_function_"
                        + distance_function + '_'
                        + source_part + "_TO_" + destination_part)

        distance = Vector_Operations.oparate_on_two_vectors(
            commons, distance_function, word_embedding_vector1, dif2_word_embedding)

        collected.append(
            BaseFeatureGenerator.create_author_feature(
                feature_name, author_id, distance,
                self._window_start, self._window_end))
    return collected
def execute(self, window_start=None):
    """Train a doc2vec model per configured target and store its vectors as features.

    For every entry of ``self._targeted_fields_for_embedding`` the method
    collects the target elements per source id, builds documents from them,
    trains a model with ``self._train_doc2vec_model`` and turns the vector
    ``model[source_id]`` of every source id into one feature per dimension.
    Features are written with ``self._db.add_author_features`` followed by a
    session commit: periodically, every ``self._max_objects_without_saving``
    source ids, and once more after the last source id.

    Args:
        window_start: Accepted for interface compatibility; this method does
            not read it (``self._window_start`` is used instead).

    Returns:
        None.
    """
    for targeted_fields_dict in self._targeted_fields_for_embedding:
        source_id_target_elements_dict = self._get_source_id_target_elements(
            targeted_fields_dict)
        source_id_document_tuples = self._create_documents(
            source_id_target_elements_dict)

        # '\r' with end='' rewrites the same console line as a progress display.
        print("\rStarting training doc2vec", end='')
        model = self._train_doc2vec_model(source_id_document_tuples)
        print("\rFinishing training doc2vec", end='')

        targeted_table = targeted_fields_dict['source']['table_name']
        targeted_field_name = targeted_fields_dict['destination']['table_name'] + "-" \
            + targeted_fields_dict['destination']['target_field']
        model_type = "{0}_dimensions_{1}_window_size".format(
            self._num_of_dimensions, self._window_size_doc2vec)
        # The name is the same for every source id of this target.
        feature_name = model_type + '_' + targeted_table + "_" + targeted_field_name

        source_ids = source_id_target_elements_dict.keys()
        total_sources = len(source_ids)
        authors_features = []

        for i, source_id in enumerate(source_ids):
            print("\rExtracting doc2vec features: {0}/{1}:{2}".format(
                i, total_sources, source_id), end="")

            if (i + 1) % self._max_objects_without_saving == 0:
                self._db.add_author_features(authors_features)
                self._db.session.commit()
                # Start a fresh batch.  Without this reset every later flush
                # re-submitted all previously saved features and the list
                # grew for the whole run.  Rebinding (not clear()) leaves the
                # list already handed to the database layer untouched.
                authors_features = []

            source_id_vector = model[source_id]
            authors_features.extend(
                Vector_Operations.create_author_feature_for_each_dimention(
                    source_id_vector, feature_name, source_id,
                    self._window_start, self._window_end, self._prefix))

        # Persist whatever is left after the last periodic flush.
        self._db.add_author_features(authors_features)
        self._db.session.commit()