def create_distance_features(self, author_id, aggregation_function, word_embedding_vector1, dif2_word_embedding, target1, target2):
    """Create one author feature per configured distance function.

    Each feature measures the distance between the two supplied
    word-embedding vectors; the feature name encodes both targets'
    table/field names and the aggregation function.
    """
    features = []
    agg_name = str(aggregation_function)
    for distance_function in self._distance_functions:
        # Feature name encodes the distance function and both (table, field, aggregation) targets.
        feature_name = (u'word_embeddings_differential_' + u"distance_function_"
                        + distance_function + '_'
                        + target1['table_name'] + "_" + target1['targeted_field_name']
                        + "_" + agg_name + "_TO_"
                        + target2['table_name'] + "_" + target2['targeted_field_name']
                        + "_" + agg_name)
        attribute_value = Vector_Operations.oparate_on_two_vectors(
            commons, distance_function, word_embedding_vector1, dif2_word_embedding)
        features.append(
            BaseFeatureGenerator.create_author_feature(
                feature_name, author_id, attribute_value,
                self._window_start, self._window_end))
    return features
def create_authors_feature_from_two_vectors(func, first_author_vector_dict, second_author_vector_dict, first_table_name, first_targeted_field_name, first_word_embedding_type, second_table_name, second_targeted_field_name, second_word_embedding_type, window_start, window_end, prefix=u''):
    """For every author in the first dict, apply *func* to the author's two
    vectors and wrap the result as an author feature.

    NOTE(review): authors present only in the first dict but missing from the
    second raise KeyError — presumably both dicts share the same key set;
    verify against callers.
    """
    # The feature name does not depend on the author, so build it once.
    feature_name = (prefix + u'subtraction_' + first_table_name + "_"
                    + first_targeted_field_name + "_" + first_word_embedding_type
                    + "_TO_" + second_table_name + "_" + second_targeted_field_name
                    + "_" + second_word_embedding_type
                    + "_DISTANCE-FUNCTION_" + func)
    authors_features = []
    for author_id in first_author_vector_dict:
        vec_a = first_author_vector_dict[author_id]
        vec_b = second_author_vector_dict[author_id]
        attribute_value = Vector_Operations.oparate_on_two_vectors(
            commons.commons, func, vec_a, vec_b)
        authors_features.append(
            BaseFeatureGenerator.create_author_feature(
                feature_name, author_id, attribute_value, window_start, window_end))
    return authors_features
def calc_avg_known_words(self, source_id, **kwargs):
    """Compute the average known-word count for each configured word list.

    :param source_id: author identifier the features are attached to.
    :param kwargs: must contain 'target' — the destination target fields
                   passed through to ``_count_avg_known_words``.
    :return: list of author-feature objects, one per word list that
             was processed successfully.
    """
    destination_target_fields = kwargs['target']
    features = []
    logging.info("processing author " + source_id)
    for word_list_name in self.word_lists_names:
        self._load_known_words_to_dict(word_list_name)
        try:
            result = self._count_avg_known_words(destination_target_fields)
            attribute_name = str(self.__class__.__name__
                                 + '_count_avg_known_word_from_' + word_list_name)
            author_feature = BaseFeatureGenerator.create_author_feature(
                attribute_name, source_id, result,
                self._window_start, self._window_end)
            features.append(author_feature)
        except Exception as e1:
            # BUG FIX: Exception.message was removed in Python 3 (and
            # deprecated since 2.6); accessing it raised AttributeError and
            # masked the real error. str(e1) is portable.
            logging.error(str(e1) + word_list_name)
    return features
def execute(self, window_start=None):
    """Extract cooperation-topic features per claim and persist them.

    For every claim: builds per-post word bags (with stemming/stopword
    removal), computes topic-similarity and exact-match author counters,
    then derives both claim-level features (via the methods named in
    self._features_list) and per-author count features (via
    self._author_count_features_list).  All features are written through
    self._db.add_author_features at the end.

    NOTE(review): if an exception fires before claim_features is assigned
    inside the try block, the final add_author_features call raises
    NameError — confirm whether that path can occur in practice.
    """
    # Timing / logging bookkeeping.
    start_time = time.time()
    info_msg = "execute started for Cooperation topic feature generator started at " + str(
        start_time)
    logging.info(info_msg)
    claims = self._db.get_claims()
    logging.info("Cooperation execute window_start %s" % self._window_start)
    try:
        # Pipeline: claims -> posts per claim -> post-id -> content-word
        # dictionaries (stemming + stopword removal) -> topic counters.
        claim_features = []
        posts_dict = self._db.get_claim_id_posts_dict()
        for cnt, claim in enumerate(claims):
            claim_id = claim.claim_id
            logging.info('Started ' + str(cnt + 1) + ' claim from ' +
                         str(len(claims)) + ' claims')
            posts_list = posts_dict[claim_id]
            if len(posts_list) == 0:
                # Claims without posts contribute no features.
                logging.info('The resulted list is empty for claim: ' + str(claim_id))
                continue
            post_id_to_words = self._create_post_id_to_content_words(posts_list)
            post_id_to_strings_no_urls = self._create_post_id_to_strings_no_urls(posts_list)
            # Two parallel counters: fuzzy topic similarity vs exact string match.
            authors_counter_dic1 = self.calculate_topics_similarity(post_id_to_words)
            authors_counter_dic2 = self.calculate_topics_exact_match(post_id_to_strings_no_urls)
            # Claim-level features: each feature_name is a method on self
            # applied to both counters; failures fall back to -1.0.
            for ftr, feature_name in enumerate(self._features_list):
                logging.info('Started ' + str(ftr + 1) + ' feature from ' +
                             str(len(self._features_list)) + ' features')
                try:
                    attribute_value1 = float(getattr(self, feature_name)(authors_counter_dic1))
                    attribute_value2 = float(getattr(self, feature_name)(authors_counter_dic2))
                except:
                    # Deliberate best-effort: sentinel -1.0 marks a failed extraction.
                    attribute_value1 = -1.0
                    attribute_value2 = -1.0
                    print('Fail in extraction: ' + feature_name)
                if attribute_value1 is not None and attribute_value2 is not None:
                    attribute_name1 = "{0}_{1}".format(self._prefix, feature_name)
                    attribute_name2 = "{0}_{1}".format(self._prefix, "exact_match_" + feature_name)
                    # Wrap raw values in the author-feature envelope.
                    claim_feature1 = BaseFeatureGenerator.create_author_feature(
                        attribute_name1, claim_id, attribute_value1,
                        self._window_start, self._window_end)
                    claim_feature2 = BaseFeatureGenerator.create_author_feature(
                        attribute_name2, claim_id, attribute_value2,
                        self._window_start, self._window_end)
                    claim_features.append(claim_feature1)
                    claim_features.append(claim_feature2)
                    print('Appended: ' + attribute_name1)
                    print('Appended: ' + attribute_name2)
            # Per-author count features keyed directly off the counters.
            for ftr, feature_name in enumerate(self._author_count_features_list):
                logging.info('Started ' + str(ftr + 1) + ' feature from ' +
                             str(len(self._author_count_features_list)) + ' features')
                attribute_name1 = "{0}_{1}".format(self._prefix, feature_name)
                attribute_name2 = "{0}_{1}".format(self._prefix, "exact_match_" + feature_name)
                for author_id in authors_counter_dic1:
                    attribute_value1 = authors_counter_dic1[author_id]
                    attribute_value2 = authors_counter_dic2[author_id]
                    if attribute_value1 is not None and attribute_value2 is not None:
                        author_feature1 = BaseFeatureGenerator.create_author_feature(
                            attribute_name1, author_id, attribute_value1,
                            self._window_start, self._window_end)
                        author_feature2 = BaseFeatureGenerator.create_author_feature(
                            attribute_name2, author_id, attribute_value2,
                            self._window_start, self._window_end)
                        claim_features.append(author_feature1)
                        claim_features.append(author_feature2)
    except Exception as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print(message)
        logging.error('Failed in extraction process!')
    stop_time = time.time()
    info_msg = "execute ended at " + str(stop_time)
    logging.info(info_msg)
    self._db.add_author_features(claim_features)
def __init__(self, db, **kwargs):
    """Initialize the generator and load its connection types and
    similarity functions from this class's config section."""
    BaseFeatureGenerator.__init__(self, db, **kwargs)
    section = self.__class__.__name__
    self._connection_types = self._config_parser.eval(section, "connection_types")
    self._similarity_functions = self._config_parser.eval(section, "similarity_functions")
def __init__(self, db, **kwargs):
    """Initialize the generator and precompute the post-id -> text/image
    mapping used later by the feature methods."""
    BaseFeatureGenerator.__init__(self, db, **kwargs)
    # Built once up front; presumably maps each post id to its text/image
    # content — confirm against _create_post_id_text_image.
    self._post_id_text_image_dict = self._create_post_id_text_image()
def __init__(self, db, **kwargs):
    """Initialize the generator with empty author/post collections.

    NOTE: caller-supplied **kwargs are intentionally discarded — the
    parent is always initialized with empty 'authors' and 'posts'.
    """
    BaseFeatureGenerator.__init__(self, db, authors=[], posts={})
    self._features = self._config_parser.eval(self.__class__.__name__, "feature_list")
def execute(self, window_start=None):
    """Walk the input directory tree (dataset / retweet-aggregation type),
    load each topic-statistics CSV plus the post-id->topic-id mapping, and
    emit per-tuple and aggregated author features into the DB.

    NOTE(review): uses dict.iteritems(), which exists only on Python 2 —
    on Python 3 this raises AttributeError.  The physical layout of the
    original was lost; indentation below is a reconstruction — in
    particular, features are saved once per aggregation-type directory.
    """
    self._claim_id_claim_type_dict = self._create_claim_id_claim_type_dictionary()
    directory_names = os.listdir(self._input_path)
    for dataset_name in directory_names:
        self._dataset_name = dataset_name
        # Second directory level: with or without retweets aggregation.
        aggregation_retweets_type_directories = os.listdir(
            self._input_path + self._dataset_name)
        for aggregation_retweets_type in aggregation_retweets_type_directories:
            self._aggregation_retweets_type = aggregation_retweets_type
            target_path = "{0}/{1}/{2}/".format(self._input_path,
                                                self._dataset_name,
                                                aggregation_retweets_type)
            # Load the post_id -> topic_id mapping for this directory.
            with open(target_path + self._post_id_topic_id_dict_file) as file:
                self._post_id_topic_id_dict = json.load(file)
            # Inverted index: topic_id -> post_id (assumes the mapping is 1:1;
            # duplicate topic ids would silently collapse).
            self._topic_id_post_id_dict = {
                value: key
                for key, value in self._post_id_topic_id_dict.iteritems()
            }
            self._topic_statistics_df = pd.read_csv(
                target_path + self._topic_statistics_file)
            author_features = []
            # Per-row (tuple) features: each method returns the feature triple.
            for index, row in self._topic_statistics_df.iterrows():
                df_tuple = tuple(row)
                for i, feature_name in enumerate(self._features_extracted_by_tuple):
                    msg = "\rCalculating features: [{0}/{1}: {2} {3} {4}]]".format(
                        i, len(self._features_extracted_by_tuple), feature_name,
                        self._dataset_name, self._aggregation_retweets_type)
                    print(msg, end="")
                    post_id, attribute_name, attribute_value = getattr(
                        self, feature_name)(df_tuple)
                    author_feature = BaseFeatureGenerator.create_author_feature(
                        attribute_name, post_id, attribute_value,
                        self._window_start, self._window_end)
                    author_features.append(author_feature)
            # Aggregated features: computed per topic, attached to its post.
            for j, aggregated_feature in enumerate(self._aggregated_features):
                msg = "\rCalculating aggregated features: [{0}/{1} {2} {3} {4}]]".format(
                    j, len(self._aggregated_features), aggregated_feature,
                    self._dataset_name, self._aggregation_retweets_type)
                print(msg, end="")
                for topic_id, post_id in self._topic_id_post_id_dict.items():
                    attribute_value = getattr(self, aggregated_feature)(topic_id)
                    attribute_name = "{0}_{1}_{2}_{3}_{4}".format(
                        self._prefix, aggregated_feature, self._dataset_name,
                        self._aggregation_retweets_type, self._classifier_name)
                    author_feature = BaseFeatureGenerator.create_author_feature(
                        attribute_name, post_id, attribute_value,
                        self._window_start, self._window_end)
                    author_features.append(author_feature)
            self._db.add_author_features(author_features)
def execute(self, window_start=None):
    """Extract temporal features per claim.

    For each claim and each source kind ('authors'/'posts'), computes the
    age in whole minutes of every item, normalizes by the minimum, sorts,
    slices the sorted deltas into time windows given by self._delta_time
    (config strings evaluated with eval), and applies each method in
    self._features_list to every window slice.

    NOTE(review): min(ll) raises ValueError when no item yielded a parsable
    created_at; the outer bare except then swallows everything, leaving any
    earlier claims' features to still be saved.  eval() on config-provided
    strings is trusted input only — do not feed external data here.
    """
    function_name = 'extract_temporal_features'
    start_time = time.time()
    info_msg = "execute started for " + function_name + " started at " + str(
        start_time)
    logging.info(info_msg)
    claims = self._db.get_claims()
    try:
        claim_features = []
        today_datetime = datetime.datetime.now()
        posts_dict = self._db.get_claim_id_posts_dict()
        for cnt, claim in enumerate(claims):
            claim_id = claim.claim_id
            logging.info('Started ' + str(cnt + 1) + ' claim from ' +
                         str(len(claims)) + ' claims')
            for source in self._source_list:
                # Pick the item list for this claim: authors or posts.
                if source == 'authors':
                    s_list = self._db.get_claim_authors(claim_id)
                elif source == 'posts':
                    s_list = posts_dict[claim_id]
                if len(s_list) == 0:
                    logging.info('The resulted list is empty for claim:' + str(claim_id))
                    continue
                ll = []
                for s in s_list:
                    try:
                        # Authors expose created_at positionally (index 43 —
                        # presumably the created_at column; verify schema);
                        # posts expose it as an attribute.
                        if source == 'authors':
                            created_at = s[43]
                        elif source == 'posts':
                            created_at = s.created_at
                        if created_at is not None:
                            creation_date = parser.parse(created_at)
                            # Age in whole minutes.
                            delta = int(
                                divmod((today_datetime - creation_date).total_seconds(),
                                       60)[0])
                            ll.append(delta)
                        else:
                            # NOTE(review): concatenating None raises TypeError,
                            # which the except below absorbs — this log line
                            # never actually fires.
                            logging.info('Can not be created feature for ' + created_at)
                    except:
                        logging.info(
                            'Can not be parsed created_at, probably None value'
                        )
                        pass
                # Normalize so the oldest/newest baseline starts at zero.
                m = min(ll)
                lls = [i - m for i in ll]
                # Sort ascending so window slicing below works.
                lls.sort()
                # Start/stop indexes delimit the current time-window slice.
                st_ind = 0
                stop_ind = 1
                for delta in self._delta_time:
                    # Advance stop_ind to the last value within this window.
                    for idx, val in enumerate(lls[st_ind:]):
                        if val <= eval(delta):
                            stop_ind = idx
                    llsn = deepcopy(lls[st_ind:stop_ind])
                    st_ind = stop_ind
                    for ftr, feature_name in enumerate(self._features_list):
                        logging.info('Started ' + str(ftr + 1) + ' feature from ' +
                                     str(len(self._features_list)) + ' features')
                        try:
                            attribute_value = getattr(self, feature_name)(llsn)
                        except:
                            # Best-effort: 0 marks a failed extraction.
                            attribute_value = 0
                            print('Fail in extraction: ' + feature_name)
                        if attribute_value is not None:
                            attribute_name = "{0}_{1}_{2}_{3}".format(
                                self._prefix, source, str(eval(delta)), feature_name)
                            # Wrap the raw value in the author-feature envelope.
                            claim_feature = BaseFeatureGenerator.create_author_feature(
                                attribute_name, claim_id, attribute_value,
                                self._window_start, self._window_end)
                            claim_features.append(claim_feature)
                            print('Appended: ' + attribute_name)
    except:
        logging.error('Failed in extraction process!')
    stop_time = time.time()
    info_msg = "execute ended at " + str(stop_time)
    logging.info(info_msg)
    self._db.add_author_features(claim_features)
def execute(self, window_start=None):
    """Extract graph features per claim from author connections.

    Connections come either from a CSV file (when self._csv_file is set)
    or from the DB; they are grouped by self._group_by, each group is
    turned into a networkx graph, and every feature in
    self._features_list is computed via extract_features_from_graph.
    Single-valued results become one feature; multi-valued results are
    expanded into min_/max_/median_/std_ variants.  Everything is saved
    through self._db.add_author_features (author_feature table).
    """
    function_name = "extract_features_from_graph"
    start_time = time.time()
    info_msg = "execute started for " + function_name + " started at " + str(
        start_time)
    logging.info(info_msg)
    try:
        claim_features = []
        if self._csv_file != ' ':
            logging.info('Getting existing author connections from csv file')
            df = pd.read_csv(self._csv_file,
                             names=[
                                 'source_author_guid', 'destination_author_guid',
                                 'connection_type', 'weight', 'claim_id',
                                 'insertion_date'
                             ],
                             low_memory=False)
        else:
            logging.info('Getting existing author connections with claim_id...')
            author_connections_with_claim_id = self._db.get_author_connections_with_claim_id()
            logging.info('Checking author connections for claim id...')
            author_connections_with_claim_id.extend(
                self._db.make_connections_with_claim_id()[0])
            logging.info(author_connections_with_claim_id[0])
            # Convert ORM rows into plain dicts so pandas can frame them.
            list_of_con_dicts = []
            for author_con in author_connections_with_claim_id:
                connections_dict = {
                    'source_author_guid': author_con.source_author_guid,
                    'destination_author_guid': author_con.destination_author_guid,
                    'connection_type': author_con.connection_type,
                    'weight': author_con.weight,
                    'claim_id': author_con.claim_id,
                    'insertion_date': author_con.insertion_date
                }
                list_of_con_dicts.append(connections_dict)
            df = pd.DataFrame(list_of_con_dicts)
        grps = df.groupby(self._group_by)
        for cnt, grp in enumerate(grps):
            logging.info('Started ' + str(cnt) + ' group from ' +
                         str(len(grps)) + ' groups')
            # networkx renamed from_pandas_dataframe -> from_pandas_edgelist in 2.x.
            if nx.__version__[0] == '1':
                G = nx.from_pandas_dataframe(grp[1], self._source[0],
                                             self._target[0])
            else:
                G = nx.from_pandas_edgelist(grp[1], self._source[0],
                                            self._target[0])
            claim_ext_id = grp[0]
            #claim_id = self._db.claim_ext_id_to_claim_id(claim_ext_id)[0]
            claim_id = claim_ext_id
            # BUG FIX: nx.__version__[0] is a string; comparing it to the int 1
            # raised TypeError on Python 3 (and was always True on Python 2).
            # communicability_centrality was removed in networkx 2.x, so gate
            # on the numeric major version.
            if int(nx.__version__[0]) > 1 and 'communicability_centrality' in self._features_list:
                self._features_list.remove('communicability_centrality')
            for ftr, feature_name in enumerate(self._features_list):
                logging.info('Started ' + str(ftr + 1) + ' feature from ' +
                             str(len(self._features_list)) + ' features')
                attributes_dict = getattr(self, function_name)(G=G, ff=feature_name)
                # Single-valued result: one feature and move on.
                if len(attributes_dict) == 1 and attributes_dict[
                        feature_name] is not None:
                    attribute_name = "{0}_{1}".format(self._prefix, feature_name)
                    claim_feature = BaseFeatureGenerator.create_author_feature(
                        attribute_name, claim_id, attributes_dict[feature_name],
                        self._window_start, self._window_end)
                    claim_features.append(claim_feature)
                    continue
                # Multi-valued result: expand into the four aggregate variants.
                for sub_feature_name in ('min_', 'max_', 'median_', 'std_'):
                    attribute_value = attributes_dict[sub_feature_name + feature_name]
                    if attribute_value is not None:
                        attribute_name = "{0}_{1}".format(
                            self._prefix, sub_feature_name + feature_name)
                        claim_feature = BaseFeatureGenerator.create_author_feature(
                            attribute_name, claim_id, attribute_value,
                            self._window_start, self._window_end)
                        claim_features.append(claim_feature)
    except:
        logging.info('Fail')
        print(sys.exc_info())
    stop_time = time.time()
    info_msg = "execute ended at " + str(stop_time)
    logging.info(info_msg)
    # used author_feature table
    self._db.add_author_features(claim_features)
def __init__(self, db, **kwargs):
    """Initialize the generator and read its word-embedding targets and
    save-batch size from this class's config section."""
    BaseFeatureGenerator.__init__(self, db, **kwargs)
    config_section = self.__class__.__name__
    self._targeted_author_word_embeddings = self._config_parser.eval(
        config_section, "targeted_author_word_embeddings")
    self._max_objects_without_saving = self._config_parser.eval(
        config_section, "max_objects_without_saving")