def calculate_information_gain_star(dataset, classification_file, rule_set, rule_number):
    if 'users' in dataset:
        df_dataset = pandas.read_csv(dataset)
        attribute = rules.attributes(rule_set, rule_number)
        if isinstance(attribute, list):
            attribute_list = []
            attrs = df_dataset[attribute].values
            for attr in attrs:
                rule_output = rules.rules(rule_set, rule_number, attr)
                if rule_output == 1:
                    number_satisfied = 1
                else:
                    number_satisfied = 0
                attribute_list.append(number_satisfied)
        else:
            attribute_list = df_dataset[attribute].values
        df_classification = pandas.read_csv(classification_file)
        classification_list = df_classification['class'].values
        print(classification_list)
        print(attribute_list)
        information_gain_star = info_gain.info_gain(classification_list, attribute_list)
    else:
        attribute = rules.attributes(rule_set, rule_number)
        df_dataset = pandas.read_csv(dataset)
        user_id_list = list(set(df_dataset['user_id'].values))
        df_classification = pandas.read_csv(classification_file)
        attr_values = []
        real_classes = []
        for user_id in user_id_list:
            df_user = df_dataset.loc[df_dataset['user_id'] == user_id]
            attribute_list = df_user[attribute].values
            attribute_value = attribute_list[0]
            number_satisfied = 0
            if rule_number == 22:
                number_satisfied = len(df_user[attribute].unique())
            elif rule_number == 3 and rule_set == 'social_bakers':
                number_satisfied = df_user[attribute].value_counts().max()
            else:
                for attr in attribute_list:
                    rule_output = rules.rules(rule_set, rule_number, attr)
                    if rule_output == 1:
                        number_satisfied += 1
            attr_values.append(number_satisfied)
            df_class = df_classification.loc[df_classification['id'] == user_id]
            real_class = df_class['class'].values[0]
            real_classes.append(real_class)
        information_gain_star = info_gain.info_gain(real_classes, attr_values)
    return information_gain_star
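# Hedged illustration, not part of the original code: info_gain.info_gain from the
# info_gain package computes the information gain IG = H(labels) - H(labels | attribute),
# which equals the mutual information between the two sequences and is therefore
# symmetric in its arguments (unlike info_gain_ratio). A tiny sanity check with an
# attribute that fully determines the class:
#
#   from info_gain import info_gain
#   classes = [0, 0, 1, 1]
#   attribute = ['a', 'a', 'b', 'b']
#   print(info_gain.info_gain(classes, attribute))  # equals H(classes), since H(classes | attribute) = 0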
def feature2():
    dataset = pd.read_csv(BAS)
    dataset_tweets = pd.read_csv(BAS_TWEETS)
    dataset_tweets.rename(columns={'Unnamed: 0': 'user_id'}, inplace=True)
    users_id = dataset['id'].values
    users_id_tweets = dataset_tweets['user_id'].values
    users_id_tweets_list = users_id_tweets.tolist()
    tmp = []
    tweets_count = []
    # Check whether each ID appears at least 20 times in users_id_tweets
    for id in users_id:
        count = users_id_tweets_list.count(id)
        if count >= 20:
            tmp.append(1)
        else:
            tmp.append(0)
        tweets_count.append(count)
    ig = info_gain.info_gain(tmp, tweets_count)
    print("Information Gain: " + str(ig))
    class_list = utils.read_dataset()
    print("Correlation coefficient: " + str(corrcoef(tweets_count, class_list)[0][1]))
    return tmp
def feature3():
    print("Reading datasets...")
    dataset = pd.read_csv(BAS)
    dataset_tweets = pd.read_csv(BAS_TWEETS)
    dataset_tweets.rename(columns={'Unnamed: 0': 'user_id'}, inplace=True)
    print("Done")
    users_id = dataset['id'].values
    temp = []
    similarities = []
    for i in range(len(users_id)):
        print(i)
        all_user_tweets = dataset_tweets['text'].loc[dataset_tweets['user_id'] == users_id[i]]
        similarities.append(utils.message_similarity(all_user_tweets))
    for similarity in similarities:
        if similarity > 100:
            temp.append(0)
        else:
            temp.append(1)
    ig = info_gain.info_gain(temp, similarities)
    print("Information Gain: " + str(ig))
    class_list = utils.read_dataset()
    print("Correlation coefficient: " + str(corrcoef(similarities, class_list)[0][1]))
    return temp
def info_gain_calculate(tf_tweets_stems, classes):
    keys = list(tf_tweets_stems[0].keys())
    info_gains = {}
    # For every stem, collect its term-frequency value across all tweets and
    # compute the information gain against the class labels.
    for key in keys:
        values = [tweet_stems[key] for tweet_stems in tf_tweets_stems]
        ig = info_gain.info_gain(classes, values)
        info_gains[key] = ig
        # if ig > 0.02:
        #     print(key + ': ' + str(ig))
    return info_gains
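# Hedged usage sketch (the toy data below is illustrative, not from the original):
# info_gain_calculate expects one term-frequency dict per tweet, all sharing the
# same stem keys, plus a parallel list of class labels.
#
#   tf_tweets_stems = [
#       {'buy': 2, 'free': 1},   # tweet 0
#       {'buy': 0, 'free': 0},   # tweet 1
#       {'buy': 1, 'free': 3},   # tweet 2
#   ]
#   classes = [1, 0, 1]
#   info_gains = info_gain_calculate(tf_tweets_stems, classes)  # {'buy': ..., 'free': ...}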
def extract_metafeature(a):
    # statistical (3)
    # print(mean(a.kurtosis()))
    # print(mean(a.skew()))
    # print(mean(a.mean()))
    from sklearn.feature_selection import mutual_info_classif
    from info_gain import info_gain

    # Note: X and y come from the module-level `df` (target in column 2000),
    # not from the argument `a`, which is only used for the simple/statistical
    # meta-features below.
    y = df[2000]
    X = df.drop(2000, axis=1)
    ft2 = pd.DataFrame({
        # simple
        'nr_instances': [len(a)],
        'nr_features': [len(a.columns)],
        'nr_missing_values': [a.isnull().sum().sum()],
        # statistical
        # "max_value": [a.values.max()],
        # "min_value": [a.values.min()],
        'mean_kurtosis': [mean(a.kurtosis())],
        'mean_skewness': [mean(a.skew())],
        'mean': [mean(a.mean())],
        # information_theoretic
        # 'MI': [mean(mutual_info_classif(X, y))],
        # model_based
        'Info_gain': [info_gain.info_gain(X, y)],
        # 'Intrinsic_value': [info_gain.intrinsic_value(X, y)],
        'Inf_gain_ratio': [info_gain.info_gain_ratio(X, y)]
    })
    return ft2
def feature9():
    dataset = pd.read_csv(BAS, low_memory=False)
    user_ids = dataset['id'].values
    current_year = 2015
    ratios = []
    temp = []
    for i in range(len(user_ids)):
        friends = dataset['friends_count'].loc[dataset['id'] == user_ids[i]].values[0]
        created = dataset['created_at'].loc[dataset['id'] == user_ids[i]].values[0]
        # The year is the sixth whitespace-separated field of the created_at timestamp.
        year = created.split()[5]
        difference = current_year - int(year)
        ratios.append(friends / difference)
    for ratio in ratios:
        if ratio > 100:
            temp.append(0)
        else:
            temp.append(1)
    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))
    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " + str(corrcoef(ratios, class_list)[0][1]))
    return temp
def calc_gains(data):
    # Relies on the module-level `labels` vector and `get_column` helper.
    gains = []
    amount_of_columns = data.shape[1]
    for column_index in range(0, amount_of_columns):
        print(column_index)
        column = np.array(get_column(data, column_index))
        gain = info_gain.info_gain(labels, column)
        gains.append([column_index, gain])
    return gains
def calculate_info_gain(self):
    self.gain = []
    for index, feature in enumerate(self.features):
        self.gain.append([info_gain.info_gain(feature, self.labels), index])
    self.gain.sort(key=self.take_first, reverse=True)
    print("=======================================================")
    print("{first} {second}".format(first=self.gain[0], second=self.gain[1]))
    print("=======================================================")
def get_info_gain(train, classes):
    global info_gain_res
    # info_gain_res = dict(zip(col_name, mutual_info_classif(traindf, train_class, discrete_features=True)))
    for i in range(PIXELS):
        if i == 524:
            continue
        # train[str(i)] accesses the dataframe column, e.g. train['0']
        info_gain_res.append(info_gain.info_gain(classes, train[str(i)]))
    # info_gain_res = mutual_info_classif(traindf, train_class, discrete_features=True)
    with open('./info_gain.txt', 'w') as f:
        print(info_gain_res, file=f)
    return info_gain_res
def extract_metafeature(a):
    y = a[a.columns[-1]]
    X = a[a.columns[:-1]]
    return {
        # simple
        'nr_instances': len(a),
        'nr_features': len(a.columns),
        'nr_missing_values': a.isnull().sum().sum(),
        'mean_kurtosis': mean(a.kurtosis()),
        'mean_skewness': mean(a.skew()),
        'mean': mean(a.mean()),
        'Info_gain': info_gain.info_gain(X, y),
        'Inf_gain_ratio': info_gain.info_gain_ratio(X, y)
    }
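# Hedged usage sketch (the toy frame below is illustrative, not from the original):
# `a` is expected to carry the feature columns first and the class/target as its
# last column. Whether info_gain accepts a whole feature frame as its first
# argument follows the original call and is not verified here.
#
#   toy = pd.DataFrame({'f1': [1, 2, 3, 4], 'f2': [0, 1, 0, 1], 'class': [0, 0, 1, 1]})
#   meta = extract_metafeature(toy)
#   print(meta['nr_instances'], meta['nr_features'])   # 4 2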
def feature1():
    dataset = pd.read_csv(BAS)
    temp_list = []
    friends_list = dataset['friends_count'].values
    for friends_count in friends_list:
        if friends_count >= 1000:
            temp_list.append(1)
        else:
            temp_list.append(0)
    ig = info_gain.info_gain(temp_list, friends_list)
    print("INFORMATION GAIN: " + str(ig))
    class_list = utils.read_dataset()
    print("Correlation coefficient: " + str(corrcoef(friends_list, class_list)[0][1]))
    return temp_list
def feature4():
    dataset = pd.read_csv(BAS)
    dataset_tweets = pd.read_csv(BAS_TWEETS)
    dataset_tweets.rename(columns={'Unnamed: 0': 'user_id'}, inplace=True)
    users_id = dataset['id'].values
    url_ratios = []
    temp = []
    for id in users_id:
        user_tweets = dataset_tweets['text'].loc[dataset_tweets['user_id'] == id]
        tweet_url_count = 0
        for tweet in user_tweets:
            if re.findall(
                    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                    str(tweet)):
                tweet_url_count += 1
        try:
            ratio = tweet_url_count / len(user_tweets)
        except ZeroDivisionError:
            ratio = 0
        print(ratio)
        url_ratios.append(ratio)
    for ratio in url_ratios:
        if ratio >= 0.6:
            temp.append(0)
        else:
            temp.append(1)
    ig = info_gain.info_gain(temp, url_ratios)
    print("Information Gain: " + str(ig))
    class_list = utils.read_dataset()
    print("Correlation coefficient: " + str(corrcoef(url_ratios, class_list)[0][1]))
    return temp
def feature5():
    dataset = pd.read_csv(BAS)
    friends_list = dataset['friends_count'].values
    followers_list = dataset['followers_count'].values
    ratios = []
    temp = []
    for i in range(0, len(friends_list)):
        try:
            ratio = (friends_list[i] / (followers_list[i]**2))
        except (RuntimeWarning, ZeroDivisionError):
            ratio = 0
        ratios.append(ratio)
    # Replace NaN and infinite ratios with 0.
    for i in range(len(ratios)):
        if isnan(ratios[i]) or isinf(ratios[i]):
            ratios[i] = 0
    for ratio in ratios:
        if ratio < 0.1:
            temp.append(1)
        else:
            temp.append(0)
    ig = info_gain.info_gain(temp, ratios)
    print("Information Gain: " + str(ig))
    class_list = utils.read_dataset()
    print("Correlation coefficient: " + str(corrcoef(ratios, class_list)[0][1]))
    return temp
def feature1():
    dataset = pd.read_csv(BAS)
    creation_date = dataset['created_at'].values
    current_year = 2020
    temp = []
    age = []
    for date in creation_date:
        # The year is the sixth whitespace-separated field of the created_at timestamp.
        year = date.split()[5]
        difference = current_year - int(year)
        if difference < 8:
            temp.append(0)
        else:
            temp.append(1)
        age.append(difference)
    ig = info_gain.info_gain(temp, age)
    print("INFORMATION GAIN: " + str(ig))
    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " + str(corrcoef(age, class_list)[0][1]))
    return temp
def hierarchy_based_filter(df, label_column, G=None, threshold=0.99, metric="info_gain",
                           pruning=True, all_remove=True, progress=True, **kwargs):
    """Feature selection approach, namely SHSEL, including the initial selection
    algorithm and the pruning algorithm. Identifies and filters out ranges of nodes
    with similar relevance in each branch of the hierarchy.

    Ristoski, P. and Paulheim, H., 2014, October. Feature selection in hierarchical
    feature spaces. In International Conference on Discovery Science (pp. 288-300).
    Springer, Cham.

    Args:
        df (pd.DataFrame): Dataframe containing the original features and the class column.
        label_column (str): Name of the output/class column.
        G (nx.DirectedGraph, optional): The directed graph of all classes and
            superclasses can be specified here; if None, the function looks for the
            graph in the pd.DataFrame.attrs["hierarchy"] attribute of the input
            dataframe. Defaults to None.
        threshold (float, optional): Relevance similarity threshold set by the user;
            0.99 is recommended. Defaults to 0.99.
        metric (str/func, optional): Relevance similarity metric, either information
            gain or correlation ("info_gain"/"correlation"). A custom metric function
            can also be passed. Defaults to "info_gain".
        pruning (bool, optional): Whether to apply the pruning algorithm. If True,
            from the previously reduced set only the features whose information gain
            is greater than the average are kept. Defaults to True.
        all_remove (bool, optional): Only valid when pruning is True. Whether to
            strictly remove all nodes whose info gain value is smaller than the
            average info gain of their paths. Defaults to True.
        progress (bool, optional): If True, progress bars are shown to inform the
            user about the progress made by the process. Defaults to True.

    Returns:
        pd.DataFrame: Filtered dataframe containing the selected attributes.
    """
    # Take the graph attached to df or the one provided by the user.
    if G is None:
        G = df.attrs["hierarchy"].copy()
    elif G:
        G = G.copy()
    else:
        raise RuntimeError("""No hierarchy graph found. It should either be attached
            to the dataframe in df.attrs['hierarchy'] or passed in the G argument.""")

    df = df.copy()

    # Delete and save prefix strings, e.g. "uri_bool_", to comply with the graph.
    prefix_cols = [col for col in df.columns if re.findall("http:", col)]
    prefix_cols_stripped = [
        re.sub(r"^.*?http://", "http://", col) for col in prefix_cols
    ]
    renaming_dict = dict(zip(prefix_cols_stripped, prefix_cols))

    # Preparation
    df.columns = [re.sub(r"^.*?http://", "http://", col) for col in df.columns]

    # Save the class column and the columns without features for later.
    label_column = re.sub(r"^.*?http://", "http://", label_column)
    non_class_cols = list(set(df.columns) - set(G.nodes) - set([label_column]))
    class_col = df.loc[:, label_column]
    non_class_df = df.loc[:, non_class_cols]

    # Main part
    df_from_hierarchy = add_hierarchy_columns(df, G, keep_prefix=False)
    G = G.reverse()
    if not nx.is_directed_acyclic_graph(G):
        raise TypeError(
            "The Hierarchy Based Filter is designed for directed acyclic graphs (DAGs).")

    node_availability = {}
    ig_values = []
    if progress:
        iterator = tqdm(list(G.nodes()), desc="Hierarchy Based Filter: Initial Selection")
    else:
        iterator = list(G.nodes())
    for node in iterator:
        node_availability[node] = True
        ig = info_gain.info_gain(df_from_hierarchy[label_column], df_from_hierarchy[node])
        ig_values.append(ig)
    node_values = dict(zip(G.nodes, ig_values))

    # Main structure of the initial selection: start from the leaves.
    L = [x for x in G.nodes() if G.out_degree(x) == 0 and G.in_degree(x) > 0]
    for l in L:
        D = list(G.predecessors(l))  # direct ancestors of the current leaf l
        # Selection by similarity
        for d in D:
            if callable(metric):
                similarity = metric(df_from_hierarchy, l, d, **kwargs)
            elif metric == "info_gain":
                similarity = 1 - abs(node_values[d] - node_values[l])
            elif metric == "correlation":
                similarity = np.corrcoef(df_from_hierarchy[l], df_from_hierarchy[d])[0, 1]
            if similarity >= threshold or np.isnan(similarity):
                node_availability[l] = False
                break
        # Extend L by D
        newleaf = [d for d in D if d not in L]
        L.extend(newleaf)

    SF = [node for node in list(G.nodes()) if node_availability[node]]
    df_filtered = df_from_hierarchy.copy()
    for col in df_from_hierarchy.columns:
        if col not in SF or col not in df.columns:
            df_filtered.drop(col, axis=1, inplace=True)

    if pruning:
        df_filtered = prune(df_filtered, G, node_values, node_availability, L,
                            remove_flag=all_remove, progress=progress)

    df_filtered = pd.concat([non_class_df, class_col, df_filtered], axis=1)
    df_filtered.rename(columns=renaming_dict, inplace=True)

    return df_filtered
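# Hedged usage sketch, not from the original: the helpers add_hierarchy_columns and
# prune, as well as the edge direction expected in the hierarchy graph, are assumed
# to come from the surrounding library.
#
#   import networkx as nx
#   hierarchy = nx.DiGraph(...)                       # directed graph over the feature columns
#   data.attrs["hierarchy"] = hierarchy               # or pass it via the G argument
#   selected = hierarchy_based_filter(data, label_column="label",
#                                     threshold=0.99, metric="info_gain")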
def feature8():
    timenow = datetime.datetime.now()
    tweets_by_dataset = {
        'E13': pd.read_csv(E13_tweets),
        'FSF': pd.read_csv(FSF_tweets),
        'INT': pd.read_csv(INT_tweets),
        'TFP': pd.read_csv(TFP_tweets),
        'TWT': pd.read_csv(TWT_tweets),
    }
    dataset = pd.read_csv(BAS, low_memory=False)
    user_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values
    total = []
    temp = []
    for i in range(len(user_ids)):
        tweets_df = tweets_by_dataset.get(bas_dataset[i])
        if tweets_df is None:
            continue
        tweets = tweets_df['text'].loc[tweets_df['user_id'] == user_ids[i]]
        # Collect the user's tweets that mention "API" or "AutoBot" and measure
        # how similar they are to each other.
        api_tweets = [tweet for tweet in tweets
                      if "API" in str(tweet) or "AutoBot" in str(tweet)]
        total.append(utils.message_similarity(api_tweets))
    for i in range(len(total)):
        if isnan(total[i]):
            total[i] = 0
    for count in total:
        if count > 10:
            temp.append(0)
        else:
            temp.append(1)
    ig = info_gain.info_gain(temp, total)
    print("INFORMATION GAIN: " + str(ig))
    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " + str(corrcoef(total, class_list)[0][1]))
    timeend = datetime.datetime.now()
    print("TIME TAKEN: " + str(timeend - timenow))
def feature7():
    tweets_by_dataset = {
        'E13': pd.read_csv(E13_tweets),
        'FSF': pd.read_csv(FSF_tweets),
        'INT': pd.read_csv(INT_tweets),
        'TFP': pd.read_csv(TFP_tweets),
        'TWT': pd.read_csv(TWT_tweets),
    }
    dataset = pd.read_csv(BAS, low_memory=False)
    user_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    ratios = []
    temp = []
    for i in range(len(user_ids)):
        print(i)
        tweets_df = tweets_by_dataset.get(bas_dataset[i])
        if tweets_df is None:
            continue
        tweets = tweets_df['text'].loc[tweets_df['user_id'] == user_ids[i]]
        # Tweets that mention "API" or "AutoBot" ...
        api_tweets = [tweet for tweet in tweets
                      if "API" in str(tweet) or "AutoBot" in str(tweet)]
        # ... and, among those, how many contain a URL.
        api_tweetsurl_count = 0
        for api_tweet in api_tweets:
            if re.findall(url_pattern, str(api_tweet)):
                api_tweetsurl_count += 1
        if api_tweetsurl_count == 0:
            ratios.append(0)
        else:
            ratios.append(api_tweetsurl_count / len(api_tweets))
    for ratio in ratios:
        if ratio > 0.8:
            temp.append(0)
        else:
            temp.append(1)
    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))
    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " + str(corrcoef(ratios, class_list)[0][1]))
def feature2():
    dataset = pd.read_csv('../datasets/BAS/bas_users.csv')
    followers_by_dataset = {
        'E13': pd.read_csv(E13_followers),
        'FSF': pd.read_csv(FSF_followers),
        'INT': pd.read_csv(INT_followers),
        'TFP': pd.read_csv(TFP_followers),
        'TWT': pd.read_csv(TWT_followers),
    }
    bas_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values
    bas_friends = dataset['friends_count'].values
    ratios = []
    temp = []
    for i in range(0, len(bas_ids)):
        count = 0
        print(i)
        try:
            followers = followers_by_dataset.get(bas_dataset[i])
            if followers is None:
                continue
            # Followers of this user; count how many the user follows back.
            followers_of_id = followers['source_id'].loc[
                followers['target_id'] == bas_ids[i]].values
            for id in followers_of_id:
                try:
                    forward = followers['target_id'].loc[
                        followers['source_id'] == id].values
                    if forward[0] == bas_ids[i]:
                        count += 1
                except KeyError:
                    pass
            ratio = count / bas_friends[i]
            ratios.append(ratio)
        except:
            pass
    for i in range(len(ratios)):
        if isnan(ratios[i]):
            ratios[i] = 0
    for ratio in ratios:
        if ratio < 0.5:
            temp.append(0)
        else:
            temp.append(1)
    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))
    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " + str(corrcoef(ratios, class_list)[0][1]))
    return temp
def feature6():
    tweets_by_dataset = {
        'E13': pd.read_csv(E13_tweets),
        'FSF': pd.read_csv(FSF_tweets),
        'INT': pd.read_csv(INT_tweets),
        'TFP': pd.read_csv(TFP_tweets),
        'TWT': pd.read_csv(TWT_tweets),
    }
    dataset = pd.read_csv(BAS, low_memory=False)
    user_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values
    ratios = []
    temp = []
    for i in range(len(user_ids)):
        api_tweets_count = 0
        print(i)
        tweets_df = tweets_by_dataset.get(bas_dataset[i])
        if tweets_df is None:
            continue
        sources_from_id = tweets_df['source'].loc[tweets_df['user_id'] == user_ids[i]]
        tweets_count = dataset['statuses_count'].loc[dataset['id'] == user_ids[i]].values
        # Count tweets whose source field mentions "API" or "AutoTwitter".
        for source_id in sources_from_id:
            if "API" in str(source_id) or "AutoTwitter" in str(source_id):
                api_tweets_count += 1
        if api_tweets_count == 0:
            ratios.append(0)
        else:
            ratios.append(tweets_count[0] / api_tweets_count)
    for i in range(len(ratios)):
        if isnan(ratios[i]):
            ratios[i] = 0
    for ratio in ratios:
        print(ratio)
        if ratio > 1.03:
            temp.append(0)
        else:
            temp.append(1)
    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))
    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " + str(corrcoef(ratios, class_list)[0][1]))
def feature5():
    followers_by_dataset = {
        'E13': pd.read_csv(E13_followers),
        'FSF': pd.read_csv(FSF_followers),
        'INT': pd.read_csv(INT_followers),
        'TFP': pd.read_csv(TFP_followers),
        'TWT': pd.read_csv(TWT_followers),
    }
    dataset = pd.read_csv(BAS)
    user_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values
    medians = []
    friends_count = []
    for i in range(len(user_ids)):
        print(i)
        id_followers = []
        followers = followers_by_dataset.get(bas_dataset[i])
        if followers is None:
            continue
        source_ids = followers['target_id'].loc[followers['source_id'] == user_ids[i]]
        friends = dataset['friends_count'].loc[dataset['id'] == user_ids[i]].values
        friends_count.append(friends[0])
        # Collect the followers counts reachable through this user's connections,
        # then keep the median for this user.
        for id in source_ids:
            source_source_ids = followers['target_id'].loc[
                followers['source_id'] == id].values
            for source_id in source_source_ids:
                followers_count = dataset['followers_count'].loc[
                    dataset['id'] == source_id].values
                if followers_count:
                    id_followers.append(followers_count)
        medians.append(median(id_followers))
    for i in range(len(medians)):
        if isnan(medians[i]):
            medians[i] = 0
    temp = []
    ratios = []
    for i in range(len(medians)):
        if medians[i] == 0:
            ratio = 0
        else:
            ratio = friends_count[i] / medians[i]
        ratios.append(ratio)
    for ratio in ratios:
        if ratio < 1.5:
            temp.append(1)
        else:
            temp.append(0)
    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))
    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " + str(corrcoef(ratios, class_list)[0][1]))
    return temp
def feature4():
    dataset = pd.read_csv(BAS)
    followers_by_dataset = {
        'E13': pd.read_csv(E13_followers),
        'FSF': pd.read_csv(FSF_followers),
        'INT': pd.read_csv(INT_followers),
        'TFP': pd.read_csv(TFP_followers),
        'TWT': pd.read_csv(TWT_followers),
    }
    bas_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values
    tweets_count = []
    global_tweets_count = []
    temp = []
    for i in range(len(bas_ids)):
        print(i)
        followers = followers_by_dataset.get(bas_dataset[i])
        if followers is None:
            continue
        id_followers = followers['target_id'].loc[
            followers['source_id'] == bas_ids[i]].values
        for follower in id_followers:
            tweets = dataset['statuses_count'].loc[dataset['id'] == follower].values
            if tweets:
                tweets_count.append(tweets)
        # Note: tweets_count is not reset per user, so this appends a running average.
        global_tweets_count.append(average(tweets_count))
    for mean_value in global_tweets_count:
        if mean_value < 9000:
            temp.append(0)
        else:
            temp.append(1)
    ig = info_gain.info_gain(temp, global_tweets_count)
    print("INFORMATION GAIN: " + str(ig))
    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " + str(corrcoef(global_tweets_count, class_list)[0][1]))
    return temp
def feature3():
    dataset = pd.read_csv(BAS)
    friends_by_dataset = {
        'E13': pd.read_csv(E13_friends),
        'FSF': pd.read_csv(FSF_friends),
        'INT': pd.read_csv(INT_friends),
        'TFP': pd.read_csv(TFP_friends),
        'TWT': pd.read_csv(TWT_friends),
    }
    bas_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values
    followers_count = []
    averages = []
    temp = []
    for i in range(len(bas_ids)):
        print(i)
        friends_df = friends_by_dataset.get(bas_dataset[i])
        if friends_df is None:
            continue
        friends = friends_df['target_id'].loc[friends_df['source_id'] == bas_ids[i]].values
        for friend in friends:
            friend_followers = dataset['followers_count'].loc[dataset['id'] == friend].values
            if friend_followers:
                followers_count.append(friend_followers)
        # Note: followers_count is not reset per user, so this appends a running average.
        averages.append(average(followers_count))
    for mean_value in averages:
        if mean_value < 25000:
            temp.append(0)
        else:
            temp.append(1)
    ig = info_gain.info_gain(temp, averages)
    print("INFORMATION GAIN: " + str(ig))
    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " + str(corrcoef(averages, class_list)[0][1]))
    return temp
def gain(classifier, attribute):
    return ig.info_gain(classifier, attribute)
print("entropy Type", entropyType) probDoors = [ float(Doors.count(c)) / len(Doors) for c in dict.fromkeys(list(Doors)) ] entropyDoors = -sum([p * math.log(p) / math.log(2.0) for p in probDoors]) print("entropy Doors", entropyDoors) probTyres = [ float(Tyres.count(c)) / len(Tyres) for c in dict.fromkeys(list(Tyres)) ] entropyTyres = -sum([p * math.log(p) / math.log(2.0) for p in probTyres]) print("entropy Tyres", entropyTyres) igcolor = info_gain.info_gain(Color, Class) print("Color Info Gain", igcolor) igtype = info_gain.info_gain(Type, Class) print("Type Info Gain", igtype) igdoors = info_gain.info_gain(Doors, Class) print("Doors Info Gain", igdoors) igtyres = info_gain.info_gain(Tyres, Class) print("Tyres Info Gain", igtyres)
def calculate_information_gain(classification_file, rule_set, rule_number):
    df_classification = pandas.read_csv(classification_file)
    output_list = df_classification['output'].values
    classification_list = df_classification['class'].values
    information_gain = info_gain.info_gain(classification_list, output_list)
    return information_gain
def get_info_gain_ranking(X, y):
    feat_gain = []
    for j in range(X.shape[1]):
        feat_gain.append(info_gain.info_gain(y, X[:, j]))
    return feat_gain
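# Hedged usage sketch (the arrays below are illustrative, not from the original):
# X is a 2D feature array and y the label vector; one info-gain score is returned
# per column of X.
#
#   import numpy as np
#   X = np.array([[0, 1], [0, 0], [1, 1], [1, 0]])
#   y = np.array([0, 0, 1, 1])
#   ranking = get_info_gain_ranking(X, y)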
                     rounded=True, filled=True)
# Gini decides which attribute/feature should be placed at the root node,
# which features will act as internal nodes or leaf nodes

# Create Graph from DOT data
graph = pydotplus.graph_from_dot_data(dot_data)

# Create Decision Tree PDF
graph.write_pdf("DT1_Breast_Cancer.pdf")

######################################
# Run an information gain evaluation
######################################
print('\nInformation Gain on Recurrence')

ig = info_gain.info_gain(df['recur_event'], df['Tumor_Size'])
print('\tTumor Size=', ig)
ig = info_gain.info_gain(df['recur_event'], df['Menopause'])
print('\tMenopause=', ig)
ig = info_gain.info_gain(df['recur_event'], df['Age_Range'])
print('\tAge Range=', ig)
ig = info_gain.info_gain(df['recur_event'], df['Degree_Malignant'])
print('\tDegree Malignant=', ig)
ig = info_gain.info_gain(df['recur_event'], df['inv_nodes'])
print('\tNumber Involved Nodes=', ig)
ig = info_gain.info_gain(df['recur_event'], df['breast_quad'])
# GaussianNB / 16 columns
from sklearn.naive_bayes import GaussianNB

g_nb = GaussianNB(priors=None)
g_nb_fit = g_nb.fit(x_train, y_train)
g_nb_pred = g_nb.predict(x_test)
print(confusion_matrix(y_test, g_nb_pred))
print('\n')
print(classification_report(y_test, g_nb_pred))

# info_gain, Gain_Ratio / 14 columns
!pip install info_gain
from info_gain import info_gain

noShow_plus = noShow.drop('Status', axis=1)
for item in noShow_plus:
    ig = info_gain.info_gain(noShow[item], noShow['Status'])
    igr = info_gain.info_gain_ratio(noShow[item], noShow['Status'])
    print("info_gain of %s:" % (item), ig)
    print("Gain_Ratio of %s:" % (item), igr)

# info_gain, Gain_Ratio / 16 columns
!pip install info_gain
from info_gain import info_gain

noShow_plus = noShow.drop('Status', axis=1)
for item in noShow_plus:
    ig = info_gain.info_gain(noShow[item], noShow['Status'])
    igr = info_gain.info_gain_ratio(noShow[item], noShow['Status'])
    print("info_gain of %s:" % (item), ig)
    print("Gain_Ratio of %s:" % (item), igr)
def tree_based_filter(df, label_column, G=None, metric="Lift", progress=True):
    """Filter attributes with Tree-Based Feature Selection (TSEL). TSEL selects
    the most valuable attributes from each path in the hierarchy, based on lift
    or information gain.

    Jeong, Y. and Myaeng, S.H., 2013, October. Feature selection using a semantic
    hierarchy for event recognition and type classification. In Proceedings of the
    Sixth International Joint Conference on Natural Language Processing (pp. 136-144).

    Args:
        df (pd.DataFrame): Dataframe with hierarchy (output of generator).
        label_column (str): Name of the column with the class/label.
        G (nx.DirectedGraph, optional): The directed graph of all classes and
            superclasses can be specified here; if None, the function looks for the
            graph in the pd.DataFrame.attrs["hierarchy"] attribute of the input
            dataframe. Defaults to None.
        metric (str/func, optional): Metric used to determine the representative
            features ("IG"/"Lift"). Defaults to "Lift".
        progress (bool, optional): If True, progress updates are shown to inform
            the user about the progress made by the process. Defaults to True.

    Returns:
        pd.DataFrame: Filtered dataframe containing the selected attributes.
    """
    df = df.copy()
    if G:
        G = G.copy()
    else:
        G = df.attrs["hierarchy"].copy()

    if progress:
        print("Tree Based Filter - (1/4) Initialization.")

    # Delete and save prefix strings, e.g. "uri_bool_", to comply with the graph.
    prefix_cols = [col for col in df.columns if re.findall("http:", col)]
    prefix_cols_stripped = [
        re.sub(r"^.*?http://", "http://", col) for col in prefix_cols
    ]
    renaming_dict = dict(zip(prefix_cols_stripped, prefix_cols))
    df.columns = [re.sub(r"^.*?http://", "http://", col) for col in df.columns]

    # Save the class column and the columns without features for later.
    label_column = re.sub(r"^.*?http://", "http://", label_column)
    non_class_cols = list(set(df.columns) - set(G.nodes) - set([label_column]))

    df_from_hierarchy = add_hierarchy_columns(df, G, keep_prefix=False)

    # TSEL is a top-down algorithm ==> the graph has to be reversed.
    G = G.reverse()

    # Add a virtual root node.
    roots_and_isolated_nodes = [
        x for x in G.nodes() if G.out_degree(x) >= 0 and G.in_degree(x) == 0
    ]
    for node in roots_and_isolated_nodes:
        G.add_edge("VRN", node)

    if progress:
        print("Tree Based Filter - (2/4) Calculate Metric Values.")

    if callable(metric):
        node_metrics = metric(df_from_hierarchy, G, label_column)
    elif metric == "IG":
        metrics = []
        for node in G.nodes:
            if node != "VRN":
                ig = info_gain.info_gain(df_from_hierarchy[label_column],
                                         df_from_hierarchy[node])
                metrics.append(ig)
        node_metrics = dict(zip(G.nodes, metrics))
    else:
        node_metrics = calculate_lift(df_from_hierarchy, G, label_column)

    representative_features = []

    # Traverse all paths.
    if progress:
        print("Tree Based Filter - (3/4) Get initial representative features.")
    for p in get_all_paths(G, "VRN"):
        # Select the representative feature of this path.
        feature = representative_feature(p, node_metrics)
        if feature not in representative_features:
            representative_features.append(feature)

    if progress:
        print("Tree Based Filter - (4/4) Update representative features.")

    # Loop over the representative features until no more updates are made.
    checkUpdated = True
    while checkUpdated == True:
        checkUpdated = False
        for feature in representative_features:
            # Loop over all descendants.
            for desc in nx.descendants(G, feature):
                # Check whether the descendant is a representative feature.
                if desc in representative_features:
                    representative_features.remove(feature)
                    # Loop over all direct child nodes of the feature.
                    for child in nx.neighbors(G, feature):
                        # Loop over all paths from the child to leaf nodes.
                        for p in get_all_paths(G, child):
                            # Select the representative feature of this path.
                            feature = representative_feature(p, node_metrics)
                            if feature not in representative_features:
                                representative_features.append(feature)
                    checkUpdated = True
                    break
            # Loop again if the representative nodes were updated.
            if checkUpdated == True:
                break

    if label_column in representative_features:
        representative_features.remove(label_column)

    df_filtered = df_from_hierarchy.loc[:, non_class_cols + [label_column] + representative_features]
    df_filtered.columns = non_class_cols + [label_column] + representative_features
    df_filtered.rename(columns=renaming_dict, inplace=True)

    return df_filtered
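# Hedged usage sketch, not from the original: the helpers add_hierarchy_columns,
# calculate_lift, get_all_paths and representative_feature are assumed to come from
# the surrounding library; the hierarchy graph is attached as with the filter above.
#
#   data.attrs["hierarchy"] = hierarchy
#   selected = tree_based_filter(data, label_column="label", metric="IG", progress=False)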
imputed = imputer.predict(data)
newdata1.loc[np.where(pd.isna(data[i]) == True)[0], i] = imputed.iloc[
    np.where(pd.isna(data[i]) == True)[0], len(imputed.columns) - 2]

orig = []
tech = []
deep = []
missing = []
var = []
for i in data.columns:
    # Compare the information gain on the original vs. the imputed data for
    # non-numeric columns that contain missing values.
    if type_var[i] not in ['int64', 'float64'] and sum(pd.isna(data[i]) == True) > 0:
        var.append(i)
        orig.append(info_gain.info_gain(list(data[target]), list(data[i])))
        tech.append(info_gain.info_gain(list(newdata[target]), list(newdata[i])))
        missing.append(sum(pd.isna(data[i]) == True) / len(data))
        ixx = np.where(pd.isna(data[i]) == True)[0]
        newdata.loc[ixx, i]   # no-op lookup, presumably left over from debugging
        newdata1.loc[ixx, i]  # no-op lookup, presumably left over from debugging

result = pd.DataFrame({
    'size_missing': missing,
    'Var': var,
    'Orig': orig,
    'Tech': tech
})
newdata2 = data.copy()