def calculate_information_gain_star(dataset, classification_file, rule_set, rule_number):
    """Return ``info_gain.info_gain`` of the class labels versus a rule's values.

    Two modes are selected by whether the substring ``'users'`` occurs in
    ``dataset``:

    * ``'users'`` present: the CSV is treated as one row per account. If the
      rule's attribute (``rules.attributes``) is a list of columns, every row
      of those columns is passed to ``rules.rules`` and mapped to 1 (rule
      returned 1) or 0; otherwise the raw column values are used unchanged.
      The labels come from the ``class`` column of ``classification_file`` in
      file order. Both arrays are printed to stdout before the computation.
    * otherwise: the CSV is grouped by its ``user_id`` column and one number is
      produced per distinct user, paired with that user's label looked up via
      the ``id`` column of ``classification_file``.

    Args:
        dataset: Path handed to ``pandas.read_csv``; also used for the
            ``'users'`` substring test.
        classification_file: Path of a CSV with a ``class`` column (and an
            ``id`` column in the per-user mode).
        rule_set: Rule-set name forwarded to the ``rules`` module.
        rule_number: Rule number forwarded to the ``rules`` module.

    Returns:
        Whatever ``info_gain.info_gain(labels, values)`` returns.

    Raises:
        IndexError: In the per-user mode, when a user id has no matching rows
            in the dataset or no matching ``id`` in the classification file.
    """
    if 'users' not in dataset:
        column = rules.attributes(rule_set, rule_number)
        records = pandas.read_csv(dataset)
        distinct_users = list(set(records['user_id'].values))
        labels = pandas.read_csv(classification_file)

        per_user_values = []
        per_user_classes = []
        for uid in distinct_users:
            user_rows = records.loc[records['user_id'] == uid]
            user_values = user_rows[column].values
            # Never read afterwards; kept because the indexing itself raises
            # IndexError when the selection for this user is empty.
            _first_value = user_values[0]

            if rule_number == 22:
                # Rule 22 counts the distinct values seen for the user.
                score = len(user_rows[column].unique())
            elif rule_number == 3 and rule_set == 'social_bakers':
                # Frequency of the user's most repeated value.
                score = user_rows[column].value_counts().max()
            else:
                # Number of the user's rows for which the rule returns 1.
                score = sum(
                    1
                    for value in user_values
                    if rules.rules(rule_set, rule_number, value) == 1
                )
            per_user_values.append(score)

            label_rows = labels.loc[labels['id'] == uid]
            per_user_classes.append(label_rows['class'].values[0])

        return info_gain.info_gain(per_user_classes, per_user_values)

    accounts = pandas.read_csv(dataset)
    column = rules.attributes(rule_set, rule_number)
    if isinstance(column, list):
        # Multi-column rule: reduce every row to a 0/1 "rule satisfied" flag.
        attribute_values = [
            1 if rules.rules(rule_set, rule_number, row_values) == 1 else 0
            for row_values in accounts[column].values
        ]
    else:
        attribute_values = accounts[column].values

    labels = pandas.read_csv(classification_file)
    class_values = labels['class'].values
    print(class_values)
    print(attribute_values)
    return info_gain.info_gain(class_values, attribute_values)
def check_rule(rule_set, number, row):
    """Evaluate rule ``number`` of ``rule_set`` against one record.

    The attribute(s) the rule needs are obtained from ``rules.attributes``.
    When that is a list, the matching entries of ``row`` are collected into a
    list in the same order; otherwise the single entry ``row[attribute]`` is
    used.

    NOTE(review): a second ``check_rule`` taking an extra ``rules_dict``
    argument follows this definition in the file; if both live in the same
    module the later one replaces this one -- confirm which is intended.

    Args:
        rule_set: Rule-set name forwarded to the ``rules`` module.
        number: Rule number forwarded to the ``rules`` module.
        row: Mapping-like record indexed by attribute name.

    Returns:
        The value returned by ``rules.rules`` for the extracted input.
    """
    wanted = rules.attributes(rule_set, number)
    if isinstance(wanted, list):
        rule_input = [row[name] for name in wanted]
    else:
        rule_input = row[wanted]
    return rules.rules(rule_set, number, rule_input)
def check_rule(rule_set, number, rules_dict, row):
    """Evaluate a rule on ``row`` and record a hit in ``rules_dict``.

    The rule input is built from ``row`` using the attribute name(s) returned
    by ``rules.attributes``: a list of values when several attributes are
    required, a single value otherwise. If ``rules.rules`` returns 1, the key
    ``'rule_<number>'`` is set to ``True`` in ``rules_dict``; in every other
    case the dictionary is left untouched.

    NOTE(review): an earlier ``check_rule`` with the signature
    ``(rule_set, number, row)`` precedes this definition in the file; if both
    live in the same module this one shadows it -- confirm.

    Args:
        rule_set: Rule-set name forwarded to the ``rules`` module.
        number: Rule number forwarded to the ``rules`` module.
        rules_dict: Dictionary updated in place with satisfied rules.
        row: Mapping-like record indexed by attribute name.

    Returns:
        The same ``rules_dict`` object that was passed in.
    """
    wanted = rules.attributes(rule_set, number)
    if not isinstance(wanted, list):
        rule_input = row[wanted]
    else:
        rule_input = [row[name] for name in wanted]

    if rules.rules(rule_set, number, rule_input) == 1:
        rules_dict['rule_' + str(number)] = True
    return rules_dict
def check_vdb_rule(rule_set, number, row, df_row):
    """Evaluate rule ``number`` of ``rule_set``, with special inputs for rules 3 and 4.

    The input handed to ``rules.rules`` depends on ``number``:

    * ``4``: a two-element list ``[row[attribute], df_row[attribute]]``.
    * ``3``: ``df_row`` itself (the attribute lookup result is not used).
    * anything else: the value(s) of ``row`` for the rule's attribute(s) --
      a list when ``rules.attributes`` returns a list, a single value
      otherwise.

    Args:
        rule_set: Rule-set name forwarded to the ``rules`` module.
        number: Rule number forwarded to the ``rules`` module.
        row: Mapping-like record indexed by attribute name.
        df_row: Second data object indexed by attribute name for rule 4 and
            passed through whole for rule 3 (presumably a DataFrame or a
            subset of one -- TODO confirm against callers).

    Returns:
        The value returned by ``rules.rules`` for the constructed input.
    """
    # Looked up for every rule number, including 3 where it is not needed,
    # so that any error raised by the lookup still surfaces as before.
    attribute = rules.attributes(rule_set, number)

    if number == 4:
        attribute_value = [row[attribute], df_row[attribute]]
    elif number == 3:
        attribute_value = df_row
    elif isinstance(attribute, list):
        attribute_value = [row[attr] for attr in attribute]
    else:
        attribute_value = row[attribute]

    return rules.rules(rule_set, number, attribute_value)
def calculate_pearson_correlation_coefficient_star(bas_dataset, classification_file, rule_set, rule_number):
    """Return the Pearson correlation matrix between a rule's values and the class.

    Three cases are distinguished:

    1. ``'users'`` occurs in ``bas_dataset`` and the rule is one of
       camsani_calzolari 2/3/4/9 or van_den_beld 1/4 (non-numerical
       attribute): every value of the attribute column is passed through
       ``rules.rules`` and turned into 1 (rule returned 1) or 0.
    2. ``'users'`` occurs in ``bas_dataset`` for any other rule (numerical
       attribute): the attribute column is used directly with missing values
       replaced by 0.
    3. otherwise (tweets dataset): rows are grouped by ``user_id`` and one
       number per distinct user is computed, paired with the label rows whose
       ``id`` equals that user id.

    In all cases the attribute data and the ``class`` data are concatenated
    column-wise and ``DataFrame.corr(method='pearson')`` is returned.

    Args:
        bas_dataset: Path handed to ``pandas.read_csv``; also used for the
            ``'users'`` substring test.
        classification_file: Path of a CSV with a ``class`` column (and an
            ``id`` column in the tweets case).
        rule_set: Rule-set name forwarded to the ``rules`` module.
        rule_number: Rule number forwarded to the ``rules`` module.

    Returns:
        The DataFrame produced by ``DataFrame.corr(method='pearson')``.

    Raises:
        IndexError: In the tweets case, when the selection for a user id
            contains no rows.
    """
    on_users_dataset = 'users' in bas_dataset
    non_numerical_rule = on_users_dataset and (
        (rule_number in [2, 3, 4, 9] and rule_set == 'camsani_calzolari')
        or (rule_number in [1, 4] and rule_set == 'van_den_beld')
    )

    attribute = rules.attributes(rule_set, rule_number)
    records = pandas.read_csv(bas_dataset)

    if non_numerical_rule:
        # Pearson correlation coefficient * for a non-numerical attribute of
        # the users dataset: encode "rule satisfied" as 1/0 per row.
        satisfied_flags = []
        for value in records[attribute].values:
            if rule_number == 4:
                # Rule 4 receives the row value together with the whole column.
                outcome = rules.rules(rule_set, rule_number, [value, records[attribute]])
            else:
                outcome = rules.rules(rule_set, rule_number, value)
            satisfied_flags.append(1 if outcome == 1 else 0)
        attribute_part = pandas.DataFrame(satisfied_flags, columns =[attribute])
        class_part = pandas.read_csv(classification_file)['class']
    elif on_users_dataset:
        # Pearson correlation coefficient * for a numerical attribute of the
        # users dataset: use the column as is, with missing values set to 0.
        attribute_part = records[attribute].fillna(0)
        class_part = pandas.read_csv(classification_file)['class']
    else:
        # Pearson correlation coefficient * for an attribute of the tweets
        # dataset: aggregate to one number per user.
        distinct_users = list(set(records['user_id'].values))
        labels = pandas.read_csv(classification_file)
        per_user_values = []
        per_user_classes = []
        for uid in distinct_users:
            user_rows = records.loc[records['user_id'] == uid]
            user_values = user_rows[attribute].values
            # Never read afterwards; kept because the indexing itself raises
            # IndexError when the selection for this user is empty.
            _first_value = user_values[0]

            if rule_number == 22:
                # Rule 22 counts the distinct values seen for the user.
                score = len(user_rows[attribute].unique())
            elif rule_number == 3 and rule_set == 'social_bakers':
                # Frequency of the user's most repeated value.
                score = user_rows[attribute].value_counts().max()
            else:
                # Number of the user's rows for which the rule returns 1.
                score = sum(
                    1
                    for value in user_values
                    if rules.rules(rule_set, rule_number, value) == 1
                )
            per_user_values.append(score)

            label_rows = labels.loc[labels['id'] == uid]
            # NOTE(review): the whole array of matching labels is appended,
            # not its first element; the one-column DataFrame built below
            # presumably relies on each id matching a single classification
            # row -- TODO confirm.
            per_user_classes.append(label_rows['class'].values)
        attribute_part = pandas.DataFrame(per_user_values, columns =[attribute])
        class_part = pandas.DataFrame(per_user_classes, columns =['class'])

    combined = pandas.concat([attribute_part, class_part], axis=1)
    return combined.corr(method='pearson')