Пример #1
0
def calculate_information_gain_star(dataset, classification_file, rule_set, rule_number):
    """Compute the information gain between a rule's attribute and the classes.

    When the string ``'users'`` occurs in ``dataset`` every row of the file
    is treated as one sample.  Otherwise the rows are grouped by their
    ``user_id`` column and reduced to a single value per user before the
    information gain is computed.

    Args:
        dataset: Path of the CSV file holding the attribute column(s).  The
            substring ``'users'`` in it selects the per-row branch.
        classification_file: Path of the CSV file with a ``class`` column
            (and an ``id`` column, which the grouped branch matches against
            ``user_id``).
        rule_set: Rule-set name forwarded to ``rules.attributes`` and
            ``rules.rules``.
        rule_number: Rule number forwarded to the same two functions.

    Returns:
        The value returned by ``info_gain.info_gain`` for the collected
        classes and attribute values.

    Raises:
        IndexError: In the grouped branch, when a user id has no matching
            ``id`` row in the classification file.
    """
    attribute = rules.attributes(rule_set, rule_number)
    df_dataset = pandas.read_csv(dataset)
    df_classification = pandas.read_csv(classification_file)

    if 'users' in dataset:
        if isinstance(attribute, list):
            # Several columns feed the rule: evaluate it row by row and keep
            # a 0/1 flag telling whether the rule was satisfied.
            attribute_values = [
                1 if rules.rules(rule_set, rule_number, attr) == 1 else 0
                for attr in df_dataset[attribute].values
            ]
        else:
            # Single column: the raw values are used directly.
            attribute_values = df_dataset[attribute].values
        classification_list = df_classification['class'].values
        return info_gain.info_gain(classification_list, attribute_values)

    attr_values = []
    real_classes = []
    # One pass over the data instead of a full-frame boolean mask per user.
    # groupby leaves out rows whose user_id is missing.
    for user_id, df_user in df_dataset.groupby('user_id', sort=False):
        user_attribute = df_user[attribute]
        if rule_number == 22:
            # Number of distinct attribute values for this user.
            number_satisfied = len(user_attribute.unique())
        elif rule_number == 3 and rule_set == 'social_bakers':
            # Occurrence count of the user's most frequent attribute value.
            number_satisfied = user_attribute.value_counts().max()
        else:
            # Number of this user's rows for which the rule returns 1.
            number_satisfied = sum(
                1
                for attr in user_attribute.values
                if rules.rules(rule_set, rule_number, attr) == 1
            )
        attr_values.append(number_satisfied)

        df_class = df_classification.loc[df_classification['id'] == user_id]
        # First matching classification row wins.
        real_classes.append(df_class['class'].values[0])

    return info_gain.info_gain(real_classes, attr_values)
def check_rule(rule_set, number, row):
    """Evaluate one rule against a single row and return the rule's output.

    The column name(s) for the rule come from ``rules.attributes``.  When a
    list of names is returned, the matching values of ``row`` are collected
    into a list (in the same order); otherwise the single value is used.

    NOTE(review): SOURCE also contains a later four-argument ``check_rule``;
    if both live in the same module, that one replaces this definition --
    confirm which is intended.

    Args:
        rule_set: Rule-set name forwarded to ``rules``.
        number: Rule number forwarded to ``rules``.
        row: Object indexable by column name.

    Returns:
        Whatever ``rules.rules`` returns for the extracted value(s).
    """
    columns = rules.attributes(rule_set, number)
    if isinstance(columns, list):
        value = [row[column] for column in columns]
    else:
        value = row[columns]
    return rules.rules(rule_set, number, value)
def check_rule(rule_set, number, rules_dict, row):
    """Evaluate one rule against a row and record a hit in ``rules_dict``.

    The column name(s) for the rule come from ``rules.attributes``; a list
    of names yields a list of row values, a single name yields one value.
    When ``rules.rules`` returns 1 for that input, the key
    ``'rule_<number>'`` is set to ``True`` in ``rules_dict``.

    Args:
        rule_set: Rule-set name forwarded to ``rules``.
        number: Rule number forwarded to ``rules``; also used in the key.
        rules_dict: Mapping updated in place when the rule is satisfied.
        row: Object indexable by column name.

    Returns:
        The same ``rules_dict`` object that was passed in.
    """
    columns = rules.attributes(rule_set, number)
    if isinstance(columns, list):
        value = [row[column] for column in columns]
    else:
        value = row[columns]

    satisfied = rules.rules(rule_set, number, value) == 1
    if satisfied:
        rules_dict['rule_' + str(number)] = True
    return rules_dict
def check_vdb_rule(rule_set, number, row, df_row):
    """Evaluate one rule, building its input according to the rule number.

    Input passed to ``rules.rules``:
      * ``number == 4``: the pair ``[row[attribute], df_row[attribute]]``;
      * ``number == 3``: ``df_row`` itself;
      * otherwise: the value(s) of ``row`` for the attribute name(s)
        returned by ``rules.attributes`` (a list of names gives a list of
        values).

    Args:
        rule_set: Rule-set name forwarded to ``rules``.
        number: Rule number forwarded to ``rules``.
        row: Object indexable by column name.
        df_row: Second data object; indexed by the attribute for rule 4 and
            passed through unchanged for rule 3.

    Returns:
        Whatever ``rules.rules`` returns for the constructed input.
    """
    column = rules.attributes(rule_set, number)

    if number == 4:
        value = [row[column], df_row[column]]
    elif number == 3:
        value = df_row
    elif isinstance(column, list):
        value = [row[name] for name in column]
    else:
        value = row[column]

    return rules.rules(rule_set, number, value)
Пример #5
0
def calculate_pearson_correlation_coefficient_star(bas_dataset, classification_file, rule_set, rule_number):
    """Compute the Pearson correlation between a rule's attribute and the class.

    Three cases are handled:
      * ``'users'`` occurs in ``bas_dataset`` and the rule is one of
        camsani_calzolari 2/3/4/9 or van_den_beld 1/4: the rule is evaluated
        per row and the resulting 0/1 flags are correlated with the class;
      * ``'users'`` occurs in ``bas_dataset`` otherwise: the attribute column
        is used directly, with missing values replaced by 0;
      * any other dataset: rows are grouped by ``user_id`` and reduced to
        one value per user, which is correlated with that user's class.

    Args:
        bas_dataset: Path of the CSV file holding the attribute column.
        classification_file: Path of the CSV file with a ``class`` column
            (and an ``id`` column, matched against ``user_id`` in the
            grouped case).
        rule_set: Rule-set name forwarded to ``rules.attributes`` and
            ``rules.rules``.
        rule_number: Rule number forwarded to the same two functions.

    Returns:
        The DataFrame produced by ``DataFrame.corr(method='pearson')`` on
        the attribute and class columns placed side by side.
    """
    attribute = rules.attributes(rule_set, rule_number)
    df_dataset = pandas.read_csv(bas_dataset)
    df_classification = pandas.read_csv(classification_file)

    is_users_dataset = 'users' in bas_dataset
    needs_rule_evaluation = (
        (rule_set == 'camsani_calzolari' and rule_number in [2, 3, 4, 9])
        or (rule_set == 'van_den_beld' and rule_number in [1, 4])
    )

    if is_users_dataset and needs_rule_evaluation:
        # Non-numerical attribute of the users dataset: correlate 0/1 flags.
        column = df_dataset[attribute]
        attr_values = []
        for attr in column.values:
            if rule_number == 4:
                # Rule 4 receives the row value together with the whole column.
                # NOTE(review): this applies to rule 4 of either rule set
                # accepted above -- confirm both implementations expect it.
                rule_input = [attr, column]
            else:
                rule_input = attr
            rule_output = rules.rules(rule_set, rule_number, rule_input)
            attr_values.append(1 if rule_output == 1 else 0)
        df_attr = pandas.DataFrame(attr_values, columns=[attribute])
        df_class = df_classification['class']

    elif is_users_dataset:
        # Numerical attribute of the users dataset: missing values become 0.
        df_attr = df_dataset[attribute].fillna(0)
        df_class = df_classification['class']

    else:
        # Attribute of the tweets dataset: one aggregated value per user.
        attr_values = []
        real_classes = []
        # One pass over the data instead of a full-frame boolean mask per
        # user.  groupby leaves out rows whose user_id is missing.
        for user_id, df_user in df_dataset.groupby('user_id', sort=False):
            user_attribute = df_user[attribute]
            if rule_number == 22:
                # Number of distinct attribute values for this user.
                number_satisfied = len(user_attribute.unique())
            elif rule_number == 3 and rule_set == 'social_bakers':
                # Occurrence count of the user's most frequent value.
                number_satisfied = user_attribute.value_counts().max()
            else:
                # Number of this user's rows for which the rule returns 1.
                number_satisfied = sum(
                    1
                    for attr in user_attribute.values
                    if rules.rules(rule_set, rule_number, attr) == 1
                )
            attr_values.append(number_satisfied)

            df_user_class = df_classification.loc[df_classification['id'] == user_id]
            # `.values` yields an array; with exactly one classification row
            # per user each array has length one, giving a single 'class'
            # column in the frame built below.
            real_classes.append(df_user_class['class'].values)

        df_attr = pandas.DataFrame(attr_values, columns=[attribute])
        df_class = pandas.DataFrame(real_classes, columns=['class'])

    df_pcc_star = pandas.concat([df_attr, df_class], axis=1)
    return df_pcc_star.corr(method='pearson')