def preprocess_before_running_model(source_file_, facts_file_, confidence_value_computation_info_dir_,
                                    dataitem_index_file_, g_):
    # load source information
    header = False
    # dictionary with the original trustworthiness
    T_actual_ = utils_dataset.load_sources_info(source_file_, header)
    T_ = utils_dataset.load_sources_info(source_file_, header)
    print(str(len(T_)) + " sources loaded")

    # load fact information
    header = True
    sources_dataItemValues_ = utils_dataset.load_facts(facts_file_, header)

    # load data item set
    D_ = list(sources_dataItemValues_.keys())

    # compute (1) all the facts that are claimed by a source and (2) all the sources that claim a specific fact
    # (1) set of facts claimed by a specific source <key = source id, value = set of facts (dataitem + value)>
    # (2) all the sources that claim a specific fact <key = dataitem + value, value = set of source ids>
    print("Fact loading")
    fact_and_source_info_ = utils_dataset.load_fact_and_source_info(sources_dataItemValues_)
    F_s_ = fact_and_source_info_[0]
    S_ = fact_and_source_info_[1]

    print("Computing sources for " + str(len(sources_dataItemValues_)) + " data items FOR COMPUTATION PURPOSE")
    if not (len(os.listdir(confidence_value_computation_info_dir_)) == len(D_)):
        # compute the files with the belief propagation information
        print("graph nodes " + str(len(g_.nodes)))
        print("LENGTH source data item values " + str(len(sources_dataItemValues_.values())))
        res = utils_taxonomy.create_value_info_computation(
            g_, sources_dataItemValues_, dataitem_index_file_, confidence_value_computation_info_dir_)
        sources_dataItemValues_.clear()
        header = True
        sources_dataItemValues_ = utils_dataset.load_facts(facts_file_, header)
        if res:
            print("Computation DONE")
    # else: the files containing the info for the belief propagation have already been computed

    # then load the relative dataitem ids for using the files
    dataitem_ids_ = utils_dataset.load_dataitem_ids(dataitem_index_file_)

    # load the information
    dataitem_values_info_ = utils_dataset.load_all_dataitem_values_confidence_infos_low_memory(
        dataitem_ids_, confidence_value_computation_info_dir_, sources_dataItemValues_)
    # S_prop is a dictionary containing, for each fact, all the sources that have to be taken into account
    # when leveraging the belief propagation framework
    S_prop_ = dataitem_values_info_[2]
    app_conf_dict_ = dataitem_values_info_[3]
    app_source_dict_ = dataitem_values_info_[4]

    # delete folder confidence_value_computation_info_dir_ and the relative index file
    # shutil.rmtree(confidence_value_computation_info_dir_)
    # os.remove(dataitem_index_file_)

    return [
        T_, T_actual_, sources_dataItemValues_, D_, F_s_, S_, S_prop_,
        app_conf_dict_, app_source_dict_
    ]
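
# A minimal usage sketch (hypothetical file names; `g` is assumed to be the taxonomy
# graph already built elsewhere). The returned list is positional and mirrors the
# return statement above:
#
#   res = preprocess_before_running_model("sources.txt", "facts.txt",
#                                         "conf_value_info/", "dataitem_index.txt", g)
#   (T, T_actual, sources_dataItemValues, D, F_s, S,
#    S_prop, app_conf_dict, app_source_dict) = res
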
def preprocess_before_running_model_only_trad(source_file_, facts_file_):
    # load source information
    header = False
    # dictionary with the original trustworthiness
    T_ = utils_dataset.load_sources_info(source_file_, header)
    print(str(len(T_)) + " sources loaded")

    # load fact information
    header = True
    sources_dataItemValues_ = utils_dataset.load_facts(facts_file_, header)

    # compute (1) all the facts that are claimed by a source and (2) all the sources that claim a specific fact
    # (1) set of facts claimed by a specific source <key = source id, value = set of facts (dataitem + value)>
    # (2) all the sources that claim a specific fact <key = dataitem + value, value = set of source ids>
    print("Fact loading")
    fact_and_source_info_ = utils_dataset.load_fact_and_source_info(sources_dataItemValues_)
    F_s_ = fact_and_source_info_[0]
    S_ = fact_and_source_info_[1]

    # clear and reload the facts before returning them
    sources_dataItemValues_.clear()
    header = True
    sources_dataItemValues_ = utils_dataset.load_facts(facts_file_, header)

    return [T_, sources_dataItemValues_, F_s_, S_]
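
# A minimal sketch of the traditional-only path (hypothetical file names). The
# returned list has four positional entries, matching the return statement above:
#
#   T, sources_dataItemValues, F_s, S = preprocess_before_running_model_only_trad(
#       "sources.txt", "facts.txt")
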
def analysis_trustworthiness_estimations(dataset_dir, dir_path, output_error_rate_dir):
    if not os.path.exists(output_error_rate_dir):
        os.makedirs(output_error_rate_dir)

    for root, dirs, files in os.walk(dir_path):
        if dirs in (["EXP", "LOW_E", "UNI"], ["EXP"], ["UNI"], ["LOW_E"]):
            continue
        if root.count('/') != 3:
            continue
        for dir_name in dirs:
            # print("dir name: " + str(dir_name))
            dir_name = dir_name.replace("dataset", "")
            n_dataset = dir_name
            if "-" in n_dataset:
                n_dataset = n_dataset.replace("UNI-", "")
                n_dataset = n_dataset.replace("EXP-", "")
                n_dataset = n_dataset.replace("LOW_E-", "")
                n_dataset = n_dataset.replace("-", "")
                n_folder_app = "-" + str(n_dataset[0:2])
                n_dataset = n_dataset[2:]
                n_folder_app = n_folder_app + n_dataset
            else:
                n_dataset = n_dataset.replace("UNI_", "")
                n_dataset = n_dataset.replace("EXP_", "")
                n_dataset = n_dataset.replace("LOW_E_", "")
                n_dataset = n_dataset.replace("_", "")
                n_folder_app = "_"
                n_folder_app = n_folder_app + n_dataset

            # facts file path
            if "UNI" in root:
                subfolder_path_results = "UNI/"
                subfolder_path = "UNI/dataset" + str(n_folder_app) + "/"
                source_file = dataset_dir + "/UNI/dataset" + str(n_folder_app) + "/Output_acc_" + str(n_dataset) + ".txt"
            if "EXP" in root:
                subfolder_path_results = "EXP/"
                subfolder_path = "EXP/dataset" + str(n_folder_app) + "/"
                source_file = dataset_dir + "/EXP/dataset" + str(n_folder_app) + "/Output_acc_" + str(n_dataset) + ".txt"
            if "LOW_E" in root:
                subfolder_path_results = "LOW_E/"
                subfolder_path = "LOW_E/dataset" + str(n_folder_app) + "/"
                source_file = dataset_dir + "/LOW_E/dataset" + str(n_folder_app) + "/Output_acc_" + str(n_dataset) + ".txt"

            output_estimations = os.path.join(output_estimations_main_dir, subfolder_path)
            output_error_rate = os.path.join(output_error_rate_dir, subfolder_path_results)
            if not os.path.exists(output_error_rate):
                os.makedirs(output_error_rate)
            output_file_path = output_error_rate + "/trust_error_rate_dataset_" + str(n_folder_app) + ".csv"

            header = False
            # dictionary with the original trustworthiness
            T_actual = utils_dataset.load_sources_info(source_file, header)

            trust_file_trad = os.path.join(output_estimations, "trad_est_trust_" + str(n_dataset) + ".csv")
            trust_trad = utils_dataset.read_trust_estimation_file(trust_file_trad)
            error_rate_trad = compute_error_rate_for_trustworthiness(T_actual, trust_trad)

            # adapted model 1 (only confidence) results files and dir
            trust_file_adapt = os.path.join(output_estimations, "adapt_est_trust_" + str(n_dataset) + ".csv")
            trust_adapt = utils_dataset.read_trust_estimation_file(trust_file_adapt)
            error_rate_adapt = compute_error_rate_for_trustworthiness(T_actual, trust_adapt)

            utils_writing_results.writing_trustworthiness_error_rate_file(
                output_file_path, error_rate_trad, error_rate_adapt)

    return True
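
# Assumed on-disk layout (illustrative, reconstructed from the path handling above;
# actual folder and dataset names may differ):
#
#   <dataset_dir>/
#       UNI/dataset_XX/Output_acc_XX.txt
#       EXP/dataset_XX/Output_acc_XX.txt
#       LOW_E/dataset_XX/Output_acc_XX.txt
#   <output_estimations_main_dir>/
#       UNI/dataset_XX/trad_est_trust_XX.csv
#       UNI/dataset_XX/adapt_est_trust_XX.csv
#
# Note: output_estimations_main_dir is expected to be defined at module level.
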
if AdaptedSums_and_Rules_flag:
    res_list = preprocessing_sums_model.preprocess_before_running_model(
        source_file, facts_file, confidence_value_computation_info_dir, dataitem_index_file, g)
    T = res_list[0]
    T_actual = res_list[1]
    sources_dataItemValues = res_list[2]
    D = res_list[3]
    F_s = res_list[4]
    S = res_list[5]
    S_prop = res_list[6]
    app_conf_dict = res_list[7]

    header = False
    # original trustworthiness file
    T_actual = utils_dataset.load_sources_info(source_file, header)
    header = False
    # load trustworthiness of sources
    T = utils_dataset.load_sources_info(source_file, header)
    print(str(len(T)) + " sources loaded")
    S_set = list(T.keys())

    # load facts
    header = True
    sources_dataItemValues = utils_dataset.load_facts(facts_file, header)
    D = list(sources_dataItemValues.keys())

    # compute (1) all the facts that are claimed by a source and (2) all the sources that claim a specific fact
    # (1) set of facts claimed by a specific source <key = source id, value = set of facts (dataitem + value)>
    # (2) all the sources that claim a specific fact <key = dataitem + value, value = set of source ids>
    print("Fact loading")