# assumes module-level: import collections
def _execute(self):
    df = dicts_to_df(self.features)
    format_feature_df(df)
    df_real = df[df["Model"] == "real-world"]
    print(collections.Counter(df["Model"]))

    small_avg_degree = (
        df_real["Centrality.Degree.Location.Arithmetic Mean"] <= 30)
    filters = {
        "all": [True] * len(df_real),
        "avg-degree-le-30": small_avg_degree,
        "avg-degree-gt-30": ~small_avg_degree,
        "socfb": df_real["Type"] == "socfb",
        "not-socfb": df_real["Type"] != "socfb"
    }
    format_str = "{:20}{:>5}"

    # all models except the real-world graphs and the "-second" duplicates
    network_models = sorted(
        set(filter(lambda model: not model.endswith("-second"),
                   set(df["Model"])))
        - {"real-world"})

    for filtername, filterdf in sorted(filters.items()):
        graphs = sorted(df_real[filterdf]["Graph"])
        print(format_str.format(filtername, len(graphs)))
        features_collection = get_all_feature_sets_self_check(df, graphs)
        # restrict to the selected graphs on the second index level
        sub_df = df.loc(axis=0)[:, graphs, :]
        accuracies = classification_experiment(
            sub_df, network_models, features_collection, self.cores)
        accuracies.to_csv(
            self._stagepath + "accuracies/" + filtername + ".csv",
            header=True, index_label="features")
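# Illustrative sketch (not part of the pipeline): how the axis-0 MultiIndex
# slice above behaves on a toy frame. The level names ("Run", "Graph",
# "Model") and the toy data are assumptions; the real index is built by
# dicts_to_df. Only the positions matter: graphs live on the second level.
import pandas

_toy_index = pandas.MultiIndex.from_product(
    [["run-0"], ["dolphins", "jazz", "karate"], ["er", "real-world"]],
    names=["Run", "Graph", "Model"])
_toy = pandas.DataFrame({"Nodes": [62, 62, 198, 198, 34, 34]},
                        index=_toy_index)
# keeps every row whose Graph (second level) is in the list, for all models
print(_toy.loc(axis=0)[:, ["jazz", "karate"], :])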
# assumes module-level: import pandas
def _execute(self):
    df = dicts_to_df(self.features)
    df.sort_index(axis=1, inplace=True)
    format_feature_df(df)

    # all models except the real-world graphs and the "-second" duplicates
    network_models = sorted(
        set(filter(lambda model: not model.endswith("-second"),
                   set(df["Model"])))
        - {"real-world"})
    diff_features = df.columns[df.dtypes == float].values
    print("Calculating difference for {} features for {} models...".format(
        len(diff_features), len(network_models)))

    # replace each model/"-second" pair of rows by their signed difference
    idx = pandas.IndexSlice
    for model in network_models:
        val_1 = df.loc[idx[:, :, model], diff_features].values
        val_2 = df.loc[idx[:, :, model + "-second"], diff_features].values
        df.loc[idx[:, :, model], diff_features] = val_1 - val_2
        df.loc[idx[:, :, model + "-second"], diff_features] = val_2 - val_1
        print("Done with model {}".format(model))

    for a_dict in df.to_dict("records"):
        self._save_as_csv(a_dict)
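# Illustrative sketch (toy data, not part of the pipeline) of the paired
# difference above: each model row and its "-second" twin are replaced by
# their signed feature difference, so paired rows become exact negatives.
# The single "Density" feature and the index values are assumptions.
import pandas

_idx = pandas.IndexSlice
_toy = pandas.DataFrame(
    {"Density": [0.14, 0.10]},
    index=pandas.MultiIndex.from_product(
        [["run-0"], ["karate"], ["er", "er-second"]],
        names=["Run", "Graph", "Model"]))
_v1 = _toy.loc[_idx[:, :, "er"], ["Density"]].values
_v2 = _toy.loc[_idx[:, :, "er-second"], ["Density"]].values
_toy.loc[_idx[:, :, "er"], ["Density"]] = _v1 - _v2          # +0.04
_toy.loc[_idx[:, :, "er-second"], ["Density"]] = _v2 - _v1   # -0.04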
# assumes module-level: import numpy; from functools import reduce
def _execute(self):
    df = dicts_to_df(self.features)
    df.sort_index(axis=1, inplace=True)

    # fill information
    df.columns.name = "Feature"
    df.set_index("Graph", inplace=True)
    df['Info'] = df['Info'].fillna("no info")
    object_cols = df.columns[df.dtypes == object]
    df[object_cols] = df[object_cols].astype(str)

    # clean features
    valid_cols = [
        col for col in df.columns
        if not any(i in col for i in [
            "Binning", "Interquartile Range", "Sample Range",
            "Bessel's Correction"
        ])
    ]
    df_features_cleaned = df[valid_cols]
    features = df_features_cleaned.columns
    print(features.name + ":", len(features),
          "( unfiltered:", len(df.columns), ")")

    # clean missing models: keep only graphs for which every model has a row
    df_real = df_features_cleaned[
        df_features_cleaned["Model"] == "real-world"]
    real_graphs = set(df_real.index)
    complete_graphs = real_graphs.copy()
    for model in set(df_features_cleaned["Model"]) - {"real-world"}:
        graphs_for_model = set(
            df_features_cleaned[df_features_cleaned["Model"] == model].index)
        if graphs_for_model != real_graphs:
            print("missing graphs for", model, "model:",
                  real_graphs - graphs_for_model)
        complete_graphs &= graphs_for_model
    df_real = df_real.loc[complete_graphs]
    df_cleaned = df_features_cleaned.loc[complete_graphs]
    print(df_cleaned.index.name + ":", len(df_cleaned.index),
          "( unfiltered:", len(df_features_cleaned.index), ")")

    # clean with filter rules
    df_real = df_cleaned[df_cleaned["Model"] == "real-world"]
    filters = {
        "CC = 0": df_real[
            "Centrality.ClusteringCoefficient.Location.Arithmetic Mean"] == 0,
        "edges < 500": df_real["Edges"] < 500,
        "nodes < 100": df_real["Nodes"] < 100
    }
    all_filters = reduce(lambda x, y: x | y, filters.values())
    format_str = "{:15}{:>5}"
    sep = "-" * 20
    print()
    print("Filter graphs with rules:")
    print(format_str.format("total", len(df_real)))
    print(sep)
    for filtername, filterdf in sorted(filters.items()):
        print(format_str.format(filtername, filterdf.sum()),
              list(df_real.index[filterdf]))
    print(sep)
    print(format_str.format("to filter", all_filters.sum()),
          list(df_real.index[all_filters]))
    print(sep)
    df_rule_filtered = df_cleaned.loc[df_real[~all_filters].index].copy()
    print(format_str.format("total filtered", len(df_real[~all_filters])))
    # each surviving graph must appear exactly once per model
    assert (len(df_real[~all_filters]) ==
            len(df_rule_filtered) / len(set(df_rule_filtered["Model"])))

    # originally weighted
    print()
    print("filter originally weighted graphs",
          df_rule_filtered[
              df_rule_filtered["Originally Weighted"]].index.tolist())
    df_rule_filtered.drop("Originally Weighted", axis=1,
                          inplace=True)  # remove feature

    # should only be one giant single component
    single_connected_component = df_rule_filtered[
        "Partition.ConnectedComponents.Properties.Size"] == 1
    assert numpy.all(single_connected_component)

    # clean notfinite features
    def notfinite(a_df):
        return (a_df.isnull()
                | (a_df == float("inf"))
                | (a_df == -float("inf")))

    valid_cols = set(df_rule_filtered.columns)
    nans = numpy.where(notfinite(df_rule_filtered))
    nan_cols = set(df_rule_filtered.columns[nans[1]])
    print("filtering cols (notfinite):")
    for col in sorted(nan_cols):
        print(" " + col)
    df_finite_filtered = df_rule_filtered[list(valid_cols - nan_cols)]
    assert not numpy.any(notfinite(df_finite_filtered))
    valid_cols = set(df_finite_filtered.columns)

    # normalized coefficient of variation:
    variation = (df_finite_filtered.std() / df_finite_filtered.mean()
                 / (len(df_finite_filtered) - 1) ** 0.5)
    low_variation_cols = set(
        variation[(variation < 0.01) | variation.isnull()].index)
    print("filtering cols (low variation):")
    for col in sorted(low_variation_cols):
        print(" " + col)
    df_final = df_finite_filtered[
        list(valid_cols - low_variation_cols)].copy()
    df_final["Graph"] = df_final.index
    for a_dict in df_final.to_dict("records"):
        self._save_as_csv(a_dict)
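# Illustrative sketch (toy columns, not part of the pipeline) of the
# low-variation filter above: the coefficient of variation std/mean is
# scaled by 1/sqrt(n - 1); near-constant columns, and columns where the
# ratio is NaN (e.g. zero mean and zero std), are dropped.
import pandas

_toy = pandas.DataFrame({"constant": [5.0, 5.0, 5.0, 5.0],
                         "varying": [1.0, 2.0, 3.0, 4.0]})
_variation = _toy.std() / _toy.mean() / (len(_toy) - 1) ** 0.5
# "constant" has variation 0 and is flagged; "varying" (~0.30) is kept
print(list(_variation[(_variation < 0.01) | _variation.isnull()].index))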