def get_stats_pair(self, first_attr, second_attr):
    """
    Co-occurrence counts for a pair of attributes.

    Returns a dictionary {first_val -> {second_val -> count } } where:
        <first_val>: all possible values for first_attr
        <second_val>: all values for second_attr that appear at least
            once with <first_val>
        <count>: frequency (# of entities) where
            first_attr=<first_val> AND second_attr=<second_val>

    Rows where either attribute equals NULL_REPR are dropped before
    counting, so no entry in the result involves a NULL.
    """
    raw_df = self.get_raw_data()
    pair_cols = [first_attr, second_attr]

    # Project onto the two attributes first, then drop NULL rows.
    projected = raw_df[pair_cols]
    first_present = raw_df[first_attr] != NULL_REPR
    second_present = raw_df[second_attr] != NULL_REPR
    non_null = projected.loc[first_present & second_present]

    # One row per distinct (first_val, second_val) pair with its frequency.
    pair_counts = non_null.groupby(pair_cols).size()
    counts_df = pair_counts.reset_index(name="count")
    return dictify_df(counts_df)
def get_repaired_dataset(self):
    """
    Build the repaired dataset and persist it.

    Starts from the raw data sorted by '_tid_', overwrites every
    (tid, attr) cell listed in the AuxTables.inf_values_dom auxiliary
    table with its value from that table, wraps the result in a new
    Table named '<raw name>_repaired', assigns it to self.repaired_data
    and calls store_to_db on it with self.engine.engine.

    :return: tuple (status, total_time) where status is a fixed
        completion message and total_time is the elapsed time in seconds.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    tic = time.perf_counter()

    # NOTE(review): records are addressed by position below, so this
    # presumably relies on sorted _tid_ values being 0..n-1 -- confirm.
    init_records = self.raw_data.df.sort_values(['_tid_']).to_records(index=False)

    t = self.aux_table[AuxTables.inf_values_dom]
    # Indexed below as repaired_vals[tid][attr] -> value.
    repaired_vals = dictify_df(t.df.reset_index())
    for tid, attr_vals in repaired_vals.items():
        for attr, val in attr_vals.items():
            init_records[tid][attr] = val

    repaired_df = pd.DataFrame.from_records(init_records)
    name = self.raw_data.name + '_repaired'
    self.repaired_data = Table(name, Source.DF, df=repaired_df)
    self.repaired_data.store_to_db(self.engine.engine)

    status = "DONE generating repaired dataset"
    total_time = time.perf_counter() - tic
    return status, total_time