Пример #1
0
 def get_stats_pair(self, first_attr, second_attr):
     """
     Count how often each pair of values of two attributes occurs together.

     Rows in which either attribute equals NULL_REPR are discarded before
     counting, so NULLs never appear as keys in the result.

     :param first_attr: name of the column whose values form the outer keys.
     :param second_attr: name of the column whose values form the inner keys.
     :return: nested dictionary { first_val -> { second_val -> count } },
         where <count> is the number of rows with first_attr=<first_val>
         AND second_attr=<second_val>; only pairs seen at least once appear.
     """
     raw = self.get_raw_data()
     pair_cols = [first_attr, second_attr]

     # Keep only the rows where both attributes carry a real (non-NULL) value.
     both_present = (raw[first_attr] != NULL_REPR) & (raw[second_attr] != NULL_REPR)
     non_null_pairs = raw[pair_cols].loc[both_present]

     # One row per distinct (first_val, second_val) pair, with its frequency.
     pair_counts = non_null_pairs.groupby(pair_cols).size().reset_index(name="count")
     return dictify_df(pair_counts)
Пример #2
0
 def get_repaired_dataset(self):
     """
     Build the repaired dataset by overlaying inferred values on the raw data.

     The raw data frame is sorted by '_tid_' and converted to records; every
     (tid, attribute) entry found in the AuxTables.inf_values_dom auxiliary
     table then overwrites the corresponding cell. The result is wrapped in
     a Table named '<raw name>_repaired', kept on self.repaired_data and
     stored to the database through self.engine.engine.

     :return: tuple (status, total_time) where status is a human-readable
         completion message and total_time is the elapsed time in seconds.
     """
     # time.clock() was removed in Python 3.8; perf_counter() is the
     # documented replacement for measuring elapsed intervals.
     tic = time.perf_counter()
     init_records = self.raw_data.df.sort_values(['_tid_']).to_records(index=False)
     inferred = self.aux_table[AuxTables.inf_values_dom]
     repaired_vals = dictify_df(inferred.df.reset_index())
     # NOTE(review): records are addressed by position, so this assumes the
     # '_tid_' values are exactly 0..n-1 after sorting -- confirm with caller.
     for tid, attr_vals in repaired_vals.items():
         for attr, value in attr_vals.items():
             init_records[tid][attr] = value
     repaired_df = pd.DataFrame.from_records(init_records)
     name = self.raw_data.name + '_repaired'
     self.repaired_data = Table(name, Source.DF, df=repaired_df)
     self.repaired_data.store_to_db(self.engine.engine)
     status = "DONE generating repaired dataset"
     toc = time.perf_counter()
     total_time = toc - tic
     return status, total_time