Exemplo n.º 1
0
    def __init__(self, dataset, specs):
        """Load the specs and codebook files and derive the attribute domain.

        Args:
            dataset: Ground-truth CSV source, passed straight to
                ``pd.read_csv``.
            specs: Path to a JSON file; each entry is read as
                ``specs[col]['maxval']``.

        Besides ``specs``, this reads ``domain.json`` from the current
        working directory.  On return ``self.domain_info`` maps columns to
        their lists of possible values and ``self.domain`` holds the result
        of ``Domain.fromdict`` applied to the per-column sizes.
        """
        self.dataset = dataset
        # Use context managers so both file handles are closed promptly
        # instead of being left for the garbage collector.
        with open(specs, 'r') as specs_file:
            self.specs = json.load(specs_file)
        with open('domain.json') as codebook_file:
            domain_info = json.load(codebook_file)

        # check consistency for codebook information: drop codebook entries
        # whose last listed value is below the maxval declared in the specs
        for col in list(domain_info):  # list() copy: entries are deleted below
            if domain_info[col][-1] < self.specs[col]['maxval']:
                print('Codebook inconsistent for', col)
                del domain_info[col]

        ## look at ground truth data to obtain possible values for state-dependent columns
        df = pd.read_csv(dataset)
        for col in ['SEA', 'METAREA', 'COUNTY', 'CITY', 'METAREAD']:
            domain_info[col] = sorted(df[col].unique())
        ## done using ground truth data

        # Size of each column: number of listed values when the codebook (or
        # the data) provides them, otherwise maxval + 1 from the specs.
        domain = {
            col: (len(domain_info[col]) if col in domain_info
                  else self.specs[col]['maxval'] + 1)
            for col in self.specs
        }

        # INCWAGE is replaced by two columns with fixed sizes.
        domain['INCWAGE_A'] = 52
        domain['INCWAGE_B'] = 8
        del domain['INCWAGE']
        #domain['INCWAGE'] = 5002
        domain['VALUEH'] = 5003

        self.domain_info = domain_info
        self.domain = Domain.fromdict(domain)
Exemplo n.º 2
0
def reverse_data(data, supports):
    """Map compressed column codes back to indices in the full support.

    For every column in ``data.domain``, ``supports[col]`` is used as a
    boolean mask over the original value indices.  A cell whose value
    equals the number of supported entries is replaced by a random draw
    (``np.random.choice``) from the unsupported indices, if there are any.
    Every other cell ``v`` is replaced by the index of the ``v``-th
    supported entry.

    NOTE(review): this looks like the inverse of ``transform_data`` --
    confirm against the caller.

    Returns:
        A new ``Dataset`` over a copy of ``data.df`` whose domain gives
        each column the full size of its support mask.
    """
    frame = data.df.copy()
    sizes = {}
    for col in data.domain:
        support = supports[col]
        overflow_code = support.sum()
        sizes[col] = int(support.size)

        kept = np.where(support)[0]
        dropped = np.where(~support)[0]

        is_overflow = frame[col] == overflow_code
        regular = ~is_overflow

        # Overflow cells are only rewritten when there is something to
        # draw from; otherwise they are left untouched.
        if dropped.size != 0:
            frame.loc[is_overflow, col] = np.random.choice(
                dropped, is_overflow.sum())
        frame.loc[regular, col] = kept[frame.loc[regular, col]]

    return Dataset(frame, Domain.fromdict(sizes))
Exemplo n.º 3
0
    def postprocess(self):
        """Fit the inference engine to the measurements and sample data.

        Builds a ``FactoredInference`` engine over ``self.domain``, stores
        it on ``self.engine``, runs ``estimate`` on ``self.measurements``,
        and stores the model's synthetic data on ``self.synthetic`` after
        passing it through ``reverse_data`` with ``self.supports``.
        """
        # NOTE(review): self.iters is read here but never used; the engine
        # below is created with a fixed iters=10000 -- confirm intent.
        iters = self.iters
        inference = FactoredInference(
            Domain.fromdict(self.domain),
            structural_zeros=None,
            iters=10000,
            log=True,
            warm_start=False,
            elim_order=self.elimination_order,
        )
        self.engine = inference
        inference.estimate(self.measurements)

        # Sample from the fitted model, then pass the result through
        # reverse_data together with the stored supports.
        self.synthetic = inference.model.synthetic_data()
        self.synthetic = reverse_data(self.synthetic, self.supports)
Exemplo n.º 4
0
def transform_data(data, supports):
    """Compress each column onto the supported entries of its mask.

    For every column in ``data.domain``, the positions where
    ``supports[col]`` is truthy are renumbered ``0, 1, 2, ...`` in order.
    All remaining positions share one extra code equal to the number of
    supported positions.  The column in a copy of ``data.df`` is then
    remapped with ``Series.map``.

    Returns:
        A new ``Dataset`` over the remapped copy.  Each column's domain
        size is the number of supported positions, plus one when at least
        one position is unsupported.
    """
    frame = data.df.copy()
    sizes = {}
    for col in data.domain:
        support = supports[col]
        n_kept = support.sum()
        has_overflow = n_kept < support.size
        sizes[col] = int(n_kept) + 1 if has_overflow else int(n_kept)

        # Start with every position pointing at the shared overflow code,
        # then give supported positions consecutive codes.
        mapping = dict.fromkeys(range(support.size), n_kept)
        next_code = 0
        for position in range(support.size):
            if support[position]:
                mapping[position] = next_code
                next_code += 1
        assert next_code == n_kept

        frame[col] = frame[col].map(mapping)

    return Dataset(frame, Domain.fromdict(sizes))