def test_encode_empty_column():
    from numpy import array_equal
    data = [[1001, 'A', 'Female'],
            [1002, 'B', 'Male'],
            [1003, 'C', 'Male'],
            [1004, 'D', 'Female'],
            [1005, 'E', 'Female']]
    ds = DataSet(data, columns=['ID', 'Name', 'Sex'])
    x = DataFrame(data[-2:], columns=['ID', 'Name', 'Sex'])
    x_tf = ds.encode(data=x)
    # Name is not categorical, because it has unique values
    assert x_tf.shape == (2, 3)
    assert array_equal(x_tf.columns, ['ID', 'Sex_Female', 'Sex_Male'])
def test_encode_partly():
    from .testdata import adults01
    from sklearn.model_selection import train_test_split
    dataset = DataSet(adults01)
    train, test = train_test_split(adults01, test_size=0.2)
    frame = dataset.encode(data=train)
    assert 'salary_<=50K' in frame.columns
    assert 'salary_>50K' in frame.columns
    assert ((0 == frame['salary_<=50K']) | (frame['salary_<=50K'] == 1)).all()
    assert ((0.0 <= frame['age']) & (frame['age'] <= 1.0)).all()
def test_encode():
    from .testdata import adults01
    from numpy import array_equal
    dataset = DataSet(adults01)
    frame = dataset.encode()
    for col in ['education', 'relationship', 'salary']:
        assert col not in frame.columns
    for col in ['age', 'birth']:
        assert col in frame.columns
    assert 'salary_<=50K' in frame.columns
    assert 'salary_>50K' in frame.columns
    for attr, val in [('salary', '<=50K'),
                      ('relationship', 'Wife'),
                      ('relationship', 'Husband')]:
        trans_col = frame[f'{attr}_{val}'].apply(lambda v: v == 1)
        origin_col = adults01[attr] == val
        assert array_equal(trans_col, origin_col)
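# Note (inferred from the assertions in the encode tests above, not from
# DataSet's implementation): encode() appears to expand each categorical
# column into '<column>_<value>' indicator columns, min-max scale numerical
# columns such as 'age' into [0.0, 1.0], and leave string columns whose
# values are all unique (e.g. 'Name' in test_encode_empty_column) without
# one-hot expansion.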
def test_split_feature_class():
    frame = DataSet(
        adults01[['age', 'relationship', 'salary']].head(10)).encode()
    features1, class1 = split_feature_class('birth', frame)
    assert features1.equals(frame)
    assert class1 is None
    features2, class2 = split_feature_class('age', frame)
    assert features2.equals(frame)
    assert class2 is None
    features3, class3 = split_feature_class('salary', frame)
    assert len(features3.columns) == 4
    assert class3.name == 'salary_>50K'
    features4, class4 = split_feature_class('relationship', frame)
    assert len(features4.columns) == 3
    assert class4.min() == 0
    assert class4.max() == 2
def test_svm_task():
    from sklearn.svm import SVC
    from sklearn.model_selection import train_test_split
    from .testdata import adults01
    c_df = DataFrame(adults01)
    c_tf = DataSet(c_df).encode()
    train, test = train_test_split(c_tf, test_size=0.2)

    def make_train_x_y(df):
        x_ = df.drop(['salary_<=50K', 'salary_>50K'], axis=1)
        # <=50K and >50K are binary, complementary
        _, ym_ = df['salary_<=50K'], df['salary_>50K']
        return x_, ym_

    tr_x, tr_y = make_train_x_y(train)
    te_x, te_y = make_train_x_y(test)
    clf = SVC(gamma='scale')
    clf.fit(tr_x, tr_y)
    pr_y = clf.predict(te_x)

    from sklearn.metrics import confusion_matrix, classification_report
    print(confusion_matrix(te_y, pr_y))
    print(classification_report(te_y, pr_y))
def test_synthesize_for_privacy():
    # Verify the probability bound after synthesis by differential privacy.
    # (This test case may occasionally fail because of the limited number of
    # runs.)
    from numpy.random import randint
    from numpy import exp
    epsilon = 0.1
    runs = 200
    data = randint(65, 90, size=(199, 2))
    set1 = DataSet(data.tolist() + [[65, 65]], columns=['ColA', 'ColB'])
    set2 = DataSet(data.tolist() + [[65, 66]], columns=['ColA', 'ColB'])
    counts = [0, 0]
    for _ in range(runs):
        df1 = set1.synthesize(epsilon=epsilon)
        df2 = set2.synthesize(epsilon=epsilon)
        counts[0] += ((df1['ColA'] == 65) & (df1['ColB'] == 65)).sum()
        counts[1] += ((df2['ColA'] == 65) & (df2['ColB'] == 66)).sum()
    assert counts[0] / (runs * 200) <= exp(epsilon) * counts[1] / (runs * 200)
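# Background for the assertion above (a standard differential privacy fact,
# not taken from this code base): an epsilon-differentially private mechanism
# M satisfies, for neighbouring data sets D and D' (differing in one record)
# and any output o,
#     Pr[M(D) = o] <= exp(epsilon) * Pr[M(D') = o]
# The test estimates both probabilities empirically: D and D' share 199
# random rows and differ only in the last row ([65, 65] vs [65, 66]), and the
# counts are averaged over `runs` synthesized data sets of 200 rows each,
# which is why a small number of runs can make the assertion flaky.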
def test_synthesize_with_retains():
    dataset = DataSet(adults01)
    df = dataset.synthesize(retains=['age'])
    assert df.size == dataset.size
    assert array_equal(dataset['age'], df['age'])
def test_synthesize_with_pseudonyms():
    dataset = DataSet(adults01)
    df = dataset.synthesize(pseudonyms=['salary'])
    assert df.size == dataset.size
    assert array_equal(dataset['salary'].value_counts().values,
                       df['salary'].value_counts().values)
def test_synthesize():
    dataset = DataSet(adults01)
    df = dataset.synthesize()
    assert df.size == dataset.size
class BiFrame(object):
    def __init__(self, first: pd.DataFrame, second: pd.DataFrame,
                 categories=None):
        """
        A BiFrame contains two data sets and provides analysis methods
        covering distribution, correlation, and several machine learning
        tasks. In particular, when the inputs are an original data set and a
        synthesized one, it can be used to evaluate the utility and privacy
        of the synthesized data.

        Parameters
        ----------
        first : pandas.DataFrame
            first data set (e.g. the original data set)
        second : pandas.DataFrame
            second data set (e.g. the synthesized data set)
        categories : list of str
            column names whose values are categorical
        """
        # cached distribution data
        self._dt = {}

        # To compare two data sets, make sure that they have the same columns.
        # If not, compare them on their common columns.
        common = set(first.columns) & set(second.columns)
        if len(common) != len(first.columns) or len(common) != len(
                second.columns):
            logger.info(f"BiFrame constructed on attributes: {common}.")

        # left and right data set (ds)
        self.first = DataSet(first[common], categories=categories)
        self.second = DataSet(second[common], categories=categories)
        self._columns = self.first.columns.sort_values().to_list()

        # Make sure that the two data sets have the same domain for
        # categorical attributes, and the same min/max values for numerical
        # attributes.
        for col in self._columns:
            # If the current column is not categorical in both data sets,
            # ignore it.
            if not self.first[col].categorical or not self.second[
                    col].categorical:
                continue
            d1, d2 = self.first[col].domain, self.second[col].domain
            if not np.array_equal(d1, d2):
                if self.first[col].categorical:
                    domain = np.unique(np.concatenate((d1, d2)))
                else:
                    domain = [min(d1[0], d2[0]), max(d1[1], d2[1])]
                self.first[col].domain = domain
                self.second[col].domain = domain

    @property
    def columns(self):
        return self._columns

    def err(self):
        """
        Return the pairwise relative error of the columns' distributions.
        """
        # merge the two frequency counts, and calculate the relative difference
        df = pd.DataFrame(columns=self._columns, index=['err'])
        df = df.fillna(0)
        for col in self._columns:
            df.at['err', col] = relative_error(self.first[col].counts(),
                                               self.second[col].counts())
        return df

    def jsd(self):
        """
        Return the pairwise JSD (Jensen-Shannon divergence) of the columns'
        distributions.
        """
        df = pd.DataFrame(columns=self._columns, index=['jsd'])
        df = df.fillna(0)
        for col in self._columns:
            df.at['jsd', col] = jensen_shannon_divergence(
                self.first[col].counts(), self.second[col].counts())
        return df

    def corr(self):
        """
        Return pairwise correlation and dependence measured by mutual
        information.
        """
        return self.first.mi(), self.second.mi()

    def dist(self, column):
        """
        Return the frequency distribution of one column.

        Parameters
        ----------
        column : str
            name of the column whose distribution will be returned
        """
        if len(self._dt) == 0:
            for c in self._columns:
                self._dt[c] = {}
                if self.first[c].categorical:
                    bins = self.first[c].domain
                    counts1 = self.first[c].counts(bins=bins)
                    counts2 = self.second[c].counts(bins=bins)
                else:
                    min_, max_ = self.first[c].domain
                    # the domains of the two data sets are the same;
                    # extend the domain to a human-readable range
                    bins = normalize_range(min_, max_ + 1)
                    counts1 = self.first[c].counts(bins=bins)
                    counts2 = self.second[c].counts(bins=bins)
                    # Note: index and value of np.histogram have different
                    # lengths
                    bins = bins[:-1]
                self._dt[c]['bins'] = bins
                # stack the count arrays vertically
                self._dt[c]['counts'] = np.vstack((counts1, counts2))
        return self._dt[column]['bins'], self._dt[column]['counts']

    def describe(self):
        """
        Give the descriptive difference between the two data sets, including
        relative error and JSD divergence.

        Return a pandas.DataFrame whose columns are the data sets' columns
        and whose index is the list of metrics, e.g. ['err', 'jsd'].
        """
        df1 = self.err()
        df2 = self.jsd()
        return pd.concat([df1, df2])

    def classify(self, label: str, test: pd.DataFrame = None):
        """
        Train two SVM classifiers on the two data sets and predict class
        labels for the test data. Return the resulting confusion matrices.

        Parameters
        ----------
        label : str
            classification target; must be one column of the first data
            frame. Both two-class and multi-class labels are supported.
        test : pandas.DataFrame
            test data for the classifiers. If it is not provided, 20% of the
            first data frame is split off as test data.

        Returns
        -------
        One or two confusion matrices as pandas.DataFrame. If the test data
        contains the class label, two matrices are returned (predictions of
        each data set against the actual labels); otherwise a single matrix
        comparing the two predictions is returned, e.g.

                        predicted                        second
                       male  female                    male  female
        actual  male      1       3    or  first male     1       2
                female    2       4            female     3       4
        """
        if (not self.first[label].categorical or
                not self.second[label].categorical):
            raise ValueError(f'Classifier can not run on non-categorical '
                             f'column: {label}')
        from sklearn.metrics import confusion_matrix

        def split_feature_label(df: pd.DataFrame):
            # TODO need to improve sub_cols
            sub_cols = [attr for attr in df.columns if attr.startswith(label)]
            if len(sub_cols) == 0:
                return df, None
            is_one_class = len(sub_cols) == 2
            if is_one_class:
                # For one class, there are two sorted values.
                # e.g. ['Yes', 'No'] => [[0, 1],
                #                        [1, 0]]
                # Choose the second column to represent this attribute.
                label_ = sub_cols[1]
                return df.drop(sub_cols, axis=1), df[label_]
            else:
                try:
                    # merge multiple columns into one column:
                    # [Name_A, Name_B, ..] => Name
                    _y = df[sub_cols].apply(lambda x: Index(x).get_loc(1),
                                            axis=1)
                    return df.drop(sub_cols, axis=1), _y
                except KeyError as e:
                    print(e)
                    print(sub_cols)
                    print(df[sub_cols])

        # If a test data set is not provided, split 20% of the original data
        # set for testing.
        if test is None:
            fst_train, test = train_test_split(self.first, test_size=0.2)
            snd_train, _ = train_test_split(self.second, test_size=0.2)
        else:
            fst_train = self.first
            snd_train = self.second

        fst_train_x, fst_train_y = split_feature_label(
            self.first.encode(data=fst_train))
        test_x, test_y = split_feature_label(self.first.encode(data=test))
        snd_train_x, snd_train_y = split_feature_label(
            self.first.encode(data=snd_train))

        # construct the SVM classifiers, and predict on the same test data set
        fst_predict_y = train_and_predict(fst_train_x, fst_train_y, test_x)
        snd_predict_y = train_and_predict(snd_train_x, snd_train_y, test_x)

        columns = self.first[label].bins
        labels = range(len(columns))
        # If the test data set has the class label for prediction, return two
        # confusion matrices: one for the original data set (self.first) and
        # one for the anonymized data set (self.second).
        if label in test:
            fst_matrix = confusion_matrix(test_y, fst_predict_y, labels=labels)
            snd_matrix = confusion_matrix(test_y, snd_predict_y, labels=labels)
            # normalize the confusion matrix
            # fst_matrix = fst_matrix.astype('float') / fst_matrix.sum(axis=1)
            # snd_matrix = snd_matrix.astype('float') / snd_matrix.sum(axis=1)
            return (pd.DataFrame(fst_matrix, columns=columns, index=columns),
                    pd.DataFrame(snd_matrix, columns=columns, index=columns))
        # If the test data set does not have the class label for prediction,
        # return a matrix comparing their predicted values.
        else:
            matrix = confusion_matrix(fst_predict_y, snd_predict_y,
                                      labels=labels)
            return pd.DataFrame(matrix, columns=columns, index=columns)

    def to_html(self, buf=None, title='Evaluation Report', info=True,
                distribute=True, correlate=True, classifier=None,
                labels=None, test=None):
        """
        Render the evaluation result of the two data sets as an HTML file.

        Parameters
        ----------
        buf : optional
            buffer to write to
        title : str
            title of the evaluation report
        info : bool, default True
            show basic information of the two data sets, including relative
            error and Jensen-Shannon divergence (JSD)
        distribute : bool, default True
            show the distribution of each attribute
        correlate : bool, default True
            show the correlation of pairwise attributes
        classifier : str
            classifier used to train the data sets on one or more columns
            (defined by parameter 'labels') and show the prediction result in
            the evaluation report. Optional classifier: SVM.
        labels : list of str
            column name, or a list of column names, used for the
            classification task
        test : pandas.DataFrame
            test data for classification and other machine learning tasks
        """
        from ds4ml.utils import (plot_histogram, plot_heatmap,
                                 plot_confusion_matrix)
        from mako.template import Template
        import os
        old_cwd = os.getcwd()
        os.chdir(os.path.dirname(__file__))
        template = Template(filename='template/report.html')
        os.chdir(old_cwd)

        topics = []
        content = {}
        # format different kinds of evaluation results to a unified style
        if info:
            topics.append('basic')
            content['basic'] = [self.describe().to_dict('split')]
        if distribute:
            topics.append('dist')
            content['dist'] = []
            for col in self.columns:
                bins, counts = self.dist(col)
                svg = plot_histogram(bins, counts)
                content['dist'].append({'name': col, 'columns': bins,
                                        'data': counts, 'path': svg})
        if correlate:
            topics.append('corr')
            content['corr'] = []
            source_mi, target_mi = self.corr()
            source_svg = plot_heatmap(source_mi)
            target_svg = plot_heatmap(target_mi)
            content['corr'].append({'matrix': source_mi.to_dict('split'),
                                    'path': source_svg})
            content['corr'].append({'matrix': target_mi.to_dict('split'),
                                    'path': target_svg})
        if labels is not None:
            topics.append('svm')
            content['svm'] = []
            for col in labels:
                in_test = (test is not None and col in test) or (test is None)
                if in_test:
                    # When the class label is in the test data, match the two
                    # predicted results against the actual data, so there will
                    # be two confusion matrix diagrams.
                    try:
                        source_cm, target_cm = self.classify(col, test=test)
                        vrange = (min(source_cm.values.min(),
                                      target_cm.values.min()),
                                  max(source_cm.values.max(),
                                      target_cm.values.max()))
                        path = (plot_confusion_matrix(source_cm, vrange=vrange,
                                                      xlabel='raw',
                                                      ylabel='actual'),
                                plot_confusion_matrix(target_cm, vrange=vrange,
                                                      xlabel='synth',
                                                      ylabel='actual'))
                        content['svm'].append({'column': col, 'path': path})
                    except ValueError as e:
                        print(e)
                else:
                    # Otherwise, compare the two predicted results.
                    try:
                        cm = self.classify(col, test=test)
                        # make path's type a 1-tuple
                        path = (plot_confusion_matrix(cm, xlabel='synth',
                                                      ylabel='raw'),)
                        content['svm'].append({'column': col, 'path': path})
                    except ValueError as e:
                        print(e)

        svms = content['svm'] if 'svm' in content else []
        if buf:
            with open(buf, 'w+', encoding='utf-8') as file:
                file.write(template.render(title=title,
                                           basics=content['basic'],
                                           dists=content['dist'],
                                           corrs=content['corr'],
                                           svms=svms))
class BiFrame:
    def __init__(self, first: pd.DataFrame, second: pd.DataFrame,
                 categories=None):
        """
        A BiFrame contains two data sets and provides analysis methods
        covering distribution, correlation, and several machine learning
        tasks. In particular, when the inputs are an original data set and a
        synthesized one, it can be used to evaluate the utility and privacy
        of the synthesized data.

        Parameters
        ----------
        first : pandas.DataFrame
            first data set (e.g. the original data set)
        second : pandas.DataFrame
            second data set (e.g. the synthesized data set)
        categories : list of str
            column names whose values are categorical
        """
        # To compare two data sets, make sure that they have the same columns.
        # If not, compare them on their common columns.
        cols = set(first.columns) & set(second.columns)
        if len(cols) != len(first.columns) or len(cols) != len(second.columns):
            warnings.warn("Evaluating only on the common columns of the "
                          "datasets", stacklevel=2)

        categories = [] if categories is None else categories
        self.fst = DataSet(first[cols], categories=categories)
        self.snd = DataSet(second[cols], categories=categories)

        # Make sure that the two data sets have the same domain for
        # categorical attributes, and the same min/max values for numerical
        # attributes.
        for col in cols.copy():
            # If the current column is not categorical in both data sets,
            # ignore it.
            if not self.fst[col].categorical or not self.snd[col].categorical:
                continue
            fst_domain, snd_domain = self.fst[col].domain, self.snd[col].domain
            if not np.array_equal(fst_domain, snd_domain):
                # If there is no intersection of the two domains, there may be
                # no relationship between the columns; drop them.
                if len(np.intersect1d(fst_domain, snd_domain)) == 0:
                    self.fst = self.fst.drop(col, axis=1)
                    self.snd = self.snd.drop(col, axis=1)
                    cols.remove(col)
                    continue
                if self.fst[col].categorical:
                    domain = np.unique(np.concatenate((fst_domain,
                                                       snd_domain)))
                else:
                    domain = [min(fst_domain[0], snd_domain[0]),
                              max(fst_domain[1], snd_domain[1])]
                self.fst[col].domain = domain
                self.snd[col].domain = domain

        self._columns = sorted(cols)

    @property
    def columns(self):
        """
        Return the common columns of the two data sets.
        """
        return self._columns

    def err(self):
        """
        Return the pairwise relative error of the columns' distributions.
        """
        # merge the two frequency counts, and calculate the relative difference
        frame = pd.DataFrame(columns=self.columns, index=['err'])
        frame = frame.fillna(0)
        for col in self.columns:
            frame.at['err', col] = relative_error(self.fst[col].counts(),
                                                  self.snd[col].counts())
        return frame

    def jsd(self):
        """
        Return the pairwise JSD (Jensen-Shannon divergence) of the columns'
        distributions.
        """
        frame = pd.DataFrame(columns=self.columns, index=['jsd'])
        frame = frame.fillna(0)
        for col in self.columns:
            frame.at['jsd', col] = jensen_shannon_divergence(
                self.fst[col].counts(), self.snd[col].counts())
        return frame

    def corr(self):
        """
        Return pairwise correlation and dependence measured by mutual
        information.
        """
        return self.fst.mi(), self.snd.mi()

    def dist(self, column):
        """
        Return the frequency distribution of one column.

        Parameters
        ----------
        column : str
            name of the column whose distribution will be returned
        """
        if column not in self.columns:
            raise ValueError(f"{column} is not in the current data set.")
        if self.fst[column].categorical:
            bins = self.fst[column].domain
            fst_counts = self.fst[column].counts(bins=bins)
            snd_counts = self.snd[column].counts(bins=bins)
        else:
            min_, max_ = self.fst[column].domain
            # the domains of the two data sets are the same;
            # extend the domain to a human-readable range
            bins = normalize_range(min_, max_ + 1)
            fst_counts = self.fst[column].counts(bins=bins)
            snd_counts = self.snd[column].counts(bins=bins)
            # Note: index and value of np.histogram have different lengths
            bins = bins[:-1]
        # stack the count arrays vertically
        return bins, np.vstack((fst_counts, snd_counts))

    def describe(self):
        """
        Give the descriptive difference between the two data sets, including
        relative error and JSD divergence.

        Return a pandas.DataFrame whose columns are the data sets' columns
        and whose index is the list of metrics, e.g. ['err', 'jsd'].
        """
        err_frame = self.err()
        jsd_frame = self.jsd()
        return pd.concat([err_frame, jsd_frame])

    def classify(self, label: str, test: pd.DataFrame = None):
        """
        Train two SVM classifiers on the two data sets and predict class
        labels for the test data. Return the resulting confusion matrices.

        Parameters
        ----------
        label : str
            classification target; must be one column of the first data
            frame. Both two-class and multi-class labels are supported.
        test : pandas.DataFrame
            test data for the classifiers. If it is not provided, 20% of the
            first data frame is split off as test data.

        Returns
        -------
        One or two confusion matrices as pandas.DataFrame. If the test data
        contains the class label, two matrices are returned (predictions of
        each data set against the actual labels); otherwise a single matrix
        comparing the two predictions is returned, e.g.

                        predicted                       snd
                       male  female                  male  female
        actual  male      1       3    or  fst male     1       2
                female    2       4          female     3       4
        """
        if not self.fst[label].categorical or not self.snd[label].categorical:
            raise ValueError(f'Must classify on a categorical column: {label}')

        # If a test data set is not provided, split 20% of the original data
        # set for testing.
        if test is None:
            fst_train, test = train_test_split(self.fst, test_size=0.2)
            snd_train, _ = train_test_split(self.snd, test_size=0.2)
        else:
            fst_train = self.fst
            snd_train = self.snd

        fst_train_x, fst_train_y = split_feature_class(
            label, self.fst.encode(data=fst_train))
        snd_train_x, snd_train_y = split_feature_class(
            label, self.fst.encode(data=snd_train))
        test_x, test_y = split_feature_class(label, self.fst.encode(data=test))

        # construct the SVM classifiers, and predict on the same test data set
        fst_predict_y = train_and_predict(fst_train_x, fst_train_y, test_x)
        snd_predict_y = train_and_predict(snd_train_x, snd_train_y, test_x)

        columns = self.fst[label].bins
        labels = range(len(columns))
        # If the test data set has the class label for prediction, return two
        # confusion matrices: one for the original data set (self.fst) and
        # one for the synthesized data set (self.snd).
        if label in test:
            fst_matrix = confusion_matrix(test_y, fst_predict_y, labels=labels)
            snd_matrix = confusion_matrix(test_y, snd_predict_y, labels=labels)
            return (pd.DataFrame(fst_matrix, columns=columns, index=columns),
                    pd.DataFrame(snd_matrix, columns=columns, index=columns))
        # If the test data set does not have the class label for prediction,
        # return a matrix comparing their predicted values.
        matrix = confusion_matrix(fst_predict_y, snd_predict_y, labels=labels)
        return pd.DataFrame(matrix, columns=columns, index=columns)

    def to_html(self, buffer, title='Evaluation Report', labels=None,
                test=None):
        """
        Render the evaluation result of the two data sets to an HTML file.

        The result contains:
        + basic information of the two data sets (relative error, and
          Jensen-Shannon divergence (JSD));
        + distribution of each attribute;
        + correlation of pairwise attributes;
        + classification result of an SVM trained on one or more columns
          (defined by the 'labels' parameter and the 'test' data set).

        Parameters
        ----------
        buffer
            buffer to write to
        title : str
            title of the evaluation report
        labels : list of str
            column name, or a list of column names, used for the
            classification task
        test : pandas.DataFrame
            test data for classification and other machine learning tasks
        """
        basics = [self.describe().to_dict('split')]
        svms = self._get_svm_classifier(labels=labels, test=test)
        template = BiFrame._construct_template()
        with open(buffer, 'w+', encoding='utf-8') as file:
            file.write(template.render(title=title, basics=basics,
                                       dists=self._get_dist(),
                                       corrs=self._get_corr(),
                                       svms=svms))

    def _get_svm_classifier(self, labels=None, test=None):
        if labels is None:
            return []
        from ds4ml.utils import plot_confusion_matrix
        svms = []
        for col in labels:
            in_test = (test is not None and col in test) or (test is None)
            if in_test:
                # When the class label is in the test data, match the two
                # predicted results against the actual data, so there will be
                # two confusion matrix diagrams.
                src_matrix, tgt_matrix = self.classify(col, test=test)
                vrange = (min(src_matrix.values.min(),
                              tgt_matrix.values.min()),
                          max(src_matrix.values.max(),
                              tgt_matrix.values.max()))
                path = (plot_confusion_matrix(src_matrix, vrange=vrange,
                                              xlabel='raw', ylabel='actual'),
                        plot_confusion_matrix(tgt_matrix, vrange=vrange,
                                              xlabel='synth',
                                              ylabel='actual'))
                svms.append({'column': col, 'path': path})
            else:
                # Otherwise, compare the two predicted results.
                matrix = self.classify(col, test=test)
                # make path's type a 1-tuple
                path = (plot_confusion_matrix(matrix, xlabel='synth',
                                              ylabel='raw'),)
                svms.append({'column': col, 'path': path})
        return svms

    @staticmethod
    def _construct_template():
        """
        Construct the report template from an HTML file.
        """
        from mako.template import Template
        import os
        old_cwd = os.getcwd()
        os.chdir(os.path.dirname(__file__))
        template = Template(filename='template/report.html')
        os.chdir(old_cwd)
        return template

    def _get_dist(self):
        """
        Return the distribution information of each column.
        """
        from ds4ml.utils import plot_histogram
        dists = []
        for col in self.columns:
            bins, counts = self.dist(col)
            svg = plot_histogram(bins, counts)
            dists.append({'name': col, 'columns': bins, 'data': counts,
                          'path': svg})
        return dists

    def _get_corr(self):
        """
        Return the pairwise correlation of the two data sets.
        """
        from ds4ml.utils import plot_heatmap
        corrs = []
        fst_mi, snd_mi = self.corr()
        fst_svg = plot_heatmap(fst_mi)
        snd_svg = plot_heatmap(snd_mi)
        corrs.append({'matrix': fst_mi.to_dict('split'), 'path': fst_svg})
        corrs.append({'matrix': snd_mi.to_dict('split'), 'path': snd_svg})
        return corrs
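# Minimal usage sketch for BiFrame (illustrative only: the import path, file
# names, and column names below are assumptions, not taken from this code).
if __name__ == '__main__':
    import pandas as pd
    from ds4ml.evaluator import BiFrame  # hypothetical module path

    first = pd.read_csv('original.csv')       # hypothetical input files
    second = pd.read_csv('synthesized.csv')
    bf = BiFrame(first, second, categories=['sex', 'salary'])
    print(bf.describe())                      # per-column 'err' and 'jsd'
    bins, counts = bf.dist('age')             # stacked counts of both sets
    bf.to_html('report.html', labels=['salary'])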
def main():
    parser = argparse.ArgumentParser(
        description='Serialize patterns of a dataset anonymously',
        formatter_class=CustomFormatter,
        add_help=False)
    parser.add_argument('file', help='set path of a csv file to be patterned '
                                     'anonymously')

    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    group.add_argument('--pseudonym', metavar='LIST',
                       help='set candidate columns separated by a comma, '
                            'which will be replaced with a pseudonym. It '
                            'only works on string columns.')
    group.add_argument('--delete', metavar='LIST',
                       help='set columns separated by a comma, which will be '
                            'deleted when synthesizing.')
    group.add_argument('--na-values', metavar='LIST',
                       help='set additional values to recognize as NA/NaN '
                            '(default null values are from pandas.read_csv)')
    group.add_argument('-o', '--output', metavar='FILE',
                       help="set the file name of the anonymous patterns "
                            "(default is the input file name with the suffix "
                            "'-pattern.json')")
    group.add_argument('--no-header', action='store_true',
                       help='indicate there is no header in the CSV file, and '
                            'take [#0, #1, #2, ...] as the header (default: '
                            'the tool will try to detect it and take action)')
    group.add_argument('--sep', metavar='STRING',
                       help='specify the delimiter of the input file')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('-e', '--epsilon', metavar='FLOAT', type=float,
                       default=0.1,
                       help='set epsilon for differential privacy '
                            '(default 0.1)')
    group.add_argument('--category', metavar='LIST',
                       help='set categorical columns separated by a comma.')

    args = parser.parse_args()

    start = time.time()
    pseudonyms = str_to_list(args.pseudonym)
    deletes = str_to_list(args.delete)
    categories = str_to_list(args.category)
    na_values = str_to_list(args.na_values)
    header = None if args.no_header else 'infer'
    sep = ',' if args.sep is None else args.sep
    data = read_data_from_csv(args.file, na_values=na_values, header=header,
                              sep=sep)

    def complement(attrs, full):
        return set(attrs or []) - set(full)

    # check parameters: pseudonyms, deletes, categories
    comp = complement(pseudonyms, data.columns)
    if comp:
        parser.exit(message=f'--pseudonym columns: {comp} are not in the csv '
                            f'file.')
    comp = complement(deletes, data.columns)
    if comp:
        parser.exit(message=f'--delete columns: {comp} are not in the csv '
                            f'file.')
    comp = complement(categories, data.columns)
    if comp:
        parser.exit(message=f'--category columns: {comp} are not in the csv '
                            f'file.')

    dataset = DataSet(data, categories=categories)
    if args.output is None:
        name = file_name(args.file)
        args.output = f'{name}-pattern.json'
    dataset.to_pattern(path=args.output, epsilon=args.epsilon,
                       deletes=deletes, pseudonyms=pseudonyms, retains=[])

    duration = time.time() - start
    print(f'Analyzed and serialized the patterns of {args.file} to '
          f'{args.output} in {round(duration, 2)} seconds.')
def main():
    parser = argparse.ArgumentParser(
        description='Synthesize one dataset by differential privacy',
        formatter_class=CustomFormatter,
        add_help=False)
    parser.add_argument('file', help='set path of a csv file to be '
                                     'synthesized, or path of a pattern file '
                                     'to generate from')

    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    group.add_argument('--pseudonym', metavar='LIST',
                       help='set candidate columns separated by a comma, '
                            'which will be replaced with a pseudonym. It '
                            'only works on string columns.')
    group.add_argument('--delete', metavar='LIST',
                       help='set columns separated by a comma, which will be '
                            'deleted when synthesizing.')
    group.add_argument('--na-values', metavar='LIST',
                       help='set additional values to recognize as NA/NaN '
                            '(default null values are from pandas.read_csv)')
    group.add_argument('-o', '--output', metavar='FILE',
                       help="set the file name of the output synthesized "
                            "dataset (default is the input file name with "
                            "the suffix '-a.csv')")
    group.add_argument('--no-header', action='store_true',
                       help='indicate there is no header in the CSV file, and '
                            'take [#0, #1, #2, ...] as the header (default: '
                            'the tool will try to detect it and take action)')
    group.add_argument('--records', metavar='INT', type=int,
                       help='specify the number of records to generate '
                            '(default is the same number of records as the '
                            'original dataset)')
    group.add_argument('--sep', metavar='STRING', default=',',
                       help='specify the delimiter of the input file')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('-e', '--epsilon', metavar='FLOAT', type=float,
                       default=0.1,
                       help='set epsilon for differential privacy '
                            '(default 0.1)')
    group.add_argument('--category', metavar='LIST',
                       help='set categorical columns separated by a comma.')
    group.add_argument('--retain', metavar='LIST',
                       help='set columns to retain the values')

    args = parser.parse_args()

    start = time.time()
    pseudonyms = str_to_list(args.pseudonym)
    deletes = str_to_list(args.delete)
    categories = str_to_list(args.category)
    na_values = str_to_list(args.na_values)
    retains = str_to_list(args.retain)
    header = None if args.no_header else 'infer'

    # check the file type from its extension
    is_pattern = ends_with_json(args.file)
    if is_pattern:
        if retains is not None and len(retains) != 0:
            parser.exit(message='The --retain option is not supported when '
                                'synthesizing from a pattern file.')
        # construct the DataSet from a pattern file
        dataset = DataSet.from_pattern(args.file)
    else:
        data = read_data_from_csv(args.file, na_values=na_values,
                                  header=header, sep=args.sep)

        def complement(attrs, full):
            return set(attrs or []) - set(full)

        # check parameters: pseudonyms, deletes, categories
        comp = complement(pseudonyms, data.columns)
        if comp:
            parser.exit(message=f'--pseudonym columns: {comp} are not in the '
                                f'csv file.')
        comp = complement(deletes, data.columns)
        if comp:
            parser.exit(message=f'--delete columns: {comp} are not in the '
                                f'csv file.')
        comp = complement(categories, data.columns)
        if comp:
            parser.exit(message=f'--category columns: {comp} are not in the '
                                f'csv file.')
        dataset = DataSet(data, categories=categories)

    synthesized = dataset.synthesize(epsilon=args.epsilon,
                                     pseudonyms=pseudonyms, deletes=deletes,
                                     retains=retains, records=args.records)
    if args.output is None:
        name = file_name(args.file)
        args.output = f'{name}-a.csv'
    synthesized.to_csv(args.output, index=False, sep=args.sep)

    duration = time.time() - start
    print(f'Synthesized from {args.file} to file {args.output} in '
          f'{round(duration, 2)} seconds.')
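# Sketch of the pattern round trip that the two CLIs above wire together
# (illustrative only: the import path and file names are assumptions).
if __name__ == '__main__':
    import pandas as pd
    from ds4ml.dataset import DataSet  # hypothetical module path

    data = pd.read_csv('adults.csv')                      # hypothetical file
    dataset = DataSet(data, categories=['sex', 'salary'])
    # serialize anonymous patterns under a differential privacy budget
    dataset.to_pattern(path='adults-pattern.json', epsilon=0.1,
                       deletes=[], pseudonyms=[], retains=[])
    # later, rebuild a DataSet from the pattern file and synthesize from it
    rebuilt = DataSet.from_pattern('adults-pattern.json')
    synthesized = rebuilt.synthesize(epsilon=0.1)
    synthesized.to_csv('adults-a.csv', index=False)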
def main():
    parser = argparse.ArgumentParser(
        description='Synthesize one dataset by Differential Privacy',
        formatter_class=CustomFormatter,
        add_help=False)
    parser.add_argument('file', help='set path of the CSV to be synthesized')

    # optional arguments
    group = parser.add_argument_group('general arguments')
    group.add_argument("-h", "--help", action="help",
                       help="show this help message and exit")
    group.add_argument('--pseudonym', metavar='LIST',
                       help='set candidate columns separated by a comma, '
                            'which will be replaced with a pseudonym. It '
                            'only works on string columns.')
    group.add_argument('--delete', metavar='LIST',
                       help='set columns separated by a comma, which will be '
                            'deleted when synthesizing.')
    group.add_argument('--na-values', metavar='LIST',
                       help='set additional values to recognize as NA/NaN '
                            '(default null values are from pandas.read_csv)')
    group.add_argument('-o', '--output', metavar='FILE',
                       help="set the file name of the output synthesized "
                            "dataset (default is the input file name with "
                            "the suffix '_a')")
    group.add_argument('--no-header', action='store_true',
                       help='indicate there is no header in the CSV file, and '
                            'take [#0, #1, #2, ...] as the header (default: '
                            'the tool will try to detect it and take action)')

    group = parser.add_argument_group('advanced arguments')
    group.add_argument('-e', '--epsilon', metavar='FLOAT', type=float,
                       default=0.1,
                       help='set epsilon for differential privacy '
                            '(default 0.1)')
    group.add_argument('--category', metavar='LIST',
                       help='set categorical columns separated by a comma.')
    group.add_argument('--retain', metavar='LIST',
                       help='set columns to retain the values')

    args = parser.parse_args()

    start = time.time()
    pseudonyms = str_to_list(args.pseudonym)
    deletes = str_to_list(args.delete)
    categories = str_to_list(args.category)
    na_values = str_to_list(args.na_values)
    retains = str_to_list(args.retain)
    header = None if args.no_header else 'infer'
    data = read_data_from_csv(args.file, na_values=na_values, header=header)

    def complement(attrs, full):
        return set(attrs or []) - set(full)

    # check parameters: pseudonyms, deletes, categories
    comp = complement(pseudonyms, data.columns)
    if comp:
        parser.exit(message=f'--pseudonym columns: {comp} are not in the csv '
                            f'file.')
    comp = complement(deletes, data.columns)
    if comp:
        parser.exit(message=f'--delete columns: {comp} are not in the csv '
                            f'file.')
    comp = complement(categories, data.columns)
    if comp:
        parser.exit(message=f'--category columns: {comp} are not in the csv '
                            f'file.')

    dataset = DataSet(data, categories=categories)
    synthesized = dataset.synthesize(epsilon=args.epsilon,
                                     pseudonyms=pseudonyms, deletes=deletes,
                                     retains=retains)
    if args.output is None:
        name = file_name(args.file)
        args.output = f'{name}_a.csv'
    synthesized.to_csv(args.output, index=False)

    duration = time.time() - start
    print(f'Synthesized data {args.output} in {round(duration, 2)} seconds.')