def test_eval_matches_valid_2(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    num_ones = 1
    num_zeros = len(C1) - num_ones
    gold = [0] * num_ones
    gold.extend([1] * num_zeros)
    predicted = [1] * (num_zeros + num_ones)
    ln = len(C1.columns)
    C1.insert(ln, 'gold', gold)
    C1.insert(ln + 1, 'predicted', predicted)
    cm.copy_properties(C, C1)
    result = eval_matches(C1, 'predicted', 'gold')

    self.assertEqual(isinstance(result, dict), True)
    self.assertEqual(result['prec_numerator'], 14)
    self.assertEqual(result['prec_denominator'], 14)
    self.assertAlmostEqual(result['precision'], 1)
    self.assertEqual(result['recall_numerator'], 14)
    self.assertEqual(result['recall_denominator'], 15)
    self.assertEqual(result['recall'], 0.9333333333333333)
    self.assertEqual(result['f1'], 0.9655172413793104)
    self.assertEqual(result['pred_pos_num'], 14)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(result['pred_neg_num'], 1)
    self.assertEqual(result['false_neg_num'], 1.0)
    self.assertEqual(len(result['false_neg_ls']), 1)
    t = result['false_neg_ls'][0]
    self.assertEqual(t[0], 'a1')
    self.assertEqual(t[1], 'b1')
def _post_process_labelled_table(input_table, labeled_table, col_name):
    """
    This function post-processes the labeled table and updates the catalog.
    Specifically, it validates that the label column contains only 0's and
    1's, and finally copies the properties from the input table to the
    output table.
    """
    # Cast the label values to int, as they will initially be strings when
    # they come from the GUI.
    labeled_table[col_name] = labeled_table[col_name].astype(int)

    # Check whether the label column contains only 0's and 1's.
    label_value_with_1 = labeled_table[col_name] == 1
    label_value_with_0 = labeled_table[col_name] == 0
    sum_of_labels = sum(label_value_with_1 | label_value_with_0)

    # If the column contains values other than 0 and 1, raise an error.
    if not sum_of_labels == len(labeled_table):
        logger.error('The label column contains values other than 0 and 1')
        raise AssertionError(
            'The label column contains values other than 0 and 1')

    # Copy the properties from the input table to the label table.
    # Note: we don't have to check the integrity of 'key' here because the
    # key column is not tampered with relative to the input table.
    cm.init_properties(labeled_table)
    cm.copy_properties(input_table, labeled_table)

    # Return the label table.
    return labeled_table
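# A minimal usage sketch for _post_process_labelled_table, assuming a candidate
# set loaded via read_csv_metadata as in the tests in this file; the column
# name 'label' and the string value are illustrative.
A = read_csv_metadata(path_a)
B = read_csv_metadata(path_b, key='ID')
C = read_csv_metadata(path_c, ltable=A, rtable=B)
labeled = C.copy()
labeled['label'] = '0'  # the GUI hands labels back as strings
labeled = _post_process_labelled_table(C, labeled, 'label')
# After post-processing, the label column is int-typed and holds only 0's/1's,
# and the catalog properties of C have been copied over.
assert set(labeled['label']) <= {0, 1}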
def test_eval_matches_valid_3(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    num_ones = len(C1)
    num_zeros = len(C1) - num_ones
    gold = [0] * num_ones
    # gold.extend([1] * num_zeros)
    predicted = [1] * (num_zeros + num_ones)
    ln = len(C1.columns)
    C1.insert(ln, 'gold', gold)
    C1.insert(ln + 1, 'predicted', predicted)
    D = pd.DataFrame(columns=C1.columns)
    cm.copy_properties(C, D)
    result = eval_matches(D, 'gold', 'predicted')

    self.assertEqual(isinstance(result, dict), True)
    self.assertEqual(result['prec_numerator'], 0)
    self.assertEqual(result['prec_denominator'], 0)
    self.assertAlmostEqual(result['precision'], 0)
    self.assertEqual(result['recall_numerator'], 0)
    self.assertEqual(result['recall_denominator'], 0)
    self.assertEqual(result['recall'], 0)
    self.assertEqual(result['f1'], 0)
    self.assertEqual(result['pred_pos_num'], 0)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(result['pred_neg_num'], 0)
    self.assertEqual(result['false_neg_num'], 0.0)
    self.assertEqual(len(result['false_neg_ls']), 0)
def predict(self, x=None, table=None, exclude_attrs=None, target_attr=None,
            append=False, inplace=True):
    if x is not None:
        y = self.predict_sklearn(x)
        # if table is not None and target_attr is not None and append is True:
        #     if inplace:
        #         table[target_attr] = y
        #         return table
        #     else:
        #         tbl = table.copy()
        #         tbl[target_attr] = y
        #         return tbl
    elif table is not None and exclude_attrs is not None:
        y = self.predict_ex_attrs(table, exclude_attrs)
        if target_attr is not None and append is True:
            if inplace:
                table[target_attr] = y
                return table
            else:
                tbl = table.copy()
                tbl[target_attr] = y
                cm.copy_properties(table, tbl)
                return tbl
    else:
        raise SyntaxError('The arguments supplied do not match '
                          'the supported signatures')
    return y
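# A hedged sketch of the two call signatures predict() supports. `matcher` is
# assumed to be a fitted Magellan ML matcher, H a feature-vector table with the
# metadata columns used in the tests above, and `feature_matrix` a bare numpy
# array; all of these names are illustrative.
H_with_preds = matcher.predict(table=H,
                               exclude_attrs=['_id', 'ltable_ID',
                                              'rtable_ID', 'gold'],
                               target_attr='predicted', append=True,
                               inplace=False)
# With inplace=False a copy is returned, with the catalog properties of H
# carried over to it.
y = matcher.predict(x=feature_matrix)  # returns the raw predictions only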
def add_output_attributes(candset, l_output_attrs=None, r_output_attrs=None,
                          l_output_prefix='ltable_', r_output_prefix='rtable_',
                          validate=True, copy_props=True,
                          delete_from_catalog=True, verbose=False):
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)
    if validate:
        cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                         ltable, rtable, l_key, r_key,
                                         logger, verbose)
    index_values = candset.index

    df = _add_output_attributes(candset, fk_ltable, fk_rtable, ltable, rtable,
                                l_key, r_key, l_output_attrs, r_output_attrs,
                                l_output_prefix, r_output_prefix,
                                validate=False)

    df.set_index(index_values, inplace=True)
    if copy_props:
        cm.init_properties(df)
        cm.copy_properties(candset, df)
        if delete_from_catalog:
            cm.del_all_properties(candset)
    return df
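# Illustrative call, assuming C is a candidate set whose catalog entries point
# at ltable A and rtable B; the 'name' attribute is hypothetical.
C_out = add_output_attributes(C, l_output_attrs=['name'],
                              r_output_attrs=['name'],
                              l_output_prefix='ltable_',
                              r_output_prefix='rtable_')
# C_out now carries ltable_name and rtable_name columns and, with the default
# flags, the catalog properties are moved from C to C_out.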
def test_copy_properties_update_false_2(self):
    A = read_csv_metadata(path_a)
    A1 = pd.read_csv(path_a)
    cm.copy_properties(A, A1, update=False)
    p = cm.get_all_properties(A)
    p1 = cm.get_all_properties(A1)
    self.assertEqual(p, p1)
    self.assertEqual(cm.get_key(A1), cm.get_key(A))
def test_copy_properties_valid_1(self):
    A = read_csv_metadata(path_a)
    A1 = pd.read_csv(path_a)
    cm.copy_properties(A, A1)
    self.assertEqual(cm.is_dfinfo_present(A1), True)
    p = cm.get_all_properties(A)
    p1 = cm.get_all_properties(A1)
    self.assertEqual(p, p1)
    self.assertEqual(cm.get_key(A1), cm.get_key(A))
def train_test_split(labeled_data, train_proportion=0.5, random_state=None,
                     verbose=True):
    if not isinstance(labeled_data, pd.DataFrame):
        logger.error('Input table is not of type dataframe')
        raise AssertionError('Input table is not of type dataframe')

    log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
                     'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(labeled_data, logger, verbose)

    # # validate metadata
    cm.validate_metadata_for_candset(labeled_data, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key,
                                     logger, verbose)

    num_rows = len(labeled_data)
    assert 0 <= train_proportion <= 1, \
        'Train proportion is expected to be between 0 and 1'
    assert num_rows > 0, 'The input table is empty'

    train_size = int(math.floor(num_rows * train_proportion))
    test_size = int(num_rows - train_size)

    # use scikit-learn to split the data
    idx_values = pd.np.array(labeled_data.index.values)
    idx_train, idx_test = cv.train_test_split(idx_values, test_size=test_size,
                                              train_size=train_size,
                                              random_state=random_state)

    # construct output tables
    lbl_train = labeled_data.ix[idx_train]
    lbl_test = labeled_data.ix[idx_test]

    # update catalog
    cm.init_properties(lbl_train)
    cm.copy_properties(labeled_data, lbl_train)

    cm.init_properties(lbl_test)
    cm.copy_properties(labeled_data, lbl_test)

    # return output tables
    result = OrderedDict()
    result['train'] = lbl_train
    result['test'] = lbl_test
    return result
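# A typical split, assuming `labeled` is a labeled candidate set registered in
# the catalog as in the tests in this file; the proportion and seed are
# illustrative.
splits = train_test_split(labeled, train_proportion=0.7, random_state=0)
train, test = splits['train'], splits['test']
# Both halves keep the candidate-set metadata, so they can feed directly into
# feature extraction and matching.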
def _test_label_table(self, table, col_name, label_values):
    _validate_inputs(table, col_name, verbose=False)
    lbl_table = _init_label_table(table, col_name)

    from magellan.gui.table_gui import edit_table
    edit_table(lbl_table, show_flag=False)

    new_table = lbl_table.copy()
    cm.copy_properties(table, new_table)
    lbl_table = new_table
    lbl_table[col_name] = label_values

    lbl_table = _post_process_labelled_table(table, lbl_table, col_name)
    return lbl_table
def test_copy_properties_valid_2(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b)
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = pd.read_csv(path_c)
    cm.copy_properties(C, C1)
    self.assertEqual(cm.is_dfinfo_present(C1), True)
    p = cm.get_all_properties(C)
    p1 = cm.get_all_properties(C1)
    self.assertEqual(p, p1)
    self.assertEqual(cm.get_key(C1), cm.get_key(C))
    self.assertEqual(cm.get_ltable(C1).equals(A), True)
    self.assertEqual(cm.get_rtable(C1).equals(B), True)
    self.assertEqual(cm.get_fk_ltable(C1), cm.get_fk_ltable(C))
    self.assertEqual(cm.get_fk_rtable(C1), cm.get_fk_rtable(C))
def test_eval_matches_predicted_attr_not_in_df(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    num_ones = 1
    num_zeros = len(C1) - num_ones
    gold = [0] * num_ones
    gold.extend([1] * num_zeros)
    predicted = [1] * (num_zeros + num_ones)
    ln = len(C1.columns)
    C1.insert(ln, 'gold', gold)
    C1.insert(ln + 1, 'predicted', predicted)
    cm.copy_properties(C, C1)
    result = eval_matches(C1, 'gold', 'predicted1')
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=True):
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # validate input parameters
    if attrs_before is not None:
        if not check_attrs_present(candset, attrs_before):
            logger.error('The attributes mentioned in attrs_before are not '
                         'present in the input table')
            raise AssertionError('The attributes mentioned in attrs_before '
                                 'are not present in the input table')

    if attrs_after is not None:
        if not check_attrs_present(candset, attrs_after):
            logger.error('The attributes mentioned in attrs_after are not '
                         'present in the input table')
            raise AssertionError('The attributes mentioned in attrs_after '
                                 'are not present in the input table')

    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
                     'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm.validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key,
                                     logger, verbose)

    # extract features
    id_list = [(r[fk_ltable], r[fk_rtable]) for i, r in candset.iterrows()]

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # apply feature functions
    feat_vals = [apply_feat_fns(l_df.ix[x[0]], r_df.ix[x[1]], feature_table)
                 for x in id_list]

    # construct output table
    table = pd.DataFrame(feat_vals)

    # # rearrange the feature names in the given order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # # insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            table.insert(0, a, candset[a])

    # # insert keys
    table.insert(0, fk_rtable, candset[fk_rtable])
    table.insert(0, fk_ltable, candset[fk_ltable])
    table.insert(0, key, candset[key])

    # # insert attrs_after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(table.columns)
        for a in attrs_after:
            table.insert(col_pos, a, candset[a])
            col_pos += 1

    # reset the index
    table.reset_index(inplace=True, drop=True)

    # # update the catalog
    cm.init_properties(table)
    cm.copy_properties(candset, table)

    return table
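# Hedged usage sketch: `feature_table` is assumed to come from Magellan's
# feature-generation step (not shown in this file), and the attribute names
# are illustrative.
H = extract_feature_vecs(C, feature_table=feature_table,
                         attrs_before=['ltable_name'],
                         attrs_after=['gold'], verbose=False)
# Per the insertion order above, H starts with the key and foreign-key
# columns, then attrs_before, then one column per feature, with attrs_after
# at the end.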
import magellan as mg
import pandas as pd
import magellan.catalog.catalog_manager as cm

A = mg.load_dataset('table_A')
B = pd.read_csv('../magellan/datasets/B.csv')
cm.init_properties(B)
cm.copy_properties(A, B)
print('hi')
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Sample a pandas DataFrame (for labeling purposes).

    This function samples a DataFrame, typically for labeling purposes. It
    expects the input DataFrame to contain the metadata of a candidate set
    (such as key, fk_ltable, fk_rtable, ltable, rtable). Specifically, this
    function creates a copy of the input DataFrame, samples the data using
    uniform random sampling (it uses numpy's 'random' function to sample)
    and returns the sampled DataFrame. Further, it also copies the
    properties from the input DataFrame to the output DataFrame.

    Args:
        table (DataFrame): Input DataFrame to be sampled. Specifically, a
            DataFrame containing the metadata of a candidate set (such as
            key, fk_ltable, fk_rtable, ltable, rtable) in the catalog.
        sample_size (int): Number of samples to be picked from the input
            DataFrame.
        replace (boolean): Flag to indicate whether sampling should be done
            with replacement or not (defaults to False).
        verbose (boolean): Flag to indicate whether more detailed
            information about the execution steps should be printed out
            (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' number of rows. Further, this
        function sets the output DataFrame's properties to be the same as
        those of the input DataFrame.

    Raises:
        AssertionError: If the input table is not of type pandas DataFrame.
        AssertionError: If the input DataFrame size is 0.
        AssertionError: If the sample_size is greater than the input
            DataFrame size.

    Notes:
        As mentioned in the description above, the output DataFrame is
        updated (in the catalog) with the properties of the input DataFrame.
        A subtle point to note here is that when the replace flag is set to
        True, the output DataFrame can contain duplicate keys. In that case,
        this function will not set the key, and it is up to the user to fix
        it after the function returns.
    """
    # Validate input parameters.

    # # The input DataFrame is expected to be of type pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type pandas dataframe')
        raise AssertionError('Input table is not of type pandas dataframe')

    # # There should be at least one row to sample from.
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # # The sample size should be less than or equal to the number of rows
    # # in the input DataFrame.
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Now, validate the metadata for the input DataFrame, as we have to copy
    # these properties to the output DataFrame.

    # # First, display what metadata is required for this function.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata.
    cm.validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                     ltable, rtable, l_key, r_key,
                                     logger, verbose)

    # Get the sample set for the output table.
    sample_indices = pd.np.random.choice(len(table), sample_size,
                                         replace=replace)
    # Sort the indices, ordered by index value.
    sample_indices = sorted(sample_indices)
    sampled_table = table.iloc[list(sample_indices)]

    # Copy the properties.
    cm.init_properties(sampled_table)

    # # If replace is set to True, then we should check the validity of the
    # # key before setting it.
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                # Check the validity of the key before setting it.
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is.
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table.
    return sampled_table
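# Example drawn from the docstring above: draw 10 rows uniformly at random
# without replacement, assuming C is a candidate set registered in the
# catalog; the sampled table inherits C's catalog properties.
S = sample_table(C, sample_size=10, replace=False)
assert len(S) == 10
# With replace=True the sample may contain duplicate keys, in which case the
# key property is left unset and restoring it is the caller's responsibility.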
def test_copy_properties_invalid_tar_df(self):
    A = read_csv_metadata(path_a)
    cm.copy_properties(A, None)
def test_copy_properties_invalid_src_df(self):
    A = read_csv_metadata(path_a)
    cm.copy_properties(None, A)
def test_copy_properties_update_false_1(self):
    A = read_csv_metadata(path_a)
    A1 = read_csv_metadata(path_a)
    status = cm.copy_properties(A, A1, update=False)
    self.assertEqual(status, False)
def test_copy_properties_src_df_notin_catalog(self):
    A = pd.read_csv(path_a)
    A1 = pd.read_csv(path_a)
    cm.copy_properties(A, A1)