def down_sample(ltable, rtable, size, y):
    """
    Down sample a pair of tables

    Parameters
    ----------
    ltable : MTable, left input table
    rtable : MTable, right input table
    size : number, sampling budget; floor(size / y) rows (capped at the
        table length) are drawn at random, without replacement, from the
        table that _order_tables returns second
    y : number, divisor applied to size; also forwarded to _probe_index

    Returns
    -------
    l_sampled, r_sampled : MTable, rows selected from ltable and rtable,
        each built with the key of its source table and sharing its
        properties
    """
    # NOTE(review): s_/b_ presumably stand for the smaller/bigger table -
    # confirm against _order_tables
    s_table, b_table, is_swapped = _order_tables(ltable, rtable)
    s_inv_index = _inv_index(s_table)

    # np.random.choice needs an integer sample size; math.floor returns a
    # float on Python 2
    b_sample_size = min(int(math.floor(size / y)), len(b_table))
    b_tbl_indices = np.random.choice(len(b_table), b_sample_size,
                                     replace=False)

    # the drawn values are row positions, so select positionally (.iloc),
    # consistent with how the final samples are taken below
    s_tbl_indices = _probe_index(b_table.iloc[b_tbl_indices], y,
                                 len(s_table), s_inv_index)

    # _order_tables reported a swap: exchange the index sets so the first
    # one is applied to ltable and the second one to rtable
    if is_swapped:
        s_tbl_indices, b_tbl_indices = b_tbl_indices, s_tbl_indices

    l_sampled = MTable(ltable.iloc[list(s_tbl_indices)], ltable.get_key())
    l_sampled.properties = ltable.properties
    r_sampled = MTable(rtable.iloc[list(b_tbl_indices)], rtable.get_key())
    r_sampled.properties = rtable.properties
    return l_sampled, r_sampled
def sample_table(table, size, replace=False):
    """
    Draw a random sample of rows from an MTable

    Parameters
    ----------
    table : MTable, input table to be sampled
    size : int, number of rows to draw
    replace : boolean, whether rows are drawn with replacement.
        By default, it is set to False.

    Returns
    -------
    sampled_table : MTable, the drawn rows in ascending row position,
        built with the key of the input table and sharing its properties

    Raises
    ------
    AttributeError : if the table is empty, or if size is larger than the
        number of rows in the table
    """
    n_rows = len(table)
    if n_rows == 0:
        raise AttributeError('size of table is 0')
    if size > n_rows:
        raise AttributeError('sample size is larger than input table size')

    # draw row positions and order them, just so the sample has an order
    positions = sorted(np.random.choice(n_rows, size, replace=replace))

    result = MTable(table.iloc[positions], key=table.get_key())
    result.properties = table.properties
    return result
def down_sample(ltable, rtable, size, y):
    """
    Down sample a pair of tables

    Parameters
    ----------
    ltable : MTable, left input table
    rtable : MTable, right input table
    size : number, sampling budget; floor(size / y) rows (capped at the
        table length) are drawn at random, without replacement, from the
        table that _order_tables returns second
    y : number, divisor applied to size; also forwarded to _probe_index

    Returns
    -------
    l_sampled, r_sampled : MTable, rows selected from ltable and rtable,
        each built with the key of its source table and sharing its
        properties
    """
    # NOTE(review): s_/b_ presumably stand for the smaller/bigger table -
    # confirm against _order_tables
    s_table, b_table, is_swapped = _order_tables(ltable, rtable)
    s_inv_index = _inv_index(s_table)

    # np.random.choice needs an integer sample size; math.floor returns a
    # float on Python 2
    b_sample_size = min(int(math.floor(size / y)), len(b_table))
    b_tbl_indices = list(
        np.random.choice(len(b_table), b_sample_size, replace=False))

    # the drawn values are row positions, so select positionally (.iloc),
    # consistent with how the final samples are taken below
    s_tbl_indices = list(
        _probe_index(b_table.iloc[b_tbl_indices], y, len(s_table),
                     s_inv_index))

    # _order_tables reported a swap: exchange the index sets so the first
    # one is applied to ltable and the second one to rtable
    if is_swapped:
        s_tbl_indices, b_tbl_indices = b_tbl_indices, s_tbl_indices

    l_sampled = MTable(ltable.iloc[list(s_tbl_indices)],
                       key=ltable.get_key())
    l_sampled.properties = ltable.properties
    r_sampled = MTable(rtable.iloc[list(b_tbl_indices)],
                       key=rtable.get_key())
    r_sampled.properties = rtable.properties
    return l_sampled, r_sampled
def down_sample(s_table, b_table, size, y):
    """
    Down sample a pair of tables, printing the time taken by each phase

    Parameters
    ----------
    s_table : MTable, table that is indexed with _inv_index and from which
        the rows returned by _probe_index are taken
    b_table : MTable, table from which rows are drawn at random
    size : number, sampling budget; clamped to len(b_table) (with a
        printed warning) when larger, after which floor(size / y) rows
        are drawn from b_table without replacement
    y : number, divisor applied to size; also forwarded to _probe_index

    Returns
    -------
    l_sampled, r_sampled : MTable, rows selected from s_table and b_table,
        each built with the key of its source table and sharing its
        properties
    """
    # single-argument print(...) emits the same text on Python 2 and is
    # also valid syntax on Python 3
    if len(b_table) < size:
        print('Warning!! size of table B is less than b_size parameter '
              '- using entire table B')
        size = len(b_table)

    t1 = time.time()
    s_inv_index = _inv_index(s_table)
    print('Inverted Index Time: ')
    print(int(time.time() - t1))

    # np.random.choice needs an integer sample size; math.floor returns a
    # float on Python 2
    b_sample_size = min(int(math.floor(size / y)), len(b_table))
    b_tbl_indices = list(
        np.random.choice(len(b_table), b_sample_size, replace=False))

    t1 = time.time()
    # the drawn values are row positions, so select positionally (.iloc),
    # consistent with how the final samples are taken below
    s_tbl_indices = _probe_index(b_table.iloc[b_tbl_indices], y,
                                 len(s_table), s_inv_index)
    print('Probe Index Time: ')
    print(int(time.time() - t1))

    s_tbl_indices = list(s_tbl_indices)
    l_sampled = MTable(s_table.iloc[s_tbl_indices], key=s_table.get_key())
    l_sampled.properties = s_table.properties
    r_sampled = MTable(b_table.iloc[b_tbl_indices], key=b_table.get_key())
    r_sampled.properties = b_table.properties
    return l_sampled, r_sampled
def label_table(tbl, col_name, replace=True):
    """
    Label training data

    Parameters
    ----------
    tbl : MTable, Table to be labeled
    col_name : String, Name of the label column
    replace : Boolean, specifies whether the column with the given
        'col_name' must be overwritten, if it already exists.
        [This option is currently experimental].

    Returns
    -------
    result : MTable, Copy of the input table with the label column cast
        to int, built with the key of the input table and sharing its
        properties

    Raises
    ------
    AssertionError : if, after editing, the label column contains values
        other than 0 and 1

    Notes
    -----
    The label value is expected to be only 0 or 1.
    """
    # NOTE(review): edit is imported here but the call below goes through
    # mg.edit - presumably this import only checks that the GUI module
    # loads; confirm before removing it
    from magellan.gui.mtable_gui import edit

    table = tbl.copy()
    if col_name not in table.columns:
        table[col_name] = 0
    elif replace:
        logging.getLogger(__name__).warning(
            'Input table already contains column %s. ', col_name)
        table[col_name] = 0
    # an existing column is left untouched when replace is false

    mg.edit(table)
    table[col_name] = table[col_name].astype(int)

    # check that the label column contains only 0s and 1s; raised
    # explicitly (not via an assert statement) so the check is still
    # enforced under python -O
    if not table[col_name].isin([0, 1]).all():
        raise AssertionError(
            'The label column contains values other than 0 and 1')

    table = MTable(table, key=tbl.get_key())
    table.properties = tbl.properties
    return table
def label_table(tbl, col_name, replace=True):
    """
    Label training data

    Parameters
    ----------
    tbl : MTable, Table to be labeled
    col_name : String, Name of the label column
    replace : Boolean, specifies whether the column with the given
        'col_name' must be overwritten, if it already exists.
        [This option is currently experimental].

    Returns
    -------
    result : MTable, Copy of the input table with the label column cast
        to int, built with the key of the input table and sharing its
        properties

    Raises
    ------
    AssertionError : if, after editing, the label column contains values
        other than 0 and 1

    Notes
    -----
    The label value is expected to be only 0 or 1.
    """
    # NOTE(review): edit is imported here but the call below goes through
    # mg.edit - presumably this import only checks that the GUI module
    # loads; confirm before removing it
    from magellan.gui.mtable_gui import edit

    table = tbl.copy()
    column_exists = col_name in table.columns
    if column_exists and replace:
        logging.getLogger(__name__).warning(
            'Input table already contains column %s. ', col_name)
    if replace or not column_exists:
        # start from an all-zero label column; an existing column is only
        # kept when replace is false
        table[col_name] = 0

    mg.edit(table)
    table[col_name] = table[col_name].astype(int)

    # check that the label column contains only 0s and 1s; raised
    # explicitly (not via an assert statement) so the check is still
    # enforced under python -O
    labels = table[col_name]
    if not labels.isin([0, 1]).all():
        raise AssertionError(
            'The label column contains values other than 0 and 1')

    table = MTable(table, key=tbl.get_key())
    table.properties = tbl.properties
    return table
def extract_feature_vecs(s, attrs_before=None, feature_table=None,
                         attrs_after=None):
    """
    Extract feature vectors

    Parameters
    ----------
    s : MTable,
        labeled virtual MTable or combined blocker output
    attrs_before : list, defaults to None
        List of attribute names from "s" to be included in output table
        before the feature vector
    feature_table : pandas DataFrame, defaults to None
        List of features to be applied; when None,
        mg.get_features_for_blocking(ltable, rtable) is used
    attrs_after : list, defaults to None
        List of attribute names from "s" to be included in output table
        after the feature vector

    Returns
    -------
    feature_vectors : MTable,
        Containing the two foreign key columns, the attributes mentioned
        in the input and the feature values (obtained by applying the
        feature fns in feature_table)

    Raises
    ------
    AssertionError : if the JVM is not started, or if "s" has no 'ltable'
        or 'rtable' property
    """
    # basic checks; raised explicitly (not via assert statements) so they
    # are still enforced under python -O
    if not isJVMStarted():
        raise AssertionError(
            'JVM should be started using init_jvm to compute features')
    ltable = s.get_property('ltable')
    rtable = s.get_property('rtable')
    if ltable is None:
        raise AssertionError('Left table is not set')
    if rtable is None:
        raise AssertionError('Right table is not set')

    if feature_table is None:
        feature_table = mg.get_features_for_blocking(ltable, rtable)

    l_key = s.get_property('foreign_key_ltable')
    r_key = s.get_property('foreign_key_rtable')
    logger = logging.getLogger(__name__)

    # pair up the two foreign key columns directly; iterrows builds one
    # Series per row and may upcast the key values to a common dtype
    start = time.time()
    id_list = list(zip(s[l_key], s[r_key]))
    end = time.time()
    logger.info('Iterating rows (%d) took %f secs', len(s), end - start)

    # compute feature values
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    # both frames are indexed by key, so the ids are looked up as labels
    start = time.time()
    feat_vals = [apply_feat_fns(l_df.loc[l_id], r_df.loc[r_id],
                                feature_table)
                 for l_id, r_id in id_list]
    end = time.time()
    logger.info('Applying feature functions took : %f secs', end - start)

    table = pd.DataFrame(feat_vals, index=s.index.values)

    # get the feature names and re-arrange columns in that order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # insert attrs_before; walking the list backwards while inserting at
    # position 0 keeps the given order, and reversed() leaves the
    # caller's list unmodified
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        for a in reversed(attrs_before):
            table.insert(0, a, s[a])
    table.insert(0, r_key, s[r_key])
    table.insert(0, l_key, s[l_key])

    # insert attrs after
    # NOTE(review): these columns end up in reverse of the given order
    # (existing behavior, kept as is) - confirm whether that is intended
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        for a in reversed(attrs_after):
            table.insert(len(table.columns), a, s[a])

    # reset the table index
    table.reset_index(inplace=True, drop=True)

    feature_vectors = MTable(table)
    if s.get_key() not in feature_vectors.columns:
        feature_vectors.add_key(s.get_key())

    # metadata
    feature_vectors._metadata = s._metadata
    feature_vectors.properties = s.properties
    return feature_vectors
def extract_feature_vecs(s,
                         attrs_before=None,
                         feature_table=None,
                         attrs_after=None):
    """
    Extract feature vectors

    Parameters
    ----------
    s : MTable,
        labeled virtual MTable or combined blocker output
    attrs_before : list, defaults to None
        List of attribute names from "s" to be included in output table
        before the feature vector
    feature_table : pandas DataFrame, defaults to None
        List of features to be applied; when None,
        mg.get_features_for_blocking(ltable, rtable) is used
    attrs_after : list, defaults to None
        List of attribute names from "s" to be included in output table
        after the feature vector

    Returns
    -------
    feature_vectors : MTable,
        Containing the two foreign key columns, the attributes mentioned
        in the input and the feature values (obtained by applying the
        feature fns in feature_table)

    Raises
    ------
    AssertionError : if the JVM is not started, or if "s" has no 'ltable'
        or 'rtable' property
    """
    # basic checks; raised explicitly (not via assert statements) so they
    # are still enforced under python -O
    if not isJVMStarted():
        raise AssertionError(
            'JVM should be started using init_jvm to compute features')
    ltable = s.get_property('ltable')
    rtable = s.get_property('rtable')
    if ltable is None:
        raise AssertionError('Left table is not set')
    if rtable is None:
        raise AssertionError('Right table is not set')

    if feature_table is None:
        feature_table = mg.get_features_for_blocking(ltable, rtable)

    l_key = s.get_property('foreign_key_ltable')
    r_key = s.get_property('foreign_key_rtable')
    logger = logging.getLogger(__name__)

    # pair up the two foreign key columns directly; iterrows builds one
    # Series per row and may upcast the key values to a common dtype
    start = time.time()
    id_list = list(zip(s[l_key], s[r_key]))
    end = time.time()
    logger.info('Iterating rows (%d) took %f secs', len(s), end - start)

    # compute feature values
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    # both frames are indexed by key, so the ids are looked up as labels
    start = time.time()
    feat_vals = [
        apply_feat_fns(l_df.loc[l_id], r_df.loc[r_id], feature_table)
        for l_id, r_id in id_list
    ]
    end = time.time()
    logger.info('Applying feature functions took : %f secs', end - start)

    table = pd.DataFrame(feat_vals, index=s.index.values)

    # get the feature names and re-arrange columns in that order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # insert attrs_before; walking the list backwards while inserting at
    # position 0 keeps the given order, and reversed() leaves the
    # caller's list unmodified
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        for a in reversed(attrs_before):
            table.insert(0, a, s[a])
    table.insert(0, r_key, s[r_key])
    table.insert(0, l_key, s[l_key])

    # insert attrs after
    # NOTE(review): these columns end up in reverse of the given order
    # (existing behavior, kept as is) - confirm whether that is intended
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        for a in reversed(attrs_after):
            table.insert(len(table.columns), a, s[a])

    # reset the table index
    table.reset_index(inplace=True, drop=True)

    feature_vectors = MTable(table)
    if s.get_key() not in feature_vectors.columns:
        feature_vectors.add_key(s.get_key())

    # metadata
    feature_vectors._metadata = s._metadata
    feature_vectors.properties = s.properties
    return feature_vectors