# In[22]:

# Label candidate set S; the label column is named gold_label
L = mg.label_table(S, 'gold_label')

# In[ ]:

# In[24]:

# Build the feature table automatically (internally it computes types,
# attr_corres, sim functions and tokenizers)
feat_table = mg.get_features_for_blocking(A, B)

# In[25]:

# Display the feature table
feat_table

# In[26]:

# Show which tokenizers were used to generate the features
mg._current_tokenizers

# In[27]:
def extract_feature_vecs(s, attrs_before=None, feature_table=None, attrs_after=None):
    """
    Extract feature vectors for every tuple pair in a candidate set.

    Parameters
    ----------
    s : MTable, labeled virtual MTable or combined blocker output
    attrs_before : list, defaults to None
        List of attribute names from "s" to be included in output table
        before the feature vector. A single non-list value is treated as a
        one-element list. The caller's list is not modified.
    feature_table : pandas DataFrame, defaults to None
        List of features to be applied (also see:
        mg.get_features_for_blocking). When None, it is generated from the
        left and right tables of "s".
    attrs_after : list, defaults to None
        List of attribute names from "s" to be included in output table
        after the feature vector. A single non-list value is treated as a
        one-element list. The caller's list is not modified.

    Returns
    -------
    feature_vectors : MTable,
        Containing features values (obtained by applying feature fns in
        feature_table) and attributes as mentioned in the input

    Raises
    ------
    AssertionError
        If the JVM has not been started, or if the left/right table
        property of "s" is not set.
    """
    logger = logging.getLogger(__name__)

    # basic checks -- raised explicitly (instead of ``assert``) so that they
    # are still enforced when Python runs with -O; the exception type is the
    # same one ``assert`` would have produced.
    if not isJVMStarted():
        raise AssertionError('JVM should be started using init_jvm to compute features')
    ltable = s.get_property('ltable')
    rtable = s.get_property('rtable')
    if ltable is None:
        raise AssertionError('Left table is not set')
    if rtable is None:
        raise AssertionError('Right table is not set')

    if feature_table is None:
        feature_table = mg.get_features_for_blocking(ltable, rtable)

    l_key = s.get_property('foreign_key_ltable')
    r_key = s.get_property('foreign_key_rtable')

    # Collect the (left id, right id) pairs straight from the two foreign-key
    # columns; this avoids building a Series per row as iterrows() does.
    start = time.time()
    id_list = list(zip(s[l_key], s[r_key]))
    end = time.time()
    logger.info('Iterating rows (%d) took %f secs', len(s), end - start)

    # compute feature values
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    # index both tables by their key (keeping the key column) so that tuples
    # can be looked up by id
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    start = time.time()
    # .loc is the label-based lookup; the deprecated .ix indexer was removed
    # from pandas
    feat_vals = [
        apply_feat_fns(l_df.loc[l_id], r_df.loc[r_id], feature_table)
        for l_id, r_id in id_list
    ]
    end = time.time()
    logger.info('Applying feature functions took : %f secs', end - start)

    table = pd.DataFrame(feat_vals, index=s.index.values)

    # get the feature names and re-arrange columns in that order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        # Each attribute is inserted at position 0, so walking the list
        # backwards leaves the columns in the order given. reversed() is used
        # so the caller's list is not reversed in place.
        for a in reversed(attrs_before):
            table.insert(0, a, s[a])
    # the two foreign keys always lead the output table
    table.insert(0, r_key, s[r_key])
    table.insert(0, l_key, s[l_key])

    # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        # NOTE(review): appending while walking the list backwards places the
        # columns in the reverse of the order given; kept as-is so existing
        # callers see the same column layout -- confirm whether intended.
        for a in reversed(attrs_after):
            table.insert(len(table.columns), a, s[a])

    # reset the table index
    table.reset_index(inplace=True, drop=True)

    feature_vectors = MTable(table)
    if s.get_key() not in feature_vectors.columns:
        feature_vectors.add_key(s.get_key())

    # metadata (assigned by reference from "s", not copied)
    feature_vectors._metadata = s._metadata
    feature_vectors.properties = s.properties
    return feature_vectors
# In[21]:

# Draw a sample of 13 from candidate set F
S = mg.sample_table(F, 13)

# In[22]:

# Label candidate set S; the label column is named gold_label
L = mg.label_table(S, 'gold_label')

# In[ ]:

# In[24]:

# Build the feature table automatically (internally it computes types,
# attr_corres, sim functions and tokenizers)
feat_table = mg.get_features_for_blocking(A, B)

# In[25]:

# Display the feature table
feat_table

# In[26]:

# Show which tokenizers were used to generate the features
mg._current_tokenizers

# In[27]:

# Show which sim functions were used to generate the features
mg._current_sim_funs
# coding=utf-8
import magellan as mg

# Load datasets table_A and table_B, both with key='ID'
A = mg.load_dataset('table_A', key='ID')
B = mg.load_dataset('table_B', key='ID')

# Generate the blocking feature table for the two tables and print it
F = mg.get_features_for_blocking(A, B)
print(F)
def extract_feature_vecs(s, attrs_before=None, feature_table=None, attrs_after=None):
    """
    Extract feature vectors for every tuple pair in a candidate set.

    Parameters
    ----------
    s : MTable, labeled virtual MTable or combined blocker output
    attrs_before : list, defaults to None
        List of attribute names from "s" to be included in output table
        before the feature vector. A single non-list value is treated as a
        one-element list. The caller's list is not modified.
    feature_table : pandas DataFrame, defaults to None
        List of features to be applied (also see:
        mg.get_features_for_blocking). When None, it is generated from the
        left and right tables of "s".
    attrs_after : list, defaults to None
        List of attribute names from "s" to be included in output table
        after the feature vector. A single non-list value is treated as a
        one-element list. The caller's list is not modified.

    Returns
    -------
    feature_vectors : MTable,
        Containing features values (obtained by applying feature fns in
        feature_table) and attributes as mentioned in the input

    Raises
    ------
    AssertionError
        If the JVM has not been started, or if the left/right table
        property of "s" is not set.
    """
    logger = logging.getLogger(__name__)

    # basic checks -- raised explicitly (instead of ``assert``) so that they
    # are still enforced when Python runs with -O; the exception type is the
    # same one ``assert`` would have produced.
    if not isJVMStarted():
        raise AssertionError('JVM should be started using init_jvm to compute features')
    ltable = s.get_property('ltable')
    rtable = s.get_property('rtable')
    if ltable is None:
        raise AssertionError('Left table is not set')
    if rtable is None:
        raise AssertionError('Right table is not set')

    if feature_table is None:
        feature_table = mg.get_features_for_blocking(ltable, rtable)

    l_key = s.get_property('foreign_key_ltable')
    r_key = s.get_property('foreign_key_rtable')

    # Collect the (left id, right id) pairs straight from the two foreign-key
    # columns; this avoids building a Series per row as iterrows() does.
    start = time.time()
    id_list = list(zip(s[l_key], s[r_key]))
    end = time.time()
    logger.info('Iterating rows (%d) took %f secs', len(s), end - start)

    # compute feature values
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    # index both tables by their key (keeping the key column) so that tuples
    # can be looked up by id
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    start = time.time()
    # .loc is the label-based lookup; the deprecated .ix indexer was removed
    # from pandas
    feat_vals = [
        apply_feat_fns(l_df.loc[l_id], r_df.loc[r_id], feature_table)
        for l_id, r_id in id_list
    ]
    end = time.time()
    logger.info('Applying feature functions took : %f secs', end - start)

    table = pd.DataFrame(feat_vals, index=s.index.values)

    # get the feature names and re-arrange columns in that order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]

    # insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        # Each attribute is inserted at position 0, so walking the list
        # backwards leaves the columns in the order given. reversed() is used
        # so the caller's list is not reversed in place.
        for a in reversed(attrs_before):
            table.insert(0, a, s[a])
    # the two foreign keys always lead the output table
    table.insert(0, r_key, s[r_key])
    table.insert(0, l_key, s[l_key])

    # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        # NOTE(review): appending while walking the list backwards places the
        # columns in the reverse of the order given; kept as-is so existing
        # callers see the same column layout -- confirm whether intended.
        for a in reversed(attrs_after):
            table.insert(len(table.columns), a, s[a])

    # reset the table index
    table.reset_index(inplace=True, drop=True)

    feature_vectors = MTable(table)
    if s.get_key() not in feature_vectors.columns:
        feature_vectors.add_key(s.get_key())

    # metadata (assigned by reference from "s", not copied)
    feature_vectors._metadata = s._metadata
    feature_vectors.properties = s.properties
    return feature_vectors