def get_features_for_matching(A, B):
    """
    Build a feature table for matching from just the two input tables.

    The tokenizers, similarity functions, attribute types and attribute
    correspondences that ``get_features`` needs are obtained by calling
    the corresponding ``mg`` helpers, so the caller supplies only the
    tables.

    Parameters
    ----------
    A, B : MTable
        Input tables.

    Returns
    -------
    feature_table : pandas DataFrame
        Consists of the following columns:

        * feature_name - string, feature name
        * left_attribute - string, attribute name
        * right_attribute - string, attribute name
        * left_attr_tokenizer - string, tokenizer name
        * right_attr_tokenizer - string, tokenizer name
        * simfunction - string, similarity function name
        * function - function object
        * function_source - string, containing source code

    Notes
    -----
    As a side effect, the inputs used to generate the features are stored
    as attributes on ``mg`` so that a user can examine them afterwards:
    ``_match_t`` (tokenizers), ``_match_s`` (similarity functions),
    ``_atypes1`` / ``_atypes2`` (attribute types of A / B) and
    ``_match_c`` (attribute correspondences).
    """
    sim_funs = mg.get_sim_funs()
    tokenizers = mg.get_single_arg_tokenizers()
    ltable_types = mg.get_attr_types(A)
    rtable_types = mg.get_attr_types(B)
    corres = mg.get_attr_corres(A, B)

    feature_table = get_features(
        A, B, ltable_types, rtable_types, corres, tokenizers, sim_funs
    )

    # Export only after feature generation has succeeded, so a failure in
    # get_features leaves the previously exported values untouched.
    exported = (
        ('_match_t', tokenizers),
        ('_match_s', sim_funs),
        ('_atypes1', ltable_types),
        ('_atypes2', rtable_types),
        ('_match_c', corres),
    )
    for attr_name, value in exported:
        setattr(mg, attr_name, value)

    return feature_table
import sys

# Ad-hoc demo / smoke-test script for the toy data set.
#
# NOTE(review): the path below is hard-coded and machine-specific. It is
# written as a raw string so the backslashes are taken literally instead of
# relying on unrecognized escape sequences (``\P``, ``\R``, ``\S``, ``\m``);
# the resulting string value is unchanged.
sys.path.append(r'C:\Pradap\Research\Python-work\Saranam\magellan')

# Load the two toy tables, using 'ID' as the key of each.
A = mg.read_csv('../magellan/data/toy/table_A.csv', key='ID')
B = mg.read_csv('../magellan/data/toy/table_B.csv', key='ID')

# Block A and B on the 'zipcode' attribute of both tables. The two lists are
# passed as the per-table attribute lists (presumably the attributes to
# carry into the output -- confirm against the blocker API).
blocker = mg.AttrEquivalenceBlocker()
C = blocker.block_tables(A, B, 'zipcode', 'zipcode',
                         ['name', 'address', 'hourly_wage'],
                         ['name', 'address', 'hourly_wage'])

# Combine the blocker output with itself, then draw a sample from the result
# (10 is presumably the sample size -- confirm against sample_one_table).
D = mg.block_union_combine([C, C])
S = mg.sample_one_table(D, 10)

# Disabled steps kept for reference:
#L = mg.label(S, 'gold_label')
#print(mg._m_global_tokenizers)
#print(mg._m_global_sim_fns)

# Print the pieces that feature generation is built from. Single-argument
# print(...) produces the same output on Python 2 and Python 3.
t = mg.get_single_arg_tokenizers()
print(t)
s = mg.get_sim_funs()
print(s)
corres = mg.get_attr_corres(A, B)
print(corres['corres'])
t_1 = mg.get_attr_types(A)
print(t_1)
t_2 = mg.get_attr_types(B)
print(t_2)