def validate_metadata_two_candsets(C, D):
    """Assert that candidate sets C and D agree on columns, key, and fk metadata.

    Column order is ignored (both sides are sorted before comparison).
    """
    assert_equal(sorted(C.columns), sorted(D.columns))
    assert_equal(em.get_key(D), em.get_key(C))
    # Both foreign-key properties must match between the two candidate sets.
    for prop in ('fk_ltable', 'fk_rtable'):
        assert_equal(em.get_property(D, prop), em.get_property(C, prop))
def validate_data(C, expected_ids=None):
    """Assert that C contains exactly the expected (ltable_id, rtable_id) pairs.

    When *expected_ids* is None or empty, C itself must be empty.
    """
    if not expected_ids:
        assert_equal(len(C), 0)
        return
    fk_l = em.get_property(C, 'fk_ltable')
    fk_r = em.get_property(C, 'fk_rtable')
    # Collect the id pairs present in C and compare in sorted order.
    id_pairs = C[[fk_l, fk_r]].set_index([fk_l, fk_r]).index.values.tolist()
    assert_equal(expected_ids, sorted(id_pairs))
def validate_metadata(C, l_output_attrs=None, r_output_attrs=None,
                      l_output_prefix='ltable_', r_output_prefix='rtable_',
                      l_key='ID', r_key='ID'):
    """Assert that candidate set C has the expected columns, key, and fk metadata.

    Expected columns are '_id', the two prefixed key columns, and any
    requested output attributes (prefixed, with the key itself excluded).
    """
    expected_cols = ['_id', l_output_prefix + l_key, r_output_prefix + r_key]
    # Left and right output attributes are handled identically, so loop once.
    for attrs, prefix, key in ((l_output_attrs, l_output_prefix, l_key),
                               (r_output_attrs, r_output_prefix, r_key)):
        if attrs:
            expected_cols.extend(prefix + a for a in attrs if a != key)
    assert_equal(sorted(expected_cols), sorted(C.columns))
    assert_equal(em.get_key(C), '_id')
    assert_equal(em.get_property(C, 'fk_ltable'), l_output_prefix + l_key)
    assert_equal(em.get_property(C, 'fk_rtable'), r_output_prefix + r_key)
import warnings

import numpy as np
import re

warnings.filterwarnings('ignore')

# NOTE(review): `em` (presumably py_entitymatching) is used below but not
# imported in this chunk — confirm it is imported earlier in the file.

# Load source tables A and B with 'ID' as the key column.
A = em.read_csv_metadata("/mnt/c/Users/sreya/Downloads/DS/bestbuy_music.csv", key="ID")
B = em.read_csv_metadata(
    "/mnt/c/Users/sreya/Downloads/DS/metacritic_music.csv", key="ID")

# (Re)declare the key attribute on each table and read it back.
em.set_key(A, 'ID')
em.set_key(B, 'ID')
em.get_property(A, 'key')

# Sampled candidate set (450 tuples) produced by the blocking step,
# with foreign keys linking back to A and B.
G = em.read_csv_metadata(
    "/mnt/c/Users/sreya/Downloads/DS/sampled_candidate_set.csv",
    key='_id', ltable=A, rtable=B,
    fk_ltable='ltable_ID', fk_rtable='rtable_ID')

# 70/30 train/test split, seeded for reproducibility.
IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
I = IJ['train']
J = IJ['test']