Exemplo n.º 1
0
def main():
    """Run a small IMDB/TMDB entity-matching experiment end to end.

    The two movie tables are blocked on equal ``directors`` values, 100
    candidate pairs are sampled and labelled through ``em.label_table``,
    and the labelled pairs are split 50/50.  A depth-5 decision tree is
    trained on one half and used to predict the other half; the sampled
    pairs, both feature tables, the prediction table and the evaluation
    summary are printed along the way.
    """
    label_col = 'gold_labels'
    title_cols = ['ltable_title', 'rtable_title']
    # Columns that are carried along in the feature tables but must never
    # be fed to the classifier as features.
    non_feature_cols = ['_id', 'ltable_id', 'rtable_id'
                        ] + title_cols + [label_col]

    imdb = em.read_csv_metadata('../Data/A_imdb.csv', key='id')
    tmdb = em.read_csv_metadata('../Data/B_tmdb.csv', key='id')

    blocker = em.AttrEquivalenceBlocker()
    kept_columns = ['title', 'directors', 'release_year', 'languages']
    candidates = blocker.block_tables(imdb,
                                      tmdb,
                                      'directors',
                                      'directors',
                                      l_output_attrs=kept_columns,
                                      r_output_attrs=kept_columns)

    # Sample 100 candidate pairs and label them.
    sampled = em.sample_table(candidates, 100)
    print(sampled)
    labelled = em.label_table(sampled, label_column_name='gold_labels')
    halves = em.split_train_test(labelled, train_proportion=0.5)

    match_features = em.get_features_for_matching(imdb, tmdb)

    def featurize(pairs):
        # Build the feature vectors for `pairs`, replace missing feature
        # values with 0 and print the resulting table.
        vectors = em.extract_feature_vecs(pairs,
                                          attrs_before=list(title_cols),
                                          feature_table=match_features,
                                          attrs_after=[label_col])
        vectors.fillna(value=0, inplace=True)
        print(vectors)
        return vectors

    # Train on the first half.
    train_vectors = featurize(halves['train'])
    tree = em.DTMatcher(max_depth=5)
    tree.fit(table=train_vectors,
             exclude_attrs=list(non_feature_cols),
             target_attr=label_col)

    # Predict the second half; predictions and probabilities are appended
    # to the feature table in place.
    test_vectors = featurize(halves['test'])
    scored = tree.predict(table=test_vectors,
                          exclude_attrs=list(non_feature_cols),
                          target_attr='predicted_labels',
                          return_probs=True,
                          probs_attr='proba',
                          append=True,
                          inplace=True)
    print(scored)

    summary = em.eval_matches(scored, 'gold_labels', 'predicted_labels')
    em.print_eval_summary(summary)
Exemplo n.º 2
0
bb.set_black_box_function(city_first_letter_match)
C = bb.block_candset(C)
len(C)

bb.set_black_box_function(first_address_number_match)
C = bb.block_candset(C)
len(C)

bb.set_black_box_function(first_phone_number_match)
C = bb.block_candset(C)
len(C)
"""

# Sample 50 pairs from the candidate set C and label them in a 'label'
# column via em.label_table.
print("\n- Sampling and Labeling...")
S = em.sample_table(C, 50)

G = em.label_table(S, 'label')

# 70/30 train/test split of the labelled pairs; random_state=0 keeps the
# split reproducible.
IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
I = IJ['train']
J = IJ['test']

# "ltable_"/"rtable_" prefixed key names; pk_A and pk_B are defined outside
# this excerpt (presumably the key columns of tables A and B -- confirm).
ltable_pk = "ltable_" + pk_A
rtable_pk = "rtable_" + pk_B

# One instance of each matcher type. All but LinRegMatcher are created with
# random_state=0.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
Exemplo n.º 3
0
                    ])
# Block rule 2: keep only pairs from C whose 'author' values share at least
# two word tokens (overlap_size=2), with stop words removed before comparing.
D = ob.block_candset(C,
                     'author',
                     'author',
                     rem_stop_words=True,
                     overlap_size=2)
# Debug the blocker output D against the input tables A and B
# (output_size=200), then preview the result.
dbg = em.debug_blocker(D, A, B, output_size=200)
dbg.head()

# Save Table D
#em.to_csv_metadata(D, './TableD.csv')

# Sample candidate set of size 300
# NOTE(review): S is not used in the visible lines below -- the labels are
# read from './labeled.csv' instead of being produced from S.
S = em.sample_table(D, 300)
# Gold labels: interactive labelling is disabled; load the already labelled
# pairs and attach them to A and B through the ltable_ID / rtable_ID keys.
#G = em.label_table(S, label_column_name = 'gold_labels')
G = em.read_csv_metadata('./labeled.csv',
                         key='_id',
                         fk_ltable='ltable_ID',
                         fk_rtable='rtable_ID',
                         ltable=A,
                         rtable=B)
# Split training set and test set (50/50)
train_test = em.split_train_test(G, train_proportion=0.5)
I = train_test['train']
# Overwrite the edition and pages columns of the training split with empty
# strings.
I['ltable_edition'] = ''
I['rtable_edition'] = ''
I['ltable_pages'] = ''
I['rtable_pages'] = ''
# Do not strip stop words before computing the token overlap.
use_stop_words = False

# Narrow the existing candidate set C with an overlap blocker on the same
# column of both tables. blocking_col, q_gram_value, use_world_level,
# overlap_size and add_missing_val are defined outside this excerpt.
# n_jobs=-1 is passed for parallel execution.
C = ob.block_candset(C,
                     l_overlap_attr=blocking_col,
                     r_overlap_attr=blocking_col,
                     rem_stop_words=use_stop_words,
                     q_val=q_gram_value,
                     word_level=use_world_level,
                     overlap_size=overlap_size,
                     allow_missing=add_missing_val,
                     n_jobs=-1,
                     show_progress=False)
len(C)

# Sample 100 pairs from C and label them in a 'label' column via
# em.label_table.
print("\n- Sampling and Labeling...")
S = em.sample_table(C, 100)

G = em.label_table(S, 'label')

# 70/30 train/test split of the labelled pairs; random_state=0 keeps the
# split reproducible.
IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
I = IJ['train']
J = IJ['test']

# "ltable_"/"rtable_" prefixed key names; pk_A and pk_B are defined outside
# this excerpt (presumably the key columns of tables A and B -- confirm).
ltable_pk = "ltable_" + pk_A
rtable_pk = "rtable_" + pk_B

# One instance of each matcher type. All but LinRegMatcher are created with
# random_state=0.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
Exemplo n.º 5
0
def main():
    """Train a random-forest matcher on product pairs and export predictions.

    Pipeline:

    1. Read ``ltable.csv`` and ``rtable.csv`` (ISO-8859-1 encoded).
    2. Block on overlapping ``title`` tokens to get the candidate set.
    3. Sample 450 candidate pairs and label them via ``em.label_table``.
    4. Train ``em.RFMatcher`` on the feature vectors of the labelled pairs.
    5. Predict every candidate pair and write ``./prediction2.csv`` with
       the columns ``id`` (candidate-pair ``_id``) and ``label`` (the
       predicted value).
    """
    A = em.read_csv_metadata('ltable.csv',
                             key="ltable_id",
                             encoding='ISO-8859-1')
    B = em.read_csv_metadata('rtable.csv',
                             key="rtable_id",
                             encoding='ISO-8859-1')

    output_attrs = ['title', 'category', 'brand', 'modelno', 'price']
    ob = em.OverlapBlocker()
    C = ob.block_tables(A,
                        B,
                        'title',
                        'title',
                        l_output_attrs=output_attrs,
                        r_output_attrs=output_attrs,
                        overlap_size=1,
                        show_progress=False)
    S = em.sample_table(C, 450)

    # NOTE(review): this table read from train.csv is replaced a few lines
    # below by the output of em.label_table and is never used for training.
    # The read is kept so the function's observable steps are unchanged --
    # confirm which labelled set is actually intended.
    G = em.read_csv_metadata("train.csv",
                             key='id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='ltable_id',
                             fk_rtable='rtable_id')
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    G = em.label_table(S, 'label')

    # Raw attribute columns carried along with the feature vectors; they
    # are excluded from the classifier inputs.
    attrs_from_table = ['ltable_' + attr for attr in output_attrs
                        ] + ['rtable_' + attr for attr in output_attrs]
    # Key / foreign-key columns of the candidate set.
    key_attrs = ['_id', 'ltable_ltable_id', 'rtable_rtable_id']

    H = em.extract_feature_vecs(G,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                attrs_after=['label'],
                                show_progress=False)
    # Missing feature values become numeric 0 (previously the string '0',
    # which turned the affected columns into object dtype).
    H.fillna(0, inplace=True)

    rf = em.RFMatcher()
    rf.fit(table=H,
           exclude_attrs=key_attrs + ['label'] + attrs_from_table,
           target_attr='label')

    L = em.extract_feature_vecs(C,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                show_progress=False,
                                n_jobs=-1)
    # Apply the same missing-value handling as at training time so the
    # classifier is not handed NaN features at prediction time.
    L.fillna(0, inplace=True)

    predictions = rf.predict(table=L,
                             exclude_attrs=key_attrs + attrs_from_table,
                             append=True,
                             target_attr='predicted',
                             inplace=False)

    # The predicted values live in the 'predicted' column (target_attr
    # above); L has no 'label' column, and a DataFrame has no column 0, so
    # the former G[0]['id'] / predictions['label'] lookups raised KeyError.
    # NOTE(review): '_id' (candidate-pair id) is used as the exported id --
    # confirm this is the id the consumer of prediction2.csv expects.
    dataset = pd.DataFrame({
        "id": predictions['_id'],
        'label': predictions['predicted']
    })
    dataset.to_csv("./prediction2.csv", index=False)
Exemplo n.º 6
0
# Print the size of B and the size of the full cross product A x B.
print('Number of tuples in B: ' + str(len(B)))
print('Number of tuples in A X B (i.e the cartesian product): ' +
      str(len(A) * len(B)))

# Overlap blocking on 'name' with overlap_size=1. The name, addr, city and
# phone columns of both tables are carried into the candidate set.
ob = em.OverlapBlocker()
C = ob.block_tables(A,
                    B,
                    'name',
                    'name',
                    l_output_attrs=['name', 'addr', 'city', 'phone'],
                    r_output_attrs=['name', 'addr', 'city', 'phone'],
                    overlap_size=1,
                    show_progress=False)
len(C)

# Sample 450 candidate pairs.
S = em.sample_table(C, 450)

#G = em.label_table(S, 'gold') # This step raises an error

# Interactive labelling is disabled above; load the labelled restaurant
# pairs from the datasets folder under em.get_install_path() instead.
# NOTE(review): the path mixes os.sep with a hard-coded '/' in its last
# component.
path_G = em.get_install_path(
) + os.sep + 'datasets' + os.sep + 'end-to-end' + os.sep + 'restaurants/lbl_restnt_wf1.csv'
G = em.read_csv_metadata(path_G,
                         key='_id',
                         ltable=A,
                         rtable=B,
                         fk_ltable='ltable_id',
                         fk_rtable='rtable_id')
len(G)

# 70/30 train/test split; random_state=0 keeps it reproducible.
IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
I = IJ['train']
Exemplo n.º 7
0
# Merge the candidate sets C2 and C3 (both defined outside this excerpt)
# into a single candidate set via union.
D = em.combine_blocker_outputs_via_union([C2, C3])

# C3 = C1
# Use block_tables to apply blocking over two input tables.

# corres = [('title', 'title'), ('year', 'year')]
# sample_movies3 = em.read_csv_metadata('datasets/tmp_movies.csv')
# sample_tracks3 = em.read_csv_metadata('datasets/tmp_tracks.csv')
# D = em.debug_blocker(C3, sample_movies3, sample_tracks3, attr_corres=corres)

# Write the blocked table to disk and read it straight back, attaching
# sample_movies / sample_tracks as its left and right tables.
em.to_csv_metadata(D, 'datasets/tbl_blocked_8.csv')
tbl_blocked = em.read_csv_metadata('datasets/tbl_blocked_8.csv',\
 ltable=sample_movies, rtable=sample_tracks)

# Sample 400 pairs and save the sample.
S = em.sample_table(tbl_blocked, 400)
em.to_csv_metadata(S, 'datasets/sampled_8.csv')

# Write the last four fields of every sampled row to metadata_8.csv.
# NOTE(review): opening in 'wb' for csv.writer is the Python 2 convention;
# under Python 3 csv.writer needs a text-mode file
# (open(..., 'w', newline='')) -- confirm the target interpreter.
with open('metadata_8.csv', 'wb') as data1:
    writer = csv.writer(data1, delimiter=',', quotechar='|')
    for entry in S.values:
        # `l` is not used in the visible lines.
        l = len(entry)
        item = entry[-4:]
        writer.writerow(item)

# Generate matching features for the two tables and compute the feature
# vectors of the sampled pairs.
match_f = em.get_features_for_matching(sample_movies, sample_tracks)
H = em.extract_feature_vecs(S, feature_table=match_f)
with open('data_8.csv', 'wb') as data:
    writer = csv.writer(data, delimiter=',', quotechar='|')
    flag = 0
    names = []
Exemplo n.º 8
0
# Rule based blocker after D
# Generate the blocking features for A and B without the interactive
# validation of inferred attribute types.
block_f = em.get_features_for_blocking(A,
                                       B,
                                       validate_inferred_attr_types=False)
rb = em.RuleBasedBlocker()
# print(block_f)
# Blocking rule: a pair is dropped from the candidate set when the
# Levenshtein similarity of the two Title values is below 0.4.
rb.add_rule(['Title_Title_lev_sim(ltuple, rtuple) < 0.4'], block_f)
C = rb.block_candset(D, show_progress=False)
print("C Set Size: ", len(C))

# debug blocker: inspect C against A and B (output_size=200) and preview
# the result
E = em.debug_blocker(C, A, B, output_size=200)
E.head()
# print(len(E))
# Sample 300 pairs from C.
S = em.sample_table(C, 300)
# S.to_csv('S.csv')

# G = em.label_table(S, 'label')


def f(row):
    """Return 1 when both titles of ``row`` are equal ignoring case, else 0.

    ``row`` must support lookup by the keys ``'ltable_Title'`` and
    ``'rtable_Title'``; both values are converted with ``str`` before the
    lower-cased comparison.
    """
    left_title = str(row['ltable_Title']).lower()
    right_title = str(row['rtable_Title']).lower()
    return 1 if left_title == right_title else 0


# Label a copy of the sample automatically: 'label' holds the value f
# returns for each row.
G = S.copy()
G['label'] = S.apply(f, axis=1)
# NOTE(review): F is not defined in the visible lines above.
F.shape


# In[255]:


F.head()


# Taking a sample of 600 tuples from the output, and then we label this sample manually.

# In[16]:


# Write the 600-pair sample to Sample.csv in cp1252 encoding.
S = em.sample_table(F, 600)
S.to_csv('Sample.csv',encoding = 'cp1252')


# ## Reading the Labelled Sample##
# Loading the labeled data table, which is present in a file called 'Labelled_Sample_v2.csv'

# In[256]:


# The labelled file is read with the same cp1252 encoding and attached to
# A and B through the ltable_Id / rtable_Id columns.
L = em.read_csv_metadata("Labelled_Sample_v2.csv", key='_id', encoding = 'cp1252',                         ltable=A, rtable=B,fk_ltable='ltable_Id', fk_rtable='rtable_Id')


# Deleting *Phone* attribute again, because it can help determine matches trivially.

# In[14]:
Exemplo n.º 10
0
                                      'Directors',
                                      word_level=True,
                                      overlap_size=1,
                                      show_progress=False)

# Creating instance for attribute blocker
ab = em.AttrEquivalenceBlocker()

# Narrow the previous blocker's candidate set on the 'Year' attribute of
# both tables
attribute_candidate = ab.block_candset(overlap_candidate1,
                                       'Year',
                                       'Year',
                                       show_progress=False)

# Randomly sampling 500 records from the candidate tuple pairs
sample = em.sample_table(attribute_candidate, 500)

# Storing this sample for labelling.
sample.to_csv("Sampleset.csv")

# Copy the CSV row by row into a cleaned file; bytes that are not valid
# UTF-8 are dropped while reading (errors='ignore').
# Both files are opened with newline='' as the csv module requires:
# without it the writer emits an extra '\r' on platforms that use '\r\n',
# and newlines embedded in quoted fields are not read back correctly.
# NOTE(review): the output file still uses the platform default encoding.
with open('./Sampleset.csv', 'r', encoding='utf-8', errors='ignore',
          newline='') as infile, open('./Sampleset1.csv', 'w',
                                      newline='') as outfile:
    inputs = csv.reader(infile)
    outputs = csv.writer(outfile)
    outputs.writerows(inputs)
# Read in the labelled dataset
G = em.read_csv_metadata("./Sampleset1.csv",
Exemplo n.º 11
0
#
# * Sampling and labeling the candidate set
# * Splitting the labeled data into development and evaluation set
# * Selecting the best learning based matcher using the development set
# * Evaluating the selected matcher using the evaluation set
#
#

# <h1> Sampling and labeling the candidate set

# First, we randomly sample 350 tuple pairs for labeling purposes.

# In[105]:

##Sample candidate set
S = em.sample_table(C4, 350)

# In[23]:

##Label S (the labels go into the 'gold_labels' column)
G = em.label_table(S, 'gold_labels')

# Load labeled data from previous session
# NOTE(review): this rebinds G, replacing the table produced by
# em.label_table in the cell above.

# In[105]:

G = em.load_object('./GoldenData.pkl')
len(G)

# In[106]:
Exemplo n.º 12
0
def main():
    # WELCOME TO MY MAGELLAN RUN SCRIPT
    print("\n-------------WELCOME TO MY MAGELLAN RUN SCRIPT-------------\n")

    # Get the datasets directory
    datasets_dir = 'B:\McMaster\CAS 764 - Advance Topics in Data Management\Project\Data\\'
    print("- Dataset directory: " + datasets_dir)
    print("- List of folders/files: ")
    print(os.listdir(datasets_dir))
    print("- Please enter new dataset folder name:")
    datasets_dir += input()
    print("- Dataset directory set to: " + datasets_dir)

    dateset_dir_files = os.listdir(datasets_dir)
    print("- List of files in dataset folder: ")
    print(dateset_dir_files)

    # Get the path of the input table A
    print("- Enter an index for Table A file (0-x):")
    file_index_A = input()
    filename_A = dateset_dir_files[int(file_index_A)]
    print("Table A file set to: " + filename_A)

    # Get the path of the input table
    path_A = datasets_dir + os.sep + filename_A

    # Get the path of the input table B
    print("- Enter an index for Table B file (0-x):")
    file_index_B = input()
    filename_B = dateset_dir_files[int(file_index_B)]
    print("Table B file set to: " + filename_B)

    # Get the path of the input table
    path_B = datasets_dir + os.sep + filename_B

    # Print Table A column names
    A = em.read_csv_metadata(path_A)
    print("- List of columns of Table A: ")
    print(list(A.columns))
    # Get the Table A id/primary key column name
    print('- Enter Table A primary key column index (ex. 0):')
    pk_A_index = input()
    pk_A = A.columns[int(pk_A_index)]

    # Print Table B column names
    B = em.read_csv_metadata(path_B)
    print("- List of columns of Table B: ")
    print(list(B.columns))
    # Get the Table B id/primary key column name
    print('- Enter Table B primary key column index (ex. 0):')
    pk_B_index = input()
    pk_B = A.columns[int(pk_A_index)]

    # READING TABLES AND SETTING METADATA
    print("\n-------------READING TABLES AND SETTING METADATA-------------\n")

    # Both read csv and set metadata id as ID column
    #A = em.read_csv_metadata(path_A, key=pk_A)
    #B = em.read_csv_metadata(path_B, key=pk_B)
    em.set_key(A, pk_A)
    em.set_key(B, pk_B)

    # Number of tables
    print('- Number of tuples in A: ' + str(len(A)))
    print('- Number of tuples in B: ' + str(len(B)))
    print('- Number of tuples in A X B (i.e the cartesian product): ' +
          str(len(A) * len(B)))

    # Print first 5 tuples of tables
    print(A.head())
    print(B.head())

    # Display the keys of the input tables
    print("- Table A primary key: " + em.get_key(A))
    print("- Table B primary key: " + em.get_key(B))

    # DOWNSAMPLING
    print("\n-------------DOWNSAMPING-------------\n")

    print("- Do you want to use downsampling? (y or n):")
    print("- Table A: " + str(len(A)) + ", Table B: " + str(len(B)))
    print("- NOTE: Recommended if both tables have 100K+ tuples.")
    is_downsample = input()
    if (is_downsample == 'y'):
        print("- Size of the downsampled tables (ex. 200):")
        downsample_size = input()
        # If the tables are large we can downsample the tables like this
        A1, B1 = em.down_sample(A, B, downsample_size, 1, show_progress=False)
        print("- Length of Table A1" + len(A1))
        print("- Length of Table B1" + len(B1))

    # BLOCKING
    print("\n-------------BLOCKING-------------\n")

    print("- Do you want to use blocking? (y or n):")
    is_blocking = input()
    if (is_blocking == 'y'):

        # Check if the 2 tables column names are the same
        if (list(A.columns) == list(B.columns)):
            C_attr_eq = []  # Attr Equ blocker result list
            C_overlap = []  # Overlap blocker result list
            C_blackbox = []  # BlackBox blocker result list

            # Left and right table attribute prefixes
            l_prefix = "ltable_"
            r_prefix = "rtable_"

            print("\n- List of columns: ")
            print(list(A.columns))
            # Labeling output table column selection
            print(
                "\n- Enter the indexes of columns that you want to see in labeling table (0-"
                + str(len(A.columns) - 1) + "):")
            out_attr = []
            for i in range(1, len(A.columns)):
                print("- Finish with empty character(enter+enter) " + str(i))
                add_to_attr = input()
                if (add_to_attr == ''):
                    break
                # Get indexes from user and add columns into out_attr list
                out_attr.append(A.columns[int(add_to_attr)])

            # Print output attributes
            print(out_attr)

            # Loop for adding/combining new blockers
            while (True):
                # Blocker selection
                print(
                    "\n- Do yo want to use Attribute Equivalence[ab] (same), Overlap[ob] (similar) or Blackbox[bb] blocker:"
                )
                blocker_selection = input()

                # ----- Attribute Equivalence Blocker -----
                if (blocker_selection == 'ab'):
                    # Create attribute equivalence blocker
                    ab = em.AttrEquivalenceBlocker()
                    # Counter for indexes
                    attr_eq_counter = 0
                    # Check if Overlap Blocker used before
                    if (C_overlap and not C_overlap[-1].empty):
                        print(
                            "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_attr_eq.append(
                                C_overlap[-1])  # Add last output of ob
                            attr_eq_counter += 1  # For skipping block_table function in first time

                    # Check if BlackBox Blocker used before
                    if (C_blackbox and not C_blackbox[-1].empty):
                        print(
                            "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_attr_eq.append(
                                C_blackbox[-1])  # Add last output of ob
                            attr_eq_counter += 1  # For skipping block_table function in first time

                    # Loop for adding more columns/attributes into Attr Equ blocker
                    while (True):
                        # List column names
                        print("\n- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print(
                            "\n- Which column (w/ index) to use for equivalence blocking? (ex. 1):"
                        )
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]

                        print(
                            "\n- Do you want to add missing values into blocking? (y or n):"
                        )
                        add_missing_val = input()
                        if (add_missing_val == 'y'):
                            add_missing_val = True
                        else:
                            add_missing_val = False

                        # First time using Attr Equ blocker, use A and B
                        if (attr_eq_counter == 0):
                            # Block using selected (blocking_col) attribute on A and B
                            C_attr_eq.append(
                                ab.block_tables(A,
                                                B,
                                                blocking_col,
                                                blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block using selected (blocking_col) attribute on previous (last=-1) candidate set
                            C_attr_eq.append(
                                ab.block_candset(C_attr_eq[-1],
                                                 l_block_attr=blocking_col,
                                                 r_block_attr=blocking_col,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print(
                            "\n- Attribute Equivalence Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_attr_eq[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        attr_eq_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_attr_eq[-1])))
                        print(
                            "- Add another column into Attribute Equivalence Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        ab_next_operation = input()
                        if (not ab_next_operation.islower()):
                            ab_next_operation = ab_next_operation.lower(
                            )  # Lower case
                        # Continue using Attribute Equivalence Blocker
                        if (ab_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (ab_next_operation == 'r'):
                            del C_attr_eq[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use Attribute Equivalence Blocker (y or n):"
                            )
                            ab_next_operation = input()
                            if (ab_next_operation == 'n'):
                                break
                        # Finish Attribute Equivalence Blocker
                        else:
                            break

                # ----- Overlap Blocker -----
                elif (blocker_selection == 'ob'):
                    # Create attribute equivalence blocker
                    ob = em.OverlapBlocker()
                    # Counter for indexes
                    overlap_counter = 0
                    # Check if Attribute Equivalence Blocker used before
                    if (C_attr_eq and not C_attr_eq[-1].empty):
                        print(
                            "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_overlap.append(
                                C_attr_eq[-1])  # Add last output of ab
                            overlap_counter += 1  # For skipping block_table function in first time

                    # Check if BlackBox Blocker used before
                    if (C_blackbox and not C_blackbox[-1].empty):
                        print(
                            "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_overlap.append(
                                C_blackbox[-1])  # Add last output of ob
                            overlap_counter += 1  # For skipping block_table function in first time

                    # Loop for adding more columns/attributes into Overlap blocker
                    while (True):
                        # List column names
                        print("- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print(
                            "- Which column (w/ index) to use for overlap blocking? (ex. 1):"
                        )
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]

                        print(
                            "\n- Do you want to add missing values into blocking? (y or n):"
                        )
                        add_missing_val = input()
                        if (add_missing_val == 'y'):
                            add_missing_val = True
                        else:
                            add_missing_val = False

                        print("\n- Use words as a token? (y or n):")
                        use_world_level = input()
                        if (use_world_level == 'y'):
                            use_world_level = True
                            q_gram_value = None
                        else:
                            use_world_level = False
                            print(
                                "\n- Q-gram q value (ex. 2 --> JO HN SM IT H):"
                            )
                            q_gram_value = input()
                            q_gram_value = int(q_gram_value)

                        print(
                            "\n- Enter the overlap size (# of tokens that overlap):"
                        )
                        overlap_size = input()
                        overlap_size = int(overlap_size)

                        print(
                            "\n- Do you want to remove (a, an, the) from token set? (y or n):"
                        )
                        use_stop_words = input()
                        if (use_stop_words == 'y'):
                            use_stop_words = True
                        else:
                            use_stop_words = False

                        # First time using Overlap blocker, use A and B
                        if (overlap_counter == 0):
                            # Block using selected (blocking_col) attribute on A and B
                            C_overlap.append(
                                ob.block_tables(A,
                                                B,
                                                blocking_col,
                                                blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                rem_stop_words=use_stop_words,
                                                q_val=q_gram_value,
                                                word_level=use_world_level,
                                                overlap_size=overlap_size,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block using selected (blocking_col) attribute on previous (last=-1) candidate set
                            C_overlap.append(
                                ob.block_candset(C_overlap[-1],
                                                 l_overlap_attr=blocking_col,
                                                 r_overlap_attr=blocking_col,
                                                 rem_stop_words=use_stop_words,
                                                 q_val=q_gram_value,
                                                 word_level=use_world_level,
                                                 overlap_size=overlap_size,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print("\n- Overlap Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_overlap[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        overlap_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_overlap[-1])))
                        print(
                            "- Add another column into Overlap Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        ob_next_operation = input()
                        if (not ob_next_operation.islower()):
                            ob_next_operation = ob_next_operation.lower(
                            )  # Lower case
                        # Continue using Overlap Blocker
                        if (ob_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (ob_next_operation == 'r'):
                            del C_overlap[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use Overlap Blocker (y or n):")
                            ob_next_operation = input()
                            if (ob_next_operation == 'n'):
                                break
                        # Finish Overlap Blocker
                        else:
                            break

                # ----- BlackBox Blocker -----
                elif (blocker_selection == 'bb'):
                    # Create black box blocker
                    bb = em.BlackBoxBlocker()
                    # Counter for indexes
                    blackbox_counter = 0
                    # Check if Attribute Equivalence Blocker used before
                    if (C_attr_eq and not C_attr_eq[-1].empty):
                        print(
                            "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_blackbox.append(
                                C_attr_eq[-1])  # Add last output of ob
                            blackbox_counter += 1  # For skipping block_table function in first time

                    # Check if Overlap Blocker used before
                    if (C_overlap and not C_overlap[-1].empty):
                        print(
                            "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_blackbox.append(
                                C_overlap[-1])  # Add last output of ob
                            blackbox_counter += 1  # For skipping block_table function in first time

                    # Loop for adding more columns/attributes into BlackBox blocker
                    while (True):
                        # Set function
                        bb.set_black_box_function(
                            number_10_percent_comparision)

                        # First time using Overlap blocker, use A and B
                        if (overlap_counter == 0):
                            # Block on A and B
                            C_blackbox.append(
                                bb.block_tables(A,
                                                B,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                n_jobs=-1,
                                                show_progress=False))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block on previous (last=-1) candidate set
                            C_blackbox.append(
                                bb.block_candset(C_blackbox[-1],
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print("\n- BlackBox Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_blackbox[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        blackbox_counter += 1  # Increase the counter

                        # Continue to use BlackBox Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_blackbox[-1])))
                        print(
                            "- Add another column into BlackBox Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        bb_next_operation = input()
                        if (not bb_next_operation.islower()):
                            bb_next_operation = bb_next_operation.lower(
                            )  # Lower case
                        # Continue using BlackBox Blocker
                        if (bb_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (bb_next_operation == 'r'):
                            del C_blackbox[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use BlackBox Blocker (y or n):")
                            bb_next_operation = input()
                            if (bb_next_operation == 'n'):
                                break
                        # Finish BlackBox Blocker
                        else:
                            break

                print("\n- Do you want to add/use another blocker? (y or n):")
                blocker_decision = input()
                if (blocker_decision == 'n'):
                    break

            print(
                "\n- Which blocker output you want to use? (Attr Equ-ab, Overlap-ob, BlackBox-bb, Union-un)"
            )
            blocker_output_selection = input()
            # Attribute Equ
            if (blocker_output_selection == "ab"):
                C = C_attr_eq[-1]
            # Overlap
            elif (blocker_output_selection == "ob"):
                C = C_overlap[-1]
            # BlackBox
            elif (blocker_output_selection == "bb"):
                C = C_blackbox[-1]
            # Union of blockers
            elif (blocker_output_selection == "un"):
                # Combine/union blockers candidate sets
                print("\n- TODO: Unions Attr Equ and Overlap only!")
                if (C_attr_eq and C_overlap and not C_attr_eq[-1].empty and
                        not C_overlap[-1].empty):  # Both blocker types used
                    C = em.combine_blocker_outputs_via_union(
                        [C_attr_eq[-1], C_overlap[-1]])
                    print(
                        "\n- Blockers candidate set outputs combined via union."
                    )
                else:  # Error
                    C = []
                    print(
                        "\n- ERROR: Candidate set C is empty! Check blockers' results."
                    )
            # Error
            else:
                C = []
                print(
                    "\n- ERROR: Candidate set C is empty! Check blockers' results."
                )
            print("\n- Length of C: " + str(len(C)))

        else:
            print(
                "\n- 2 Tables column names are different, they must be the same"
            )
            print(list(A.columns))
            print(list(B.columns))

    # SAMPLING&LABELING
    print("\n-------------SAMPLING&LABELING-------------\n")

    print("- Choose sampling size (eg. 450):")
    sampling_size = input()
    while (int(sampling_size) > len(C)):
        print("- Sampling size cannot be bigger than " + str(len(C)))
        sampling_size = input()

    # Sample  candidate set
    S = em.sample_table(C, int(sampling_size))

    print("- New window will pop-up for " + sampling_size + " sized table.")
    print("- If there is a match, change tuple's label value to 1")

    # Label S
    G = em.label_table(S, 'label')

    #DEVELOPMENT AND EVALUATION
    print("\n-------------DEVELOPMENT AND EVALUATION-------------\n")

    # Split S into development set (I) and evaluation set (J)
    IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
    I = IJ['train']
    J = IJ['test']

    #SELECTING THE BEST MATCHER
    print("\n-------------SELECTING THE BEST MATCHER-------------\n")

    # Create a set of ML-matchers
    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')

    print(
        "\n- 6 different ML-matchers created: DL, SVM, RF, LogReg, LinReg, NB")

    print("\n- Creating features...")
    # Generate features
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)

    print("\n- Features list:")
    # List the names of the features generated
    print(feature_table['feature_name'])

    print("\n- Converting the development set to feature vectors...")
    # Convert the I into a set of feature vectors using feature_table
    H = em.extract_feature_vecs(I,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    print("\n- Feature table first rows:")
    # Display first few rows
    print(H.head())

    # Primary key of tables = prefix + pk = l_id, r_id
    ltable_pk = l_prefix + pk_A
    rtable_pk = r_prefix + pk_B

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = any(pd.notnull(H))
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(
            H,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean',
            val_all_nans=0.0)
        #print("\n- Feature table first rows:")
        # Display first few rows
        #print(H.head())
        print("- Impute table function used for missing values.")

    print("\n- Selecting the best matcher using cross-validation...")
    # Select the best ML matcher using CV
    result = em.select_matcher(
        matchers=[dt, rf, svm, ln, lg, nb],
        table=H,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        k=5,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0)
    print("\n- Results:")
    print(result['cv_stats'])

    #DEBUGGING THE MATCHER
    print("\n-------------DEBUGGING THE MATCHER-------------\n")

    #  Split feature vectors into train and test
    UV = em.split_train_test(H, train_proportion=0.5)
    U = UV['train']
    V = UV['test']

    # Debug random forest matcher using GUI
    em.vis_debug_rf(rf,
                    U,
                    V,
                    exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
                    target_attr='label')

    print("\n- Do you want to add another feature?")

    H = em.extract_feature_vecs(I,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = any(pd.notnull(H))
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(
            H,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(H.head())

    # Select the best ML matcher using CV
    result = em.select_matcher(
        [dt, rf, svm, ln, lg, nb],
        table=H,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        k=5,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0)

    print("\n- Results:")
    print(result['cv_stats'])

    #EVALUATING THE MATCHING OUTPUT
    print("\n-------------EVALUATING THE MATCHING OUTPUT-------------\n")

    print("\n- Converting the evaluation set to feature vectors...")
    # Convert J into a set of feature vectors using feature table
    L = em.extract_feature_vecs(J,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = any(pd.notnull(L))
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        L = em.impute_table(
            L,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(L.head())

    print("\n- Training the selected matcher...")
    # Train using feature vectors from I
    rf.fit(table=H,
           exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
           target_attr='label')

    print("\n- Predicting the matches...")
    # Predict on L
    predictions = rf.predict(
        table=L,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        append=True,
        target_attr='predicted',
        inplace=False)

    print("\n- Evaluating the prediction...")
    # Evaluate the predictions
    eval_result = em.eval_matches(predictions, 'label', 'predicted')
    print(em.print_eval_summary(eval_result))

    print("\n- Time elapsed:")
    print(datetime.now() - startTime)

    print("\n-------------END-------------\n")
Exemplo n.º 13
0
# Write the table D to 'D.csv' in the current working directory.
# NOTE(review): D is defined earlier in the notebook (not visible here);
# presumably a pandas DataFrame holding a candidate set -- confirm.
D.to_csv('D.csv')


# In[ ]:




# In[59]:

# Notebook cell: evaluates to the number of rows in D (shown as cell output;
# has no effect when this file is run as a plain script).
len(D)


# In[60]:

# Draw a sample of size 600 from D for later use (e.g. labeling).
S = em.sample_table(D, 600)


# In[61]:

# Notebook cell: display the sampled table S (no effect as a plain script).
S


# In[ ]:




# In[ ]: