Exemplo n.º 1
0
def block_abt_buy(A, B):
    """Build the Abt-Buy candidate set with an overlap blocker on ``name``.

    Side effect: ``B["description"]`` is overwritten in place with the old
    description, a single space, and ``B["manufacturer"]``.

    The blocker is run with ``word_level=True``, ``overlap_size=1`` and
    ``allow_missing=True``; ``name``, ``description`` and ``price`` are
    carried over from both tables.

    Author's note from the original code: this setting yields a candidate
    set of about 164K pairs with 6 of 1097 duplicates missing.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    # Fold the manufacturer into the right-hand description before blocking.
    B["description"] = B["description"] + " " + B["manufacturer"]

    blocker = em.OverlapBlocker()
    kept_columns = ["name", "description", "price"]
    return blocker.block_tables(
        A,
        B,
        "name",
        "name",
        word_level=True,
        overlap_size=1,
        l_output_attrs=kept_columns,
        r_output_attrs=kept_columns,
        show_progress=True,
        allow_missing=True,
    )
Exemplo n.º 2
0
def block_dblp_acm(A, B):
    """Two-stage blocking for the DBLP-ACM dataset.

    Stage 1 pairs tuples of ``A`` and ``B`` whose ``year`` values are equal
    (attribute-equivalence blocker, ``allow_missing=False``) and keeps
    ``title``, ``authors``, ``venue`` and ``year`` from both sides.
    Stage 2 filters that candidate set with an overlap blocker on ``title``
    (``word_level=True``, ``overlap_size=2``).

    Author's note from the original code: the result is a candidate set of
    about 46K pairs with 5 of 2224 duplicates missing.

    Returns:
        The candidate set left after the second stage.
    """
    kept_columns = ["title", "authors", "venue", "year"]

    year_blocker = em.AttrEquivalenceBlocker()
    same_year_pairs = year_blocker.block_tables(
        A,
        B,
        l_block_attr='year',
        r_block_attr='year',
        l_output_attrs=kept_columns,
        r_output_attrs=kept_columns,
        allow_missing=False,
    )

    title_blocker = em.OverlapBlocker()
    return title_blocker.block_candset(
        same_year_pairs,
        'title',
        'title',
        word_level=True,
        overlap_size=2,
        show_progress=True,
    )
Exemplo n.º 3
0
def block_baby_products(A, B):
    """Block the baby-products tables with an overlap blocker on ``title``.

    The blocker runs with ``word_level=True`` and ``overlap_size=4``. The
    columns ``title``, ``price``, ``is_discounted``, ``category`` and
    ``company_struct`` are carried over from both tables.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    kept_columns = ['title', 'price', 'is_discounted', 'category', 'company_struct']
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A,
        B,
        'title',
        'title',
        word_level=True,
        overlap_size=4,
        show_progress=True,
        l_output_attrs=kept_columns,
        r_output_attrs=kept_columns,
    )
Exemplo n.º 4
0
def block_songs(A, B):
    """Block the songs tables with an overlap blocker on ``title``.

    The blocker runs with ``word_level=True``, ``overlap_size=1``,
    ``allow_missing=False`` and ``n_jobs=8``. The same seven song columns
    are carried over from both tables.

    Author's note from the original code: this setting yields a candidate
    set of about 400K pairs with 6 of 1300 duplicates missing.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    kept_columns = [
        "title",
        "release",
        "artist_name",
        "duration",
        "artist_familiarity",
        "artist_hotttnesss",
        "year",
    ]
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A,
        B,
        "title",
        "title",
        word_level=True,
        overlap_size=1,
        l_output_attrs=kept_columns,
        r_output_attrs=kept_columns,
        show_progress=True,
        allow_missing=False,
        n_jobs=8,
    )
Exemplo n.º 5
0
def block_dblp_scholar(A, B):
    """Block the DBLP-Scholar tables with an overlap blocker on ``title``.

    The blocker runs with ``word_level=True`` and ``overlap_size=4``;
    ``id``, ``title``, ``authors``, ``venue`` and ``year`` are carried over
    from both tables.

    Author's notes from the original code (their placement suggests the
    first belongs to a disabled ``overlap_size=3`` variant and the second
    to the setting used here - confirm):
      * about 1.2M pairs with 178 of 5347 duplicates missing;
      * about 135K pairs with 467 of 5347 duplicates missing.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    kept_columns = ["id", "title", "authors", "venue", "year"]
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A,
        B,
        'title',
        'title',
        word_level=True,
        overlap_size=4,
        show_progress=True,
        l_output_attrs=kept_columns,
        r_output_attrs=kept_columns,
    )
Exemplo n.º 6
0
def blocking_for_citeseer_dblp(A,B):
    """Block the Citeseer and DBLP tables with an overlap blocker on ``title``.

    Both tables are expected to be loaded by the caller. The blocker runs
    with ``word_level=True`` and ``overlap_size=2``, and the bibliographic
    columns listed below are carried over from both sides.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    kept_columns = [
        'id',
        'title',
        'authors',
        'journal',
        'month',
        'year',
        'publication_type',
    ]
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A,
        B,
        'title',
        'title',
        word_level=True,
        overlap_size=2,
        show_progress=True,
        l_output_attrs=kept_columns,
        r_output_attrs=kept_columns,
    )
Exemplo n.º 7
0
def block_rotten_imdb(A, B):
    """Block the Rotten Tomatoes / IMDB tables with an overlap blocker on ``Name``.

    The output attributes are every column that ``A`` and ``B`` have in
    common, except ``id``. They are listed in the order in which they
    appear in ``A`` so that the column order of the candidate set is the
    same on every run (iterating a ``set`` of strings is not stable across
    interpreter runs because of hash randomization).

    A table ``A`` without an ``id`` column is accepted; there is then simply
    nothing to exclude.

    The blocker runs with ``word_level=True`` and ``overlap_size=2``.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    ob = em.OverlapBlocker()

    right_columns = set(B.columns)
    # dict.fromkeys drops repeated column labels while keeping A's order.
    attributes = [
        column
        for column in dict.fromkeys(A.columns)
        if column != "id" and column in right_columns
    ]

    C2 = ob.block_tables(A, B, 'Name', 'Name', word_level=True, overlap_size=2,
                         show_progress=True,
                         l_output_attrs=attributes, r_output_attrs=attributes)
    return C2
Exemplo n.º 8
0
def block_restaurants(A, B):
    """Block the restaurant tables with an overlap blocker on ``name``.

    The blocker runs with ``word_level=True`` and ``overlap_size=4``;
    ``name``, ``address``, ``city``, ``state``, ``zipcode`` and ``phone``
    are carried over from both tables.

    NOTE(review): the original carried a preprocessing remark about
    half.csv (NewPrice => Price). No price column is used here, so the
    remark was presumably copied from the books blocker - confirm.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    kept_columns = ['name', 'address', 'city', 'state', 'zipcode', 'phone']
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A,
        B,
        'name',
        'name',
        word_level=True,
        overlap_size=4,
        show_progress=True,
        l_output_attrs=kept_columns,
        r_output_attrs=kept_columns,
    )
Exemplo n.º 9
0
def block_books(A, B):
    """Block the book tables with an overlap blocker on ``Title``.

    Per the original author's remark, the input is assumed to be
    preprocessed (in half.csv, NewPrice renamed to Price). ``Price`` itself
    is not among the columns carried over here.

    The blocker runs with ``word_level=True`` and ``overlap_size=4``.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    kept_columns = [
        'Title',
        'Author',
        'ISBN13',
        'Publisher',
        'Publication_Date',
        'Pages',
        'Dimensions',
    ]
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A,
        B,
        'Title',
        'Title',
        word_level=True,
        overlap_size=4,
        show_progress=True,
        l_output_attrs=kept_columns,
        r_output_attrs=kept_columns,
    )
Exemplo n.º 10
0
def block_walmart_amazon(A, B):
    """Block the Walmart-Amazon tables with an overlap blocker on ``title``.

    Per the original author's remark, amazon.csv is assumed to be
    preprocessed: ``pcategory2`` renamed to ``groupname``, and
    ``proddescrshort`` / ``proddescrlong`` renamed to ``shortdescr`` /
    ``longdescr``.

    The blocker runs with ``word_level=True`` and ``overlap_size=3``.

    Author's notes from the original code:
      * ``overlap_size=2`` (disabled variant): about 1.1M pairs with 20 of
        1154 duplicates missing;
      * ``overlap_size=3`` (used here): about 278K pairs with 84 of 1154
        duplicates missing.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    kept_columns = [
        'brand',
        'groupname',
        'title',
        'price',
        'shortdescr',
        'longdescr',
        'imageurl',
        'modelno',
        'shipweight',
        'dimensions',
    ]
    blocker = em.OverlapBlocker()
    return blocker.block_tables(
        A,
        B,
        'title',
        'title',
        word_level=True,
        overlap_size=3,
        l_output_attrs=kept_columns,
        r_output_attrs=kept_columns,
    )
Exemplo n.º 11
0
 def setup(self):
     """Load the ebooks tables and prepare the blockers used afterwards.

     Reads ``A.csv`` and ``B.csv`` from the ``ebooks`` folder under
     ``datasets_path``, sets ``record_id`` as the key of both tables, and
     stores on the instance:

     * ``self.C`` - candidate set from an overlap blocker on ``title``
       (``overlap_size=2``, ``rem_stop_words=True``);
     * ``self.rb`` - a rule-based blocker holding one rule on the
       Levenshtein similarity of the ``date`` columns.

     Raises:
         SystemExit: when an ``AssertionError`` escapes the loading or
             blocking steps (presumably raised by the library when a CSV
             file is missing - confirm).
     """
     path_for_A = os.sep.join([datasets_path, 'ebooks', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'ebooks', 'B.csv'])
     try:
         A = mg.read_csv_metadata(path_for_A)
         mg.set_key(A, 'record_id')
         B = mg.read_csv_metadata(path_for_B)
         mg.set_key(B, 'record_id')
         ob = mg.OverlapBlocker()
         self.C = ob.block_tables(A, B, 'title', 'title', overlap_size=2,
                                  rem_stop_words=True,
                                  l_output_attrs=['title', 'author', 'publisher', 'date'],
                                  r_output_attrs=['title', 'author', 'publisher', 'date'])
         feature_table = mg.get_features_for_blocking(A, B)
         self.rb = mg.RuleBasedBlocker()
         self.rb.add_rule(['date_date_lev_sim(ltuple, rtuple) < 0.6'], feature_table)
     except AssertionError:
         # Name the dataset that is actually loaded above.
         print("Dataset 'ebooks' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
Exemplo n.º 12
0
    def setup(self):
        """Load the restaurants tables and prepare the blockers used afterwards.

        Reads ``A.csv`` and ``B.csv`` from the ``restaurants`` folder under
        ``datasets_path``, sets ``ID`` as the key of both tables, and stores
        on the instance:

        * ``self.C`` - candidate set from an overlap blocker on ``ADDRESS``
          (``overlap_size=4``);
        * ``self.rb`` - a rule-based blocker holding one rule on the 3-gram
          Jaccard similarity of the ``ADDRESS`` columns.

        Raises:
            SystemExit: when an ``AssertionError`` escapes the loading or
                blocking steps (presumably raised by the library when a CSV
                file is missing - confirm).
        """
        path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
        try:
            A = mg.read_csv_metadata(path_for_A)
            mg.set_key(A, 'ID')
            B = mg.read_csv_metadata(path_for_B)
            mg.set_key(B, 'ID')
            ob = mg.OverlapBlocker()
            self.C = ob.block_tables(A, B, 'ADDRESS', 'ADDRESS', overlap_size=4,
                                     l_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'],
                                     r_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'])
            feature_table = mg.get_features_for_blocking(A, B)
            self.rb = mg.RuleBasedBlocker()
            self.rb.add_rule(['ADDRESS_ADDRESS_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.44'],
                             feature_table)
        except AssertionError:
            # Name the dataset that is actually loaded above.
            print("Dataset 'restaurants' not found. Please visit the project "
                  "website to download the dataset.")
            raise SystemExit
Exemplo n.º 13
0
def block_wa(A, B):
    """Block two product tables with an overlap blocker on ``title``.

    Per the original author's remark, amazon.csv is assumed to be
    preprocessed (``pcategory2`` renamed to ``groupname``;
    ``proddescrshort`` / ``proddescrlong`` renamed to ``shortdescr`` /
    ``longdescr``).

    The blocker runs with ``word_level=True``, ``overlap_size=2`` and
    ``allow_missing=True``; ``title``, ``category``, ``brand``, ``modelno``
    and ``price`` are requested from both tables.

    NOTE(review): the original also carried candidate-set statistics
    (1.1M / 278K pairs) that match the figures quoted for a different
    attribute list and overlap size; they are presumably stale - confirm.

    Returns:
        The candidate set produced by ``OverlapBlocker.block_tables``.
    """
    blocker = em.OverlapBlocker()

    right_columns = ["title", "category", "brand", "modelno", "price"]
    left_columns = ["title", "category", "brand", "modelno", "price"]

    # Fallback intended for the case where A and B are the same dataset:
    # a side whose columns are not all present borrows the other side's list.
    # NOTE(review): both lists hold the same names, so the swap currently
    # cannot change which columns are requested.
    right_columns = (
        right_columns if set(right_columns).issubset(B.columns) else left_columns
    )
    left_columns = (
        left_columns if set(left_columns).issubset(A.columns) else right_columns
    )

    return blocker.block_tables(
        A,
        B,
        'title',
        'title',
        word_level=True,
        overlap_size=2,
        l_output_attrs=left_columns,
        r_output_attrs=right_columns,
        show_progress=True,
        allow_missing=True,
    )
def overlapped_attribute_blocking(lhs_table, rhs_table, blocking_cols, min_shared_tokens, feature_cols, id_names, verbose=True, candidates=None):
    '''
    Overlap blocking on one column per table.

    Inputs:
        lhs_table, rhs_table: the two tables to block. Only used when
            ``candidates`` is None.
        blocking_cols: list of length 2 naming the column of the LHS table
            and the column of the RHS table whose overlap is measured.
            Same order as ``id_names``.
        min_shared_tokens: passed to the blocker as ``overlap_size``
            (word-level tokens are used).
        feature_cols: list of length 2; ``feature_cols[0]`` / ``[1]`` are the
            LHS / RHS columns to KEEP for further analysis. The lists are
            read but never modified. Only used when ``candidates`` is None.
        id_names: list of length 2 with the id column names (LHS, RHS). Each
            id is added to the corresponding output columns unless it is
            already listed there. Only used when ``candidates`` is None.
        verbose: whether the blocker shows a progress bar.
        candidates: optional, previously generated candidate set. When given,
            it is filtered further instead of blocking the two tables.
    Outputs:
        Candidate Tuples -- a dataframe of candidate tuples across lhs_table
        and rhs_table.
    '''
    overlap = em.OverlapBlocker()

    # Refine an existing candidate set: no output columns are needed here.
    if candidates is not None:
        return overlap.block_candset(candidates,
            blocking_cols[0], blocking_cols[1],
            word_level=True, overlap_size=min_shared_tokens, allow_missing=True,
            show_progress=verbose)

    # Work on copies so the caller's lists are left untouched; appending in
    # place would pile up duplicate id columns across repeated calls.
    lhs_output_cols = list(feature_cols[0])
    if id_names[0] not in lhs_output_cols:
        lhs_output_cols.append(id_names[0])
    rhs_output_cols = list(feature_cols[1])
    if id_names[1] not in rhs_output_cols:
        rhs_output_cols.append(id_names[1])

    return overlap.block_tables(lhs_table, rhs_table,
        blocking_cols[0], blocking_cols[1],
        word_level=True, overlap_size=min_shared_tokens, allow_missing=True,
        l_output_attrs=lhs_output_cols,
        r_output_attrs=rhs_output_cols,
        l_output_prefix="", r_output_prefix="",
        show_progress=verbose)
    A = preprocess_laptop_dataset(A)
    A.to_csv('X_cleaned.csv', index=False)
    sys.stderr.write("Reading Files Succeeded\n")

    # Reread the cleaned dataset
    A = em.read_csv_metadata('X_cleaned.csv', key='instance_id')
    B = em.read_csv_metadata('X_cleaned.csv', key='instance_id')

    sys.stderr.write('Number of tuples in A: ' + str(len(A)) + '\n')
    sys.stderr.write('Number of tuples in B: ' + str(len(B)) + '\n')
    sys.stderr.write(
        'Number of tuples in A X B (i.e the cartesian product): ' +
        str(len(A) * len(B)) + '\n')

    # Start running blocking
    ob = em.OverlapBlocker()

    C = ob.block_tables(
        A,
        B,
        'title',
        'title',
        l_output_attrs=[
            'instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
            'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
            'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title'
        ],
        r_output_attrs=[
            'instance_id', 'brand', 'cpu_brand', 'cpu_model', 'cpu_type',
            'cpu_frequency', 'ram_capacity', 'ram_type', 'ram_frequency',
            'hdd_capacity', 'ssd_capacity', 'weight', 'dimensions', 'title'
Exemplo n.º 16
0
 def setUp(self):
     """Load tables A and B, key both on ``ID``, and create an overlap blocker.

     The tables are read from the module-level ``path_a`` and ``path_b`` and
     stored as ``self.A`` and ``self.B``; the blocker is stored as
     ``self.ob``.
     """
     self.A = left_table = em.read_csv_metadata(path_a)
     em.set_key(left_table, 'ID')

     self.B = right_table = em.read_csv_metadata(path_b)
     em.set_key(right_table, 'ID')

     self.ob = em.OverlapBlocker()
Exemplo n.º 17
0
def main():
    # WELCOME TO MY MAGELLAN RUN SCRIPT
    print("\n-------------WELCOME TO MY MAGELLAN RUN SCRIPT-------------\n")

    # Get the datasets directory
    datasets_dir = 'B:\McMaster\CAS 764 - Advance Topics in Data Management\Project\Data\\'
    print("- Dataset directory: " + datasets_dir)
    print("- List of folders/files: ")
    print(os.listdir(datasets_dir))
    print("- Please enter new dataset folder name:")
    datasets_dir += input()
    print("- Dataset directory set to: " + datasets_dir)

    dateset_dir_files = os.listdir(datasets_dir)
    print("- List of files in dataset folder: ")
    print(dateset_dir_files)

    # Get the path of the input table A
    print("- Enter an index for Table A file (0-x):")
    file_index_A = input()
    filename_A = dateset_dir_files[int(file_index_A)]
    print("Table A file set to: " + filename_A)

    # Get the path of the input table
    path_A = datasets_dir + os.sep + filename_A

    # Get the path of the input table B
    print("- Enter an index for Table B file (0-x):")
    file_index_B = input()
    filename_B = dateset_dir_files[int(file_index_B)]
    print("Table B file set to: " + filename_B)

    # Get the path of the input table
    path_B = datasets_dir + os.sep + filename_B

    # Print Table A column names
    A = em.read_csv_metadata(path_A)
    print("- List of columns of Table A: ")
    print(list(A.columns))
    # Get the Table A id/primary key column name
    print('- Enter Table A primary key column index (ex. 0):')
    pk_A_index = input()
    pk_A = A.columns[int(pk_A_index)]

    # Print Table B column names
    B = em.read_csv_metadata(path_B)
    print("- List of columns of Table B: ")
    print(list(B.columns))
    # Get the Table B id/primary key column name
    print('- Enter Table B primary key column index (ex. 0):')
    pk_B_index = input()
    pk_B = A.columns[int(pk_A_index)]

    # READING TABLES AND SETTING METADATA
    print("\n-------------READING TABLES AND SETTING METADATA-------------\n")

    # Both read csv and set metadata id as ID column
    #A = em.read_csv_metadata(path_A, key=pk_A)
    #B = em.read_csv_metadata(path_B, key=pk_B)
    em.set_key(A, pk_A)
    em.set_key(B, pk_B)

    # Number of tables
    print('- Number of tuples in A: ' + str(len(A)))
    print('- Number of tuples in B: ' + str(len(B)))
    print('- Number of tuples in A X B (i.e the cartesian product): ' +
          str(len(A) * len(B)))

    # Print first 5 tuples of tables
    print(A.head())
    print(B.head())

    # Display the keys of the input tables
    print("- Table A primary key: " + em.get_key(A))
    print("- Table B primary key: " + em.get_key(B))

    # DOWNSAMPLING
    print("\n-------------DOWNSAMPING-------------\n")

    print("- Do you want to use downsampling? (y or n):")
    print("- Table A: " + str(len(A)) + ", Table B: " + str(len(B)))
    print("- NOTE: Recommended if both tables have 100K+ tuples.")
    is_downsample = input()
    if (is_downsample == 'y'):
        print("- Size of the downsampled tables (ex. 200):")
        downsample_size = input()
        # If the tables are large we can downsample the tables like this
        A1, B1 = em.down_sample(A, B, downsample_size, 1, show_progress=False)
        print("- Length of Table A1" + len(A1))
        print("- Length of Table B1" + len(B1))

    # BLOCKING
    print("\n-------------BLOCKING-------------\n")

    print("- Do you want to use blocking? (y or n):")
    is_blocking = input()
    if (is_blocking == 'y'):

        # Check if the 2 tables column names are the same
        if (list(A.columns) == list(B.columns)):
            C_attr_eq = []  # Attr Equ blocker result list
            C_overlap = []  # Overlap blocker result list
            C_blackbox = []  # BlackBox blocker result list

            # Left and right table attribute prefixes
            l_prefix = "ltable_"
            r_prefix = "rtable_"

            print("\n- List of columns: ")
            print(list(A.columns))
            # Labeling output table column selection
            print(
                "\n- Enter the indexes of columns that you want to see in labeling table (0-"
                + str(len(A.columns) - 1) + "):")
            out_attr = []
            for i in range(1, len(A.columns)):
                print("- Finish with empty character(enter+enter) " + str(i))
                add_to_attr = input()
                if (add_to_attr == ''):
                    break
                # Get indexes from user and add columns into out_attr list
                out_attr.append(A.columns[int(add_to_attr)])

            # Print output attributes
            print(out_attr)

            # Loop for adding/combining new blockers
            while (True):
                # Blocker selection
                print(
                    "\n- Do yo want to use Attribute Equivalence[ab] (same), Overlap[ob] (similar) or Blackbox[bb] blocker:"
                )
                blocker_selection = input()

                # ----- Attribute Equivalence Blocker -----
                if (blocker_selection == 'ab'):
                    # Create attribute equivalence blocker
                    ab = em.AttrEquivalenceBlocker()
                    # Counter for indexes
                    attr_eq_counter = 0
                    # Check if Overlap Blocker used before
                    if (C_overlap and not C_overlap[-1].empty):
                        print(
                            "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_attr_eq.append(
                                C_overlap[-1])  # Add last output of ob
                            attr_eq_counter += 1  # For skipping block_table function in first time

                    # Check if BlackBox Blocker used before
                    if (C_blackbox and not C_blackbox[-1].empty):
                        print(
                            "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_attr_eq.append(
                                C_blackbox[-1])  # Add last output of ob
                            attr_eq_counter += 1  # For skipping block_table function in first time

                    # Loop for adding more columns/attributes into Attr Equ blocker
                    while (True):
                        # List column names
                        print("\n- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print(
                            "\n- Which column (w/ index) to use for equivalence blocking? (ex. 1):"
                        )
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]

                        print(
                            "\n- Do you want to add missing values into blocking? (y or n):"
                        )
                        add_missing_val = input()
                        if (add_missing_val == 'y'):
                            add_missing_val = True
                        else:
                            add_missing_val = False

                        # First time using Attr Equ blocker, use A and B
                        if (attr_eq_counter == 0):
                            # Block using selected (blocking_col) attribute on A and B
                            C_attr_eq.append(
                                ab.block_tables(A,
                                                B,
                                                blocking_col,
                                                blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block using selected (blocking_col) attribute on previous (last=-1) candidate set
                            C_attr_eq.append(
                                ab.block_candset(C_attr_eq[-1],
                                                 l_block_attr=blocking_col,
                                                 r_block_attr=blocking_col,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print(
                            "\n- Attribute Equivalence Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_attr_eq[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        attr_eq_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_attr_eq[-1])))
                        print(
                            "- Add another column into Attribute Equivalence Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        ab_next_operation = input()
                        if (not ab_next_operation.islower()):
                            ab_next_operation = ab_next_operation.lower(
                            )  # Lower case
                        # Continue using Attribute Equivalence Blocker
                        if (ab_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (ab_next_operation == 'r'):
                            del C_attr_eq[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use Attribute Equivalence Blocker (y or n):"
                            )
                            ab_next_operation = input()
                            if (ab_next_operation == 'n'):
                                break
                        # Finish Attribute Equivalence Blocker
                        else:
                            break

                # ----- Overlap Blocker -----
                elif (blocker_selection == 'ob'):
                    # Create attribute equivalence blocker
                    ob = em.OverlapBlocker()
                    # Counter for indexes
                    overlap_counter = 0
                    # Check if Attribute Equivalence Blocker used before
                    if (C_attr_eq and not C_attr_eq[-1].empty):
                        print(
                            "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_overlap.append(
                                C_attr_eq[-1])  # Add last output of ab
                            overlap_counter += 1  # For skipping block_table function in first time

                    # Check if BlackBox Blocker used before
                    if (C_blackbox and not C_blackbox[-1].empty):
                        print(
                            "\n- Do you want to work on BlackBox Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_overlap.append(
                                C_blackbox[-1])  # Add last output of ob
                            overlap_counter += 1  # For skipping block_table function in first time

                    # Loop for adding more columns/attributes into Overlap blocker
                    while (True):
                        # List column names
                        print("- List of columns: ")
                        print(list(A.columns))
                        # Get blocking attribute/column
                        print(
                            "- Which column (w/ index) to use for overlap blocking? (ex. 1):"
                        )
                        blocking_col_index = input()
                        blocking_col = A.columns[int(blocking_col_index)]

                        print(
                            "\n- Do you want to add missing values into blocking? (y or n):"
                        )
                        add_missing_val = input()
                        if (add_missing_val == 'y'):
                            add_missing_val = True
                        else:
                            add_missing_val = False

                        print("\n- Use words as a token? (y or n):")
                        use_world_level = input()
                        if (use_world_level == 'y'):
                            use_world_level = True
                            q_gram_value = None
                        else:
                            use_world_level = False
                            print(
                                "\n- Q-gram q value (ex. 2 --> JO HN SM IT H):"
                            )
                            q_gram_value = input()
                            q_gram_value = int(q_gram_value)

                        print(
                            "\n- Enter the overlap size (# of tokens that overlap):"
                        )
                        overlap_size = input()
                        overlap_size = int(overlap_size)

                        print(
                            "\n- Do you want to remove (a, an, the) from token set? (y or n):"
                        )
                        use_stop_words = input()
                        if (use_stop_words == 'y'):
                            use_stop_words = True
                        else:
                            use_stop_words = False

                        # First time using Overlap blocker, use A and B
                        if (overlap_counter == 0):
                            # Block using selected (blocking_col) attribute on A and B
                            C_overlap.append(
                                ob.block_tables(A,
                                                B,
                                                blocking_col,
                                                blocking_col,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                rem_stop_words=use_stop_words,
                                                q_val=q_gram_value,
                                                word_level=use_world_level,
                                                overlap_size=overlap_size,
                                                allow_missing=add_missing_val,
                                                n_jobs=-1))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block using selected (blocking_col) attribute on previous (last=-1) candidate set
                            C_overlap.append(
                                ob.block_candset(C_overlap[-1],
                                                 l_overlap_attr=blocking_col,
                                                 r_overlap_attr=blocking_col,
                                                 rem_stop_words=use_stop_words,
                                                 q_val=q_gram_value,
                                                 word_level=use_world_level,
                                                 overlap_size=overlap_size,
                                                 allow_missing=add_missing_val,
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print("\n- Overlap Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_overlap[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        overlap_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_overlap[-1])))
                        print(
                            "- Add another column into Overlap Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        ob_next_operation = input()
                        if (not ob_next_operation.islower()):
                            ob_next_operation = ob_next_operation.lower(
                            )  # Lower case
                        # Continue using Overlap Blocker
                        if (ob_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (ob_next_operation == 'r'):
                            del C_overlap[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use Overlap Blocker (y or n):")
                            ob_next_operation = input()
                            if (ob_next_operation == 'n'):
                                break
                        # Finish Overlap Blocker
                        else:
                            break

                # ----- BlackBox Blocker -----
                elif (blocker_selection == 'bb'):
                    # Create attribute equivalence blocker
                    bb = em.BlackBoxBlocker()
                    # Counter for indexes
                    blackbox_counter = 0
                    # Check if Overlap Blocker used before
                    if (C_attr_eq and not C_attr_eq[-1].empty):
                        print(
                            "\n- Do you want to work on Attribute Equivalence Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_blackbox.append(
                                C_attr_eq[-1])  # Add last output of ob
                            blackbox_counter += 1  # For skipping block_table function in first time

                    # Check if Overlap Blocker used before
                    if (C_overlap and not C_overlap[-1].empty):
                        print(
                            "\n- Do you want to work on Overlap Blocker candidate set or not (y or n):"
                        )
                        use_cand_set = input()
                        if (use_cand_set == 'y'):
                            C_blackbox.append(
                                C_overlap[-1])  # Add last output of ob
                            blackbox_counter += 1  # For skipping block_table function in first time

                    # Loop for adding more columns/attributes into BlackBox blocker
                    while (True):
                        # Set function
                        bb.set_black_box_function(
                            number_10_percent_comparision)

                        # First time using Overlap blocker, use A and B
                        if (overlap_counter == 0):
                            # Block on A and B
                            C_blackbox.append(
                                bb.block_tables(A,
                                                B,
                                                l_output_attrs=out_attr,
                                                r_output_attrs=out_attr,
                                                l_output_prefix=l_prefix,
                                                r_output_prefix=r_prefix,
                                                n_jobs=-1,
                                                show_progress=False))
                        # Not first time, add new constraint into previous candidate set
                        else:
                            # Block on previous (last=-1) candidate set
                            C_blackbox.append(
                                bb.block_candset(C_blackbox[-1],
                                                 n_jobs=-1,
                                                 show_progress=False))

                        # DEBUG BLOCKING
                        print("\n- BlackBox Blocker Debugging...\n")
                        # Debug last blocker output
                        dbg = em.debug_blocker(C_blackbox[-1],
                                               A,
                                               B,
                                               output_size=200,
                                               n_jobs=-1)

                        # Display first few tuple pairs from the debug_blocker's output
                        print("\n- Blocking debug results:")
                        print(dbg.head())

                        blackbox_counter += 1  # Increase the counter

                        # Continue to use Attribute Equivalence Blocker or not
                        print("\n- Length of candidate set: " +
                              str(len(C_blackbox[-1])))
                        print(
                            "- Add another column into BlackBox Blocker[a] OR Reset last blocker's output[r]:"
                        )
                        bb_next_operation = input()
                        if (not bb_next_operation.islower()):
                            bb_next_operation = bb_next_operation.lower(
                            )  # Lower case
                        # Continue using Overlap Blocker
                        if (bb_next_operation == 'a'):
                            continue
                        # Reset/remove last blocker's output from candidate set list
                        elif (bb_next_operation == 'r'):
                            del C_blackbox[-1]
                            print("\n- Last blocker output removed!")
                            print(
                                "- Continue to use BlackBox Blocker (y or n):")
                            bb_next_operation = input()
                            if (bb_next_operation == 'n'):
                                break
                        # Finish BlackBox Blocker
                        else:
                            break

                print("\n- Do you want to add/use another blocker? (y or n):")
                blocker_decision = input()
                if (blocker_decision == 'n'):
                    break

            print(
                "\n- Which blocker output you want to use? (Attr Equ-ab, Overlap-ob, BlackBox-bb, Union-un)"
            )
            blocker_output_selection = input()
            # Attribute Equ
            if (blocker_output_selection == "ab"):
                C = C_attr_eq[-1]
            # Overlap
            elif (blocker_output_selection == "ob"):
                C = C_overlap[-1]
                # Overlap
            elif (blocker_output_selection == "bb"):
                C = C_blackbox[-1]
            # Union of blockers
            elif (blocker_output_selection == "un"):
                # Combine/union blockers candidate sets
                print("\n- TODO: Unions Attr Equ and Overlap only!")
                if (C_attr_eq and C_overlap and not C_attr_eq[-1].empty and
                        not C_overlap[-1].empty):  # Both blocker types used
                    C = em.combine_blocker_outputs_via_union(
                        [C_attr_eq[-1], C_overlap[-1]])
                    print(
                        "\n- Blockers candidate set outputs combined via union."
                    )
                else:  # Error
                    C = []
                    print(
                        "\n- ERROR: Candidate set C is empty! Check blockers' results."
                    )
            # Error
            else:
                C = []
                print(
                    "\n- ERROR: Candidate set C is empty! Check blockers' results."
                )
            print("\n- Length of C: " + str(len(C)))

        else:
            print(
                "\n- 2 Tables column names are different, they must be the same"
            )
            print(list(A.columns))
            print(list(B.columns))

    # SAMPLING&LABELING
    print("\n-------------SAMPLING&LABELING-------------\n")

    print("- Choose sampling size (eg. 450):")
    sampling_size = input()
    while (int(sampling_size) > len(C)):
        print("- Sampling size cannot be bigger than " + str(len(C)))
        sampling_size = input()

    # Sample  candidate set
    S = em.sample_table(C, int(sampling_size))

    print("- New window will pop-up for " + sampling_size + " sized table.")
    print("- If there is a match, change tuple's label value to 1")

    # Label S
    G = em.label_table(S, 'label')

    #DEVELOPMENT AND EVALUATION
    print("\n-------------DEVELOPMENT AND EVALUATION-------------\n")

    # Split S into development set (I) and evaluation set (J)
    IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
    I = IJ['train']
    J = IJ['test']

    #SELECTING THE BEST MATCHER
    print("\n-------------SELECTING THE BEST MATCHER-------------\n")

    # Create a set of ML-matchers
    dt = em.DTMatcher(name='DecisionTree', random_state=0)
    svm = em.SVMMatcher(name='SVM', random_state=0)
    rf = em.RFMatcher(name='RF', random_state=0)
    lg = em.LogRegMatcher(name='LogReg', random_state=0)
    ln = em.LinRegMatcher(name='LinReg')
    nb = em.NBMatcher(name='NaiveBayes')

    print(
        "\n- 6 different ML-matchers created: DL, SVM, RF, LogReg, LinReg, NB")

    print("\n- Creating features...")
    # Generate features
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)

    print("\n- Features list:")
    # List the names of the features generated
    print(feature_table['feature_name'])

    print("\n- Converting the development set to feature vectors...")
    # Convert the I into a set of feature vectors using feature_table
    H = em.extract_feature_vecs(I,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    print("\n- Feature table first rows:")
    # Display first few rows
    print(H.head())

    # Primary key of tables = prefix + pk = l_id, r_id
    ltable_pk = l_prefix + pk_A
    rtable_pk = r_prefix + pk_B

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = any(pd.notnull(H))
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(
            H,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean',
            val_all_nans=0.0)
        #print("\n- Feature table first rows:")
        # Display first few rows
        #print(H.head())
        print("- Impute table function used for missing values.")

    print("\n- Selecting the best matcher using cross-validation...")
    # Select the best ML matcher using CV
    result = em.select_matcher(
        matchers=[dt, rf, svm, ln, lg, nb],
        table=H,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        k=5,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0)
    print("\n- Results:")
    print(result['cv_stats'])

    #DEBUGGING THE MATCHER
    print("\n-------------DEBUGGING THE MATCHER-------------\n")

    #  Split feature vectors into train and test
    UV = em.split_train_test(H, train_proportion=0.5)
    U = UV['train']
    V = UV['test']

    # Debug decision tree using GUI
    em.vis_debug_rf(rf,
                    U,
                    V,
                    exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
                    target_attr='label')

    print("\n- Do you want to add another feature?")

    H = em.extract_feature_vecs(I,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = any(pd.notnull(H))
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        H = em.impute_table(
            H,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(H.head())

    # Select the best ML matcher using CV
    result = em.select_matcher(
        [dt, rf, svm, ln, lg, nb],
        table=H,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        k=5,
        target_attr='label',
        metric_to_select_matcher='f1',
        random_state=0)

    print("\n- Results:")
    print(result['cv_stats'])

    #EVALUATING THE MATCHING OUTPUT
    print("\n-------------EVALUATING THE MATCHING OUTPUT-------------\n")

    print("\n- Converting the evaluation set to feature vectors...")
    # Convert J into a set of feature vectors using feature table
    L = em.extract_feature_vecs(J,
                                feature_table=feature_table,
                                attrs_after='label',
                                show_progress=False)

    # Check if the feature vectors contain missing values
    # A return value of True means that there are missing values
    is_missing_values = any(pd.notnull(L))
    print("\n- Does feature vector have missing values: " +
          str(is_missing_values))
    if (is_missing_values):
        # Impute feature vectors with the mean of the column values.
        L = em.impute_table(
            L,
            exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
            strategy='mean')
        print("\n- Feature table first rows:")
        # Display first few rows
        print(L.head())

    print("\n- Training the selected matcher...")
    # Train using feature vectors from I
    rf.fit(table=H,
           exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
           target_attr='label')

    print("\n- Predicting the matches...")
    # Predict on L
    predictions = rf.predict(
        table=L,
        exclude_attrs=['_id', ltable_pk, rtable_pk, 'label'],
        append=True,
        target_attr='predicted',
        inplace=False)

    print("\n- Evaluating the prediction...")
    # Evaluate the predictions
    eval_result = em.eval_matches(predictions, 'label', 'predicted')
    print(em.print_eval_summary(eval_result))

    print("\n- Time elapsed:")
    print(datetime.now() - startTime)

    print("\n-------------END-------------\n")
Exemplo n.º 18
0
# Write the benchmarking functions here.
# See "Writing benchmarks" in the asv docs for more information.

import os
import sys

import py_entitymatching as mg

# Directory of the example datasets bundled with the installed package.
p = mg.get_install_path()
datasets_path = os.sep.join((p, 'datasets', 'example_datasets'))

# Single overlap blocker instance shared by the benchmark classes below.
ob = mg.OverlapBlocker()


class TimeBlockTablesBooks:
    timeout = 500.0

    def setup(self):
        """Load the 'books' tables into ``self.A`` / ``self.B`` with key 'ID'.

        If loading or key assignment raises ``AssertionError``, a hint about
        the missing dataset is printed and ``SystemExit`` is raised.
        """
        books_dir = os.sep.join([datasets_path, 'books'])
        sources = (
            ('A', os.sep.join([books_dir, 'A.csv'])),
            ('B', os.sep.join([books_dir, 'B.csv'])),
        )
        try:
            # Same order as before: read A, key A, read B, key B.
            for attr_name, csv_path in sources:
                setattr(self, attr_name, mg.read_csv_metadata(csv_path))
                mg.set_key(getattr(self, attr_name), 'ID')
        except AssertionError:
            print("Dataset \'books\' not found. Please visit the project "
                  "website to download the dataset.")
            raise SystemExit

    def time_block_tables_title_2(self):
        ob.block_tables(self.A,
Exemplo n.º 19
0
def main(argv):
    """Overlap-block the English table against the other-language table and label the result.

    The candidate set produced by the overlap blocker is passed to
    ``labelSample`` together with the duplicates file, and the returned frame
    is written to the "blocked" CSV of the selected topic/language.

    Args:
        argv: Sequence whose first item is the topic (``"uni"``, ``"movie"``
            or ``"title"``) and whose second item is the language tag used in
            the dataset file names.

    Raises:
        IndexError: If ``argv`` has fewer than two items.
        ValueError: If the topic is not one of the supported topics.
    """
    topic = argv[0]
    lang = argv[1]

    # Dataset directory and blocking attribute per topic; the same attribute
    # name is used on the left and right tables.
    topic_settings = {
        "uni": ("/home/oyku/datasets/University/", "city"),
        "movie": ("/home/oyku/datasets/Movie/", "director"),
        "title": ("/home/oyku/datasets/Article/", "category"),
    }
    if topic not in topic_settings:
        # Previously an unknown topic left the blocker fields unbound and the
        # run failed later with an unrelated error.
        raise ValueError("Unknown topic {!r}; expected one of {}".format(
            topic, sorted(topic_settings)))
    path, ltable_blocker_field = topic_settings[topic]
    rtable_blocker_field = ltable_blocker_field

    allow_missing = False

    file_prefix = path + topic + "_" + lang
    en_csv = path + topic + "_en.csv"
    de_csv = file_prefix + ".csv"
    trans_de_csv = file_prefix + "_translated.csv"
    labeled_path = file_prefix + "_blocked_original.csv"
    tr_labeled_path = file_prefix + "_blocked_translated.csv"
    dup_file = file_prefix + "_duplicates.csv"

    # NOTE: the output CSVs (labeled_path / tr_labeled_path) and the feature
    # CSVs are no longer read up front. Their contents were never used, and
    # reading them made the run fail whenever the outputs did not exist yet.

    A = em.read_csv_metadata(en_csv, key='id')
    B = em.read_csv_metadata(de_csv, key='id')
    T = em.read_csv_metadata(trans_de_csv, key='id')

    headerA = list(A)
    headerB = list(B)

    # Manual switch: True blocks against the translated table T directly,
    # False blocks against B extended with T's blocking column.
    translated = True
    if not translated:
        rtable_blocker_field = 't_' + rtable_blocker_field
        T.rename(columns={ltable_blocker_field: rtable_blocker_field},
                 inplace=True)
        B = B.merge(T[['id', rtable_blocker_field]], on='id')
        # merge() returns a new frame, so the key has to be registered again.
        em.set_key(B, 'id')
    else:
        B = T
        headerB = list(B)

    ob = em.OverlapBlocker()
    C = ob.block_tables(A,
                        B,
                        ltable_blocker_field,
                        rtable_blocker_field,
                        l_output_attrs=headerA,
                        r_output_attrs=headerB,
                        word_level=True,
                        overlap_size=2,
                        rem_stop_words=False,
                        allow_missing=allow_missing)
    print("Shape of sampled: {}".format(C.shape))

    labeled_df = labelSample(C, dup_file)

    output_path = tr_labeled_path if translated else labeled_path
    labeled_df.to_csv(output_path, index=False)
Exemplo n.º 20
0
def main():
    """Block, label, train a random-forest matcher and write predictions.

    Steps, all on ``ltable.csv`` / ``rtable.csv`` in the working directory:

    1. Overlap-block the two tables on ``title`` to get candidate set ``C``.
    2. Sample 450 pairs from ``C`` and label them via ``em.label_table``.
    3. Extract feature vectors for the labeled pairs and fit an ``RFMatcher``.
    4. Extract feature vectors for all of ``C``, predict, and write the
       candidate-pair id and predicted label to ``./prediction2.csv``.
    """
    A = em.read_csv_metadata('ltable.csv',
                             key="ltable_id",
                             encoding='ISO-8859-1')
    B = em.read_csv_metadata('rtable.csv',
                             key="rtable_id",
                             encoding='ISO-8859-1')

    table_attrs = ['title', 'category', 'brand', 'modelno', 'price']

    ob = em.OverlapBlocker()
    C = ob.block_tables(A,
                        B,
                        'title',
                        'title',
                        l_output_attrs=table_attrs,
                        r_output_attrs=table_attrs,
                        overlap_size=1,
                        show_progress=False)
    S = em.sample_table(C, 450)

    # NOTE(review): the original also loaded "train.csv" into G here and then
    # immediately overwrote it with the label_table() result below. The dead
    # read was dropped; confirm that interactive labeling (not train.csv) is
    # the intended source of labels.
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    G = em.label_table(S, 'label')

    # Raw attribute columns carried alongside the features; they are excluded
    # from the matcher's inputs below.
    attrs_from_table = (['ltable_' + attr for attr in table_attrs] +
                        ['rtable_' + attr for attr in table_attrs])
    id_attrs = ['_id', 'ltable_ltable_id', 'rtable_rtable_id']

    H = em.extract_feature_vecs(G,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                attrs_after='label',
                                show_progress=False)
    # Fill with a numeric zero; the string '0' used before turned numeric
    # feature columns containing NaN into mixed-type columns.
    H.fillna(0, inplace=True)

    rf = em.RFMatcher()
    rf.fit(table=H,
           exclude_attrs=id_attrs + ['label'] + attrs_from_table,
           target_attr='label')

    L = em.extract_feature_vecs(C,
                                feature_table=feature_table,
                                attrs_before=attrs_from_table,
                                show_progress=False,
                                n_jobs=-1)
    # Treat missing feature values the same way as in the training vectors.
    L.fillna(0, inplace=True)

    predictions = rf.predict(table=L,
                             exclude_attrs=id_attrs + attrs_from_table,
                             append=True,
                             target_attr='predicted',
                             inplace=False)

    # The original read G[0]['id'] and predictions['label'], neither of which
    # exists (the prediction column is 'predicted'). Presumably the intent is
    # one row per candidate pair with its predicted label — TODO confirm the
    # expected id column for the output file.
    dataset = pd.DataFrame({
        "id": predictions['_id'],
        'label': predictions['predicted'],
    })
    dataset.to_csv("./prediction2.csv", index=False)
Exemplo n.º 21
0
def block_amazon_googleproducts(A, B):
    """Return the candidate set from overlap-blocking A and B on "title".

    Blocking is word-level with an overlap size of 1 and ``allow_missing``
    enabled; the title, description, manufacturer and price columns of both
    tables are carried into the output.
    """
    output_attrs = ["title", "description", "manufacturer", "price"]
    blocker = em.OverlapBlocker()
    # Original author's note (unverified here): results in a candidate set of
    # size 400K with 6 missing duplicates out of 1300.
    return blocker.block_tables(
        A,
        B,
        "title",
        "title",
        word_level=True,
        overlap_size=1,
        l_output_attrs=output_attrs,
        r_output_attrs=list(output_attrs),
        show_progress=True,
        allow_missing=True,
    )
Exemplo n.º 22
0
def block_fodors_zagats(A, B):
    """Return the candidate set from overlap-blocking A and B on "name".

    Uses an overlap size of 1 with the progress bar disabled; the name, addr,
    city and phone columns of both tables are carried into the output.
    """
    output_attrs = ['name', 'addr', 'city', 'phone']
    blocker = em.OverlapBlocker()
    # Original author's note: no misses with this configuration.
    candidates = blocker.block_tables(
        A,
        B,
        'name',
        'name',
        l_output_attrs=output_attrs,
        r_output_attrs=list(output_attrs),
        overlap_size=1,
        show_progress=False,
    )
    return candidates
Exemplo n.º 23
0
import py_entitymatching as em
import pandas as pd

# Load both anime tables, registering 'ID' as the key of each.
table_a = em.read_csv_metadata('anime_1.csv', key='ID')
table_b = em.read_csv_metadata('animePlayer.csv', key='ID')

# Stage 1: attribute-equivalence blocking of the two tables on 'Type'.
type_blocker = em.AttrEquivalenceBlocker()
type_blocker_result = type_blocker.block_tables(
    table_a, table_b, l_block_attr='Type', r_block_attr='Type')
print(type_blocker_result)

# Stage 2: overlap blocking of the stage-1 candidates on 'Genres'.
overlap_blocker = em.OverlapBlocker()
overlap_blocker_result = overlap_blocker.block_candset(
    type_blocker_result, 'Genres', 'Genres', overlap_size=4)
print(overlap_blocker_result)

# Stage 3: overlap blocking of the stage-2 candidates on 'Year', with
# word-level tokenization switched off and q_val=4.
year_blocker = em.OverlapBlocker()
year_blocker_result = year_blocker.block_candset(
    overlap_blocker_result, 'Year', 'Year',
    q_val=4, word_level=False, overlap_size=4)
print(year_blocker_result)

# Persist the final candidate set.
year_blocker_result.to_csv('candidate_set.csv')

#max = overlap_blocker_result.max()
#print '%s, %s, %s' % (max['_id'], max['ltable_ID'], max['rtable_ID'])