예제 #1
0
def _overlap_coefficient_join_split(
        ltable_list, rtable_list, l_columns, r_columns, l_key_attr, r_key_attr,
        l_join_attr, r_join_attr, tokenizer, threshold, comp_op, allow_empty,
        l_out_attrs, r_out_attrs, l_out_prefix, r_out_prefix, out_sim_score,
        show_progress):
    """Perform overlap coefficient join for a split of ltable and rtable.

    An inverted index is built over the join attribute of ``ltable_list``.
    Every row of ``rtable_list`` is tokenized and probed against that index,
    and each candidate pair is scored as
    ``overlap / min(num_r_tokens, num_l_tokens)``. Pairs whose score
    satisfies ``comp_op`` against ``threshold`` are emitted.

    Args:
        ltable_list: rows of the left table, each indexable by column
            position.
        rtable_list: rows of the right table, each indexable by column
            position.
        l_columns: column names of the left rows; used to resolve positions.
        r_columns: column names of the right rows; used to resolve positions.
        l_key_attr: name of the key column in the left table.
        r_key_attr: name of the key column in the right table.
        l_join_attr: name of the join column in the left table.
        r_join_attr: name of the join column in the right table.
        tokenizer: object exposing ``tokenize(string)``; used for both tables.
        threshold: value the overlap coefficient is compared against.
        comp_op: key into ``COMP_OP_MAP`` selecting the comparison function.
        allow_empty: when true, a right row with no tokens is paired with
            every left row that also has no tokens, with a score of 1.0.
        l_out_attrs: extra left columns to output, or None.
        r_out_attrs: extra right columns to output, or None.
        l_out_prefix: prefix passed on to the output header builder for left
            columns.
        r_out_prefix: prefix passed on to the output header builder for right
            columns.
        out_sim_score: when true, a trailing ``_sim_score`` column is added.
        show_progress: when true, a ``pyprind`` progress bar is advanced once
            per right row.

    Returns:
        A DataFrame with one row per matching pair.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build inverted index over ltable
    inverted_index = InvertedIndex(ltable_list,
                                   l_join_attr_index,
                                   tokenizer,
                                   cache_size_flag=True)
    # While building the index, we cache the record ids with empty set of
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = inverted_index.build(allow_empty)
    l_empty_records = cached_data['empty_records']

    overlap_filter = OverlapFilter(tokenizer, 1)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None
                             or r_out_attrs is not None)

    def _build_output_row(l_row, r_row, sim_score):
        """Assemble one output record for the pair (l_row, r_row)."""
        if has_output_attributes:
            output_row = get_output_row_from_tables(
                l_row, r_row, l_key_attr_index, r_key_attr_index,
                l_out_attrs_indices, r_out_attrs_indices)
        else:
            output_row = [l_row[l_key_attr_index], r_row[r_key_attr_index]]

        # if out_sim_score flag is set, append the overlap coefficient
        # score to the output record.
        if out_sim_score:
            output_row.append(sim_score)
        return output_row

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]

        r_join_attr_tokens = tokenizer.tokenize(r_string)
        r_num_tokens = len(r_join_attr_tokens)

        # If allow_empty flag is set and the current rtable record has empty set
        # of tokens in the join attribute, then generate output pairs joining
        # the current rtable record with those records in ltable with empty set
        # of tokens in the join attribute. These ltable record ids are cached in
        # l_empty_records list which was constructed when building the inverted
        # index.
        if allow_empty and r_num_tokens == 0:
            for l_id in l_empty_records:
                output_rows.append(
                    _build_output_row(ltable_list[l_id], r_row, 1.0))

            # Advance the progress bar before moving on; otherwise records
            # with an empty token set would never be counted.
            if show_progress:
                prog_bar.update()
            continue

        # probe inverted index and find overlap of candidates
        candidate_overlap = overlap_filter.find_candidates(
            r_join_attr_tokens, inverted_index)

        for cand, overlap in iteritems(candidate_overlap):
            # compute the actual similarity score
            sim_score = (
                float(overlap) /
                float(min(r_num_tokens, inverted_index.size_cache[cand])))

            if comp_fn(sim_score, threshold):
                output_rows.append(
                    _build_output_row(ltable_list[cand], r_row, sim_score))

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
예제 #2
0
def sample_pairs(ltable,
                 rtable,
                 l_key_attr,
                 r_key_attr,
                 l_join_attr,
                 r_join_attr,
                 sample_size,
                 y_param,
                 seed,
                 l_out_prefix='l_',
                 r_out_prefix='r_',
                 show_progress=True):
    """Sample (ltable key, rtable key) pairs.

    ``ceil(sample_size / y_param)`` rtable rows are drawn without
    replacement. Each drawn row is paired with up to ``y_param`` ltable rows:
    at most ``ceil(y_param / 2)`` of them are the candidates with the largest
    whitespace-token overlap, and the rest are picked at random. The pairs
    found in ``seed`` are appended at the end.

    Both counts are capped by the table sizes. An rtable smaller than the
    requested number of rows is sampled in full, and an ltable smaller than
    ``y_param`` contributes every row it has instead of looping forever.

    Args:
        ltable: left table; presumably a pandas DataFrame (it is handed to
            ``convert_dataframe_to_array``).
        rtable: right table; same assumption as ``ltable``.
        l_key_attr: name of the key column in ltable.
        r_key_attr: name of the key column in rtable.
        l_join_attr: name of the join column in ltable.
        r_join_attr: name of the join column in rtable.
        sample_size: approximate number of sampled pairs, seed pairs excluded.
        y_param: number of ltable rows paired with each sampled rtable row.
        seed: table of pairs to append; needs ``itertuples(index=False)``,
            and the first two fields of each row are used as the pair.
            This is NOT a random seed; sampling uses the module-level
            ``random`` state.
        l_out_prefix: prefix passed on to the output header builder for the
            ltable key column.
        r_out_prefix: prefix passed on to the output header builder for the
            rtable key column.
        show_progress: when true, a ``pyprind`` progress bar is advanced once
            per sampled rtable row.

    Returns:
        A DataFrame with a leading ``_id`` column (0..n-1) followed by the
        two key columns.
    """
    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(None, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(None, r_key_attr, r_join_attr)

    # convert dataframe to array for faster access
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs,
                                              l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs,
                                              r_join_attr)
    ltable_size = len(ltable_array)
    rtable_size = len(rtable_array)

    # find column indices of key attr and join attr in ltable array
    l_key_attr_index = l_proj_attrs.index(l_key_attr)
    l_join_attr_index = l_proj_attrs.index(l_join_attr)

    # find column indices of key attr and join attr in rtable array
    r_key_attr_index = r_proj_attrs.index(r_key_attr)
    r_join_attr_index = r_proj_attrs.index(r_join_attr)

    # create a whitespace tokenizer to tokenize join attributes
    ws_tok = WhitespaceTokenizer(return_set=True)

    # build inverted index on join attribute in ltable
    inverted_index = InvertedIndex(ltable_array, l_join_attr_index, ws_tok)
    inverted_index.build()

    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more rows than rtable has.
    number_of_r_tuples_to_sample = min(
        int(ceil(float(sample_size) / float(y_param))), rtable_size)
    sample_rtable_indices = random.sample(range(0, rtable_size),
                                          number_of_r_tuples_to_sample)
    cand_pos_ltuples_required = int(ceil(y_param / 2.0))

    # A set of distinct ltable indices can never grow beyond ltable_size;
    # capping the target keeps the random fill loop below from spinning
    # forever on a small ltable.
    ltuples_per_r_tuple = min(y_param, ltable_size)

    overlap_filter = OverlapFilter(ws_tok, 1)

    output_rows = []

    if show_progress:
        prog_bar = pyprind.ProgBar(number_of_r_tuples_to_sample)

    for r_idx in sample_rtable_indices:
        r_row = rtable_array[r_idx]
        r_id = r_row[r_key_attr_index]
        r_join_attr_tokens = ws_tok.tokenize(r_row[r_join_attr_index])

        # probe inverted index and find ltable candidates
        cand_overlap = overlap_filter.find_candidates(r_join_attr_tokens,
                                                      inverted_index)

        # keep the candidates with the largest overlap first
        sampled_ltuples = set()
        for cand in sorted(cand_overlap.items(),
                           key=operator.itemgetter(1),
                           reverse=True):
            if len(sampled_ltuples) == cand_pos_ltuples_required:
                break
            sampled_ltuples.add(cand[0])

        # fill the remaining slots with random ltable rows
        while len(sampled_ltuples) < ltuples_per_r_tuple:
            rand_idx = random.randint(0, ltable_size - 1)
            sampled_ltuples.add(rand_idx)

        for l_idx in sampled_ltuples:
            output_rows.append([ltable_array[l_idx][l_key_attr_index], r_id])

        if show_progress:
            prog_bar.update()

    for seed_pair_row in seed.itertuples(index=False):
        output_rows.append([seed_pair_row[0], seed_pair_row[1]])

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr, None,
                                                  None, l_out_prefix,
                                                  r_out_prefix)

    output_table = pd.DataFrame(output_rows, columns=output_header)

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    return output_table
def _overlap_coefficient_join_split(ltable_list, rtable_list,
                                    l_columns, r_columns,
                                    l_key_attr, r_key_attr,
                                    l_join_attr, r_join_attr,
                                    tokenizer, threshold, comp_op,
                                    allow_empty,
                                    l_out_attrs, r_out_attrs,
                                    l_out_prefix, r_out_prefix,
                                    out_sim_score, show_progress):
    """Perform overlap coefficient join for a split of ltable and rtable.

    An inverted index is built over the join attribute of ``ltable_list``.
    Every row of ``rtable_list`` is tokenized and probed against that index,
    and each candidate pair is scored as
    ``overlap / min(num_r_tokens, num_l_tokens)``. Pairs whose score
    satisfies ``comp_op`` against ``threshold`` are emitted.

    Args:
        ltable_list: rows of the left table, each indexable by column
            position.
        rtable_list: rows of the right table, each indexable by column
            position.
        l_columns: column names of the left rows; used to resolve positions.
        r_columns: column names of the right rows; used to resolve positions.
        l_key_attr: name of the key column in the left table.
        r_key_attr: name of the key column in the right table.
        l_join_attr: name of the join column in the left table.
        r_join_attr: name of the join column in the right table.
        tokenizer: object exposing ``tokenize(string)``; used for both tables.
        threshold: value the overlap coefficient is compared against.
        comp_op: key into ``COMP_OP_MAP`` selecting the comparison function.
        allow_empty: when true, a right row with no tokens is paired with
            every left row that also has no tokens, with a score of 1.0.
        l_out_attrs: extra left columns to output, or None.
        r_out_attrs: extra right columns to output, or None.
        l_out_prefix: prefix passed on to the output header builder for left
            columns.
        r_out_prefix: prefix passed on to the output header builder for right
            columns.
        out_sim_score: when true, a trailing ``_sim_score`` column is added.
        show_progress: when true, a ``pyprind`` progress bar is advanced once
            per right row.

    Returns:
        A DataFrame with one row per matching pair.
    """
    # find column indices of key attr, join attr and output attrs in ltable
    l_key_attr_index = l_columns.index(l_key_attr)
    l_join_attr_index = l_columns.index(l_join_attr)
    l_out_attrs_indices = find_output_attribute_indices(l_columns, l_out_attrs)

    # find column indices of key attr, join attr and output attrs in rtable
    r_key_attr_index = r_columns.index(r_key_attr)
    r_join_attr_index = r_columns.index(r_join_attr)
    r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)

    # Build inverted index over ltable
    inverted_index = InvertedIndex(ltable_list, l_join_attr_index,
                                   tokenizer, cache_size_flag=True)
    # While building the index, we cache the record ids with empty set of
    # tokens. This is needed to handle the allow_empty flag.
    cached_data = inverted_index.build(allow_empty)
    l_empty_records = cached_data['empty_records']

    overlap_filter = OverlapFilter(tokenizer, 1)
    comp_fn = COMP_OP_MAP[comp_op]

    output_rows = []
    has_output_attributes = (l_out_attrs is not None or
                             r_out_attrs is not None)

    def _build_output_row(l_row, r_row, sim_score):
        """Assemble one output record for the pair (l_row, r_row)."""
        if has_output_attributes:
            output_row = get_output_row_from_tables(
                             l_row, r_row,
                             l_key_attr_index, r_key_attr_index,
                             l_out_attrs_indices, r_out_attrs_indices)
        else:
            output_row = [l_row[l_key_attr_index],
                          r_row[r_key_attr_index]]

        # if out_sim_score flag is set, append the overlap coefficient
        # score to the output record.
        if out_sim_score:
            output_row.append(sim_score)
        return output_row

    if show_progress:
        prog_bar = pyprind.ProgBar(len(rtable_list))

    for r_row in rtable_list:
        r_string = r_row[r_join_attr_index]

        r_join_attr_tokens = tokenizer.tokenize(r_string)
        r_num_tokens = len(r_join_attr_tokens)

        # If allow_empty flag is set and the current rtable record has empty set
        # of tokens in the join attribute, then generate output pairs joining
        # the current rtable record with those records in ltable with empty set
        # of tokens in the join attribute. These ltable record ids are cached in
        # l_empty_records list which was constructed when building the inverted
        # index.
        if allow_empty and r_num_tokens == 0:
            for l_id in l_empty_records:
                output_rows.append(
                    _build_output_row(ltable_list[l_id], r_row, 1.0))

            # Advance the progress bar before moving on; otherwise records
            # with an empty token set would never be counted.
            if show_progress:
                prog_bar.update()
            continue

        # probe inverted index and find overlap of candidates
        candidate_overlap = overlap_filter.find_candidates(
                                r_join_attr_tokens, inverted_index)

        for cand, overlap in iteritems(candidate_overlap):
            # compute the actual similarity score
            sim_score = (float(overlap) /
                         float(min(r_num_tokens,
                                   inverted_index.size_cache[cand])))

            if comp_fn(sim_score, threshold):
                output_rows.append(
                    _build_output_row(ltable_list[cand], r_row, sim_score))

        if show_progress:
            prog_bar.update()

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  l_out_attrs, r_out_attrs,
                                                  l_out_prefix, r_out_prefix)
    if out_sim_score:
        output_header.append("_sim_score")

    output_table = pd.DataFrame(output_rows, columns=output_header)
    return output_table
예제 #4
0
def sample_pairs(ltable, rtable, l_key_attr, r_key_attr,
                 l_join_attr, r_join_attr, sample_size, y_param, seed,
                 l_out_prefix='l_', r_out_prefix='r_', show_progress=True):
    """Sample (ltable key, rtable key) pairs.

    ``ceil(sample_size / y_param)`` rtable rows are drawn without
    replacement. Each drawn row is paired with up to ``y_param`` ltable rows:
    at most ``ceil(y_param / 2)`` of them are the candidates with the largest
    whitespace-token overlap, and the rest are picked at random. The pairs
    found in ``seed`` are appended at the end.

    Both counts are capped by the table sizes. An rtable smaller than the
    requested number of rows is sampled in full, and an ltable smaller than
    ``y_param`` contributes every row it has instead of looping forever.

    Args:
        ltable: left table; presumably a pandas DataFrame (it is handed to
            ``convert_dataframe_to_array``).
        rtable: right table; same assumption as ``ltable``.
        l_key_attr: name of the key column in ltable.
        r_key_attr: name of the key column in rtable.
        l_join_attr: name of the join column in ltable.
        r_join_attr: name of the join column in rtable.
        sample_size: approximate number of sampled pairs, seed pairs excluded.
        y_param: number of ltable rows paired with each sampled rtable row.
        seed: table of pairs to append; needs ``itertuples(index=False)``,
            and the first two fields of each row are used as the pair.
            This is NOT a random seed; sampling uses the module-level
            ``random`` state.
        l_out_prefix: prefix passed on to the output header builder for the
            ltable key column.
        r_out_prefix: prefix passed on to the output header builder for the
            rtable key column.
        show_progress: when true, a ``pyprind`` progress bar is advanced once
            per sampled rtable row.

    Returns:
        A DataFrame with a leading ``_id`` column (0..n-1) followed by the
        two key columns.
    """
    # get attributes to project.
    l_proj_attrs = get_attrs_to_project(None, l_key_attr, l_join_attr)
    r_proj_attrs = get_attrs_to_project(None, r_key_attr, r_join_attr)

    # convert dataframe to array for faster access
    ltable_array = convert_dataframe_to_array(ltable, l_proj_attrs, l_join_attr)
    rtable_array = convert_dataframe_to_array(rtable, r_proj_attrs, r_join_attr)
    ltable_size = len(ltable_array)
    rtable_size = len(rtable_array)

    # find column indices of key attr and join attr in ltable array
    l_key_attr_index = l_proj_attrs.index(l_key_attr)
    l_join_attr_index = l_proj_attrs.index(l_join_attr)

    # find column indices of key attr and join attr in rtable array
    r_key_attr_index = r_proj_attrs.index(r_key_attr)
    r_join_attr_index = r_proj_attrs.index(r_join_attr)

    # create a whitespace tokenizer to tokenize join attributes
    ws_tok = WhitespaceTokenizer(return_set=True)

    # build inverted index on join attribute in ltable
    inverted_index = InvertedIndex(ltable_array, l_join_attr_index, ws_tok)
    inverted_index.build()

    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more rows than rtable has.
    number_of_r_tuples_to_sample = min(
        int(ceil(float(sample_size) / float(y_param))), rtable_size)
    sample_rtable_indices = random.sample(range(0, rtable_size),
                                          number_of_r_tuples_to_sample)
    cand_pos_ltuples_required = int(ceil(y_param / 2.0))

    # A set of distinct ltable indices can never grow beyond ltable_size;
    # capping the target keeps the random fill loop below from spinning
    # forever on a small ltable.
    ltuples_per_r_tuple = min(y_param, ltable_size)

    overlap_filter = OverlapFilter(ws_tok, 1)

    output_rows = []

    if show_progress:
        prog_bar = pyprind.ProgBar(number_of_r_tuples_to_sample)

    for r_idx in sample_rtable_indices:
        r_row = rtable_array[r_idx]
        r_id = r_row[r_key_attr_index]
        r_join_attr_tokens = ws_tok.tokenize(r_row[r_join_attr_index])

        # probe inverted index and find ltable candidates
        cand_overlap = overlap_filter.find_candidates(
                           r_join_attr_tokens, inverted_index)

        # keep the candidates with the largest overlap first
        sampled_ltuples = set()
        for cand in sorted(cand_overlap.items(), key=operator.itemgetter(1),
                           reverse=True):
            if len(sampled_ltuples) == cand_pos_ltuples_required:
                break
            sampled_ltuples.add(cand[0])

        # fill the remaining slots with random ltable rows
        while len(sampled_ltuples) < ltuples_per_r_tuple:
            rand_idx = random.randint(0, ltable_size - 1)
            sampled_ltuples.add(rand_idx)

        for l_idx in sampled_ltuples:
            output_rows.append([ltable_array[l_idx][l_key_attr_index], r_id])

        if show_progress:
            prog_bar.update()

    for seed_pair_row in seed.itertuples(index=False):
        output_rows.append([seed_pair_row[0], seed_pair_row[1]])

    output_header = get_output_header_from_tables(l_key_attr, r_key_attr,
                                                  None, None,
                                                  l_out_prefix, r_out_prefix)

    output_table = pd.DataFrame(output_rows, columns=output_header)

    # add an id column named '_id' to the output table.
    output_table.insert(0, '_id', range(0, len(output_table)))

    return output_table