Example #1
    def get_rltk_block(self) -> typing.Optional[rltk.Block]:
        prime_key_l = []
        prime_key_r = []
        str_key_l = []
        str_key_r = []
        for f1, f2 in self.pairs:
            if f1.data_type == DataType.STRING:
                # 2019.4.10: TOKEN_CATEGORICAL should also be considered here
                if f1.distribute_type == DistributeType.CATEGORICAL or f1.distribute_type == DistributeType.TOKEN_CATEGORICAL:
                    prime_key_l.append(f1.name)
                    prime_key_r.append(f2.name)
                elif f1.distribute_type == DistributeType.NON_CATEGORICAL:
                    str_key_l.append(f1.name)
                    str_key_r.append(f2.name)

        if prime_key_l and prime_key_r:
            try:
                bg = rltk.HashBlockGenerator()
                block = bg.generate(
                    bg.block(self.left_rltk_dataset,
                             function_=lambda r: ''.join([
                                 str(getattr(r, pk)).lower()
                                 for pk in prime_key_l
                             ])),
                    bg.block(self.right_rltk_dataset,
                             function_=lambda r: ''.join([
                                 str(getattr(r, pk)).lower()
                                 for pk in prime_key_r
                             ])))
                return block
            except Exception as e:
                print(' - BLOCKING EXCEPTION: %s' % str(e))
                raise ValueError("failed to get blocking!")

        # if the cross product of the two datasets is large, block on the
        # concatenated, lowercased string keys instead of comparing all pairs
        if str_key_l and str_key_r and len(self._left_df) * len(self._right_df) > 10000:
            try:
                bg = rltk.HashBlockGenerator()
                block = bg.generate(
                    # original: str(getattr(r, pk))[0]
                    bg.block(
                        self.left_rltk_dataset,
                        function_=lambda r: ''.join(
                            [str(getattr(r, pk)).lower()
                             for pk in str_key_l])),
                    bg.block(
                        self.right_rltk_dataset,
                        function_=lambda r: ''.join(
                            [str(getattr(r, pk)).lower()
                             for pk in str_key_r])))
                return block
            except Exception as e:
                print(' - BLOCKING EXCEPTION: %s' % str(e))
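The returned block would typically feed rltk.get_record_pairs, as the later examples do; a minimal usage sketch, where `matcher` is a hypothetical instance of the class this method belongs to:

block = matcher.get_rltk_block()
if block is not None:
    for r1, r2 in rltk.get_record_pairs(matcher.left_rltk_dataset,
                                        matcher.right_rltk_dataset,
                                        block=block):
        pass  # score the candidate pair here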

Example #2

    def find_pair(
        self, left_df: pd.DataFrame, right_df: pd.DataFrame
    ) -> typing.Tuple[pd.DataFrame, typing.List[tuple]]:
        class left(rltk.AutoGeneratedRecord):
            pass

        class right(rltk.AutoGeneratedRecord):
            pass

        left_df['id'] = left_df.index.astype(str)
        right_df['id'] = right_df.index.astype(str)
        if 'Unnamed: 0' in right_df.columns:
            right_df = right_df.drop(columns=['Unnamed: 0'])
        ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
        ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)

        bg = rltk.HashBlockGenerator()
        block = bg.generate(
            bg.block(ds1, property_=self.join_target_column_names[0]),
            bg.block(ds2, property_=self.join_target_column_names[1]))

        pairs = rltk.get_record_pairs(ds1, ds2, block=block)

        pairs_list = []
        pairs_column = [[] for _ in range(right_df.shape[0])]
        for r1, r2 in pairs:
            pairs_column[int(r2.id)].append(int(r1.id))
            pairs_list.append((r1.id, r2.id))

        right_df["joining_pairs"] = pairs_column
        return right_df, pairs_list
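A minimal usage sketch; `matcher` is again a hypothetical instance of the enclosing class:

right_with_pairs, pairs_list = matcher.find_pair(left_df, right_df)
# each entry of right_with_pairs['joining_pairs'] lists the left-row ids
# that share a blocking key with that right row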

Example #3

def create_hash_blocks(dataset_1: rltk.Dataset,
                       dataset_2: rltk.Dataset) -> rltk.Block:
    ''' Create and return rltk hash blocks '''
    # ##################################################
    # ** STUDENT CODE. Task 2.2 (part 1)
    # TODO: Implement create_hash_blocks.
    #       Your code should implement a hash blocking method using rltk.HashBlockGenerator().
    #       The hashing property should be the attribute 'title_first_2_letters'.
    #       Your implementation should be inside this ####### block
    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(dataset_1, property_='title_first_2_letters'),
                        bg.block(dataset_2, property_='title_first_2_letters'))

    return block
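The hashing property 'title_first_2_letters' must exist on the record classes; a minimal sketch, assuming each raw record carries a 'title' field (the record class name here is hypothetical):

class PaperRecord(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def title_first_2_letters(self):
        # blocking key: the first two characters of the title, lowercased
        return self.raw_object['title'][:2].lower()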

Example #4

    def join(self, left_df: pd.DataFrame, right_df: pd.DataFrame,
             left_columns: typing.List[typing.List[int]],
             right_columns: typing.List[typing.List[int]], left_metadata: dict,
             right_metadata: dict) -> JoinResult:
        class left(rltk.AutoGeneratedRecord):
            pass

        class right(rltk.AutoGeneratedRecord):
            pass

        left_df['id'] = left_df.index.astype(str)
        right_df['id'] = right_df.index.astype(str)
        if 'Unnamed: 0' in right_df.columns:
            right_df = right_df.drop(columns=['Unnamed: 0'])
        ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
        ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)

        bg = rltk.HashBlockGenerator()
        block = bg.generate(
            bg.block(ds1, property_=self.join_target_column_names[0]),
            bg.block(ds2, property_=self.join_target_column_names[1]))
        left_df = left_df.set_index('id')
        right_df = right_df.set_index('id')

        pairs = rltk.get_record_pairs(ds1, ds2, block=block)

        df_joined = pd.DataFrame()

        # initialized from the first pair; stay empty if blocking yields no pairs
        column_names_to_join = None
        matched_rows = pd.Index([])
        columns_new = []
        for r1, r2 in pairs:
            left_res = left_df.loc[r1.id]
            right_res = right_df.loc[r2.id]
            if column_names_to_join is None:
                column_names_to_join = right_res.index.difference(
                    left_res.index)
                matched_rows = right_res.index.intersection(left_res.index)
                columns_new = left_res.index.tolist()
                columns_new.extend(column_names_to_join.tolist())
            new = pd.concat([left_res, right_res[column_names_to_join]])
            # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead
            df_joined = pd.concat([df_joined, new.to_frame().T], ignore_index=True)
        # ensure that the original dataframe columns are at the first left part
        df_joined = df_joined[columns_new]

        return JoinResult(df_joined, matched_rows)
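For illustration, how the column split above behaves on toy rows (plain pandas, independent of rltk):

import pandas as pd

left_res = pd.Series({'id': '0', 'name': 'alice'})
right_res = pd.Series({'id': '0', 'price': 10})
right_res.index.difference(left_res.index)    # Index(['price'], dtype='object')
right_res.index.intersection(left_res.index)  # Index(['id'], dtype='object')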
Example #5
def create_block_reader(ds1: rltk.Dataset, ds2: rltk.Dataset):
    chunkNum = {}

    def blockingFunc(r):
        # blocking key: digit sum of the phone number, mod 11
        chunk = sum(int(x) for x in r.phone if x.isdigit()) % 11
        # debugging counter, left disabled:
        # if chunk in chunkNum:
        #     chunkNum[chunk] += 1
        # else:
        #     chunkNum[chunk] = 1
        return str(chunk)

    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(ds1, function_=blockingFunc),
                        bg.block(ds2, function_=blockingFunc))
    #print(chunkNum)
    return block
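A quick sanity check of the blocking key on a made-up phone number:

phone = '213-555-0100'  # hypothetical value
key = str(sum(int(x) for x in phone if x.isdigit()) % 11)
assert key == '0'  # the digits sum to 22, and 22 % 11 == 0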
Example #6
class Record2(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['ident']

    @rltk.cached_property
    def first_name(self):
        return self.raw_object['name'].split(' ')[0]

    @rltk.cached_property
    def last_name(self):
        return self.raw_object['name'].split(' ')[1]


def blocking_function(r):
    # block on the first letter of the first name
    return r.first_name[:1]


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1)
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'), record_class=Record2)

bg = rltk.HashBlockGenerator()
block = bg.generate(bg.block(ds1, function_=blocking_function),
                    bg.block(ds2, function_=blocking_function))

pairs = rltk.get_record_pairs(ds1, ds2, block=block)
for r1, r2 in pairs:
    print(r1.id, r1.first_name, '\t', r2.id, r2.first_name)

Example #7

import json
from collections import defaultdict

import pandas as pd
import rltk


def main():
    with open("dblp_final_JSON.json", "r") as f:
        dblp_dict = json.load(f)

    professors = set()
    for entry in dblp_dict:  # the JSON file holds a list of author entries
        professors.add(entry['person'])

    #print(professors)
    #print(len(professors))

    coauthor_dict = defaultdict(list)
    for entry in dblp_dict:
        author = entry['person']
        for paper in entry['papers']:
            co_authors = paper['co_authors']
            if author in co_authors:
                co_authors.remove(author)
            if co_authors:
                coauthor_dict[author].extend(co_authors)

    list_of_coauthors = []
    for key in coauthor_dict:
        list_of_coauthors.extend(coauthor_dict[key])
    #print(len(list_of_coauthors))

    ### String / Data Matching for Entity linking using RLTK

    ### Remove duplicates in the coauthor_dict using String Matching
    ### Compare with professors and do entity linking / remove duplicates

    df1 = pd.DataFrame(list(professors), columns=['name'])
    #print(df1)
    df2 = pd.DataFrame(list_of_coauthors, columns=['name'])
    #print(len(df2))
    df1['first_name'] = df1.apply(lambda x: x['name'].split()[0], axis=1)
    df1['last_name'] = df1.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df1['id'] = (df1.index + 1).astype(str)

    #print(df1)
    df2['first_name'] = df2.apply(lambda x: x['name'].split()[0], axis=1)
    df2['last_name'] = df2.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df2['id'] = (df2.index + 1).astype(str)

    ds1 = rltk.Dataset(reader=rltk.DataFrameReader(df1),
                       record_class=Record1,
                       adapter=rltk.MemoryKeyValueAdapter())
    ds2 = rltk.Dataset(reader=rltk.DataFrameReader(df2),
                       record_class=Record2,
                       adapter=rltk.MemoryKeyValueAdapter())
    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(ds1, property_='fname'),
                        bg.block(ds2, property_='fname'))
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)
    num_pairs = 0
    sim_pairs = []
    sim_dict = {}
    for r1, r2 in pairs:
        num_pairs += 1
        sim = rltk.jaro_winkler_similarity(r1.lname, r2.lname)
        if 0.9 < sim < 1:
            sim_pairs.append(
                (r1.fname + ' ' + r1.lname, r2.fname + ' ' + r2.lname))
            sim_dict[r1.fname + ' ' + r1.lname] = r2.fname + ' ' + r2.lname
            #print(r1.lname,r2.lname,sim)
    #print(sim_pairs)
    #print("Blocking using Cuisine - Number of pairs:",num_pairs)
    for key in coauthor_dict:
        lis = coauthor_dict[key]
        for ind in range(len(lis)):
            if lis[ind] in sim_dict:
                lis[ind] = sim_dict[lis[ind]]

    with open("co_authors.json", "w") as jf:
        json.dump(coauthor_dict, jf, indent=2)
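main() assumes record classes exposing 'fname' and 'lname'; a minimal sketch consistent with the DataFrame columns built above (the real definitions are not part of this snippet):

class Record1(rltk.Record):  # hypothetical reconstruction
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def fname(self):
        return self.raw_object['first_name']

    @rltk.cached_property
    def lname(self):
        return self.raw_object['last_name']

# Record2 is defined identically over df2.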