def get_rltk_block(self) -> typing.Optional[rltk.Block]:
    """Build an rltk hash block from the matched column pairs, preferring
    categorical string columns as blocking keys."""
    prime_key_l, prime_key_r = [], []
    str_key_l, str_key_r = [], []
    for f1, f2 in self.pairs:
        if f1.data_type == DataType.STRING:
            # 2019.4.10: TOKEN_CATEGORICAL should also be considered here
            if f1.distribute_type in (DistributeType.CATEGORICAL,
                                      DistributeType.TOKEN_CATEGORICAL):
                prime_key_l.append(f1.name)
                prime_key_r.append(f2.name)
            elif f1.distribute_type == DistributeType.NON_CATEGORICAL:
                str_key_l.append(f1.name)
                str_key_r.append(f2.name)

    if prime_key_l and prime_key_r:
        try:
            bg = rltk.HashBlockGenerator()
            block = bg.generate(
                bg.block(self.left_rltk_dataset,
                         function_=lambda r: ''.join(
                             [str(getattr(r, pk)).lower() for pk in prime_key_l])),
                bg.block(self.right_rltk_dataset,
                         function_=lambda r: ''.join(
                             [str(getattr(r, pk)).lower() for pk in prime_key_r])))
            return block
        except Exception as e:
            print(' - BLOCKING EXCEPTION: %s' % str(e))
            raise ValueError("failed to get blocking!")

    # if the datasets are too large, block on the concatenated string keys
    if str_key_l and str_key_r and len(self._left_df) * len(self._right_df) > 10000:
        try:
            bg = rltk.HashBlockGenerator()
            block = bg.generate(
                # original: str(getattr(r, pk))[0]
                bg.block(self.left_rltk_dataset,
                         function_=lambda r: ''.join(
                             [str(getattr(r, pk)).lower() for pk in str_key_l])),
                bg.block(self.right_rltk_dataset,
                         function_=lambda r: ''.join(
                             [str(getattr(r, pk)).lower() for pk in str_key_r])))
            return block
        except Exception as e:
            print(' - BLOCKING EXCEPTION: %s' % str(e))
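# A minimal usage sketch for get_rltk_block above; `matcher` is a hypothetical
# instance of the class that defines the method (not given in the source).
# Note that get_rltk_block can return None when no branch produces a block:
def candidate_pairs(matcher):
    block = matcher.get_rltk_block()
    if block is None:
        # no usable blocking key; fall back to the full cross product
        return rltk.get_record_pairs(matcher.left_rltk_dataset,
                                     matcher.right_rltk_dataset)
    return rltk.get_record_pairs(matcher.left_rltk_dataset,
                                 matcher.right_rltk_dataset,
                                 block=block)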
def find_pair(
        self, left_df: pd.DataFrame, right_df: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, typing.List[tuple]]:
    class left(rltk.AutoGeneratedRecord):
        pass

    class right(rltk.AutoGeneratedRecord):
        pass

    left_df['id'] = left_df.index.astype(str)
    right_df['id'] = right_df.index.astype(str)
    if 'Unnamed: 0' in right_df.columns:
        right_df = right_df.drop(columns=['Unnamed: 0'])

    ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
    ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)

    bg = rltk.HashBlockGenerator()
    block = bg.generate(
        bg.block(ds1, property_=self.join_target_column_names[0]),
        bg.block(ds2, property_=self.join_target_column_names[1]))

    pairs = rltk.get_record_pairs(ds1, ds2, block=block)
    pairs_list = []
    pairs_column = [[] for _ in range(right_df.shape[0])]
    for r1, r2 in pairs:
        pairs_column[int(r2.id)].append(int(r1.id))
        pairs_list.append((r1.id, r2.id))
    right_df["joining_pairs"] = pairs_column
    return right_df, pairs_list
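# A self-contained illustration of the property-based hash blocking used in
# find_pair above; the toy DataFrames and the 'city' column are invented here:
import pandas as pd
import rltk

class LeftRec(rltk.AutoGeneratedRecord):
    pass

class RightRec(rltk.AutoGeneratedRecord):
    pass

ldf = pd.DataFrame({'id': ['0', '1'], 'city': ['LA', 'NY']})
rdf = pd.DataFrame({'id': ['0', '1'], 'city': ['NY', 'SF']})
ds_l = rltk.Dataset(rltk.DataFrameReader(ldf), record_class=LeftRec)
ds_r = rltk.Dataset(rltk.DataFrameReader(rdf), record_class=RightRec)

bg = rltk.HashBlockGenerator()
block = bg.generate(bg.block(ds_l, property_='city'),
                    bg.block(ds_r, property_='city'))
for r1, r2 in rltk.get_record_pairs(ds_l, ds_r, block=block):
    print(r1.id, r2.id)  # only the pair sharing a city hash: ('1', '0')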
def create_hash_blocks(dataset_1: rltk.Dataset,
                       dataset_2: rltk.Dataset) -> rltk.Block:
    '''
    Create and return rltk hash blocks
    '''
    # ##################################################
    # ** STUDENT CODE. Task 2.2 (part 1)
    # TODO: Implement create_hash_blocks.
    # Your code should implement a hash blocking method using rltk.HashBlockGenerator().
    # The hashing property should be the attribute 'title_first_2_letters'.
    # Your implementation should be inside this ####### block
    bg = rltk.HashBlockGenerator()
    block = bg.generate(
        bg.block(dataset_1, property_='title_first_2_letters'),
        bg.block(dataset_2, property_='title_first_2_letters'))
    return block
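# A hedged sketch of a record class exposing the 'title_first_2_letters'
# attribute that create_hash_blocks expects; the 'id'/'title' field names
# are assumptions, not given in the task above:
import rltk

class TitleRecord(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def title_first_2_letters(self):
        # normalize case so 'The'/'the' land in the same block
        return self.raw_object['title'][:2].lower()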
def join(self, left_df: pd.DataFrame, right_df: pd.DataFrame,
         left_columns: typing.List[typing.List[int]],
         right_columns: typing.List[typing.List[int]],
         left_metadata: dict, right_metadata: dict) -> JoinResult:
    class left(rltk.AutoGeneratedRecord):
        pass

    class right(rltk.AutoGeneratedRecord):
        pass

    left_df['id'] = left_df.index.astype(str)
    right_df['id'] = right_df.index.astype(str)
    if 'Unnamed: 0' in right_df.columns:
        right_df = right_df.drop(columns=['Unnamed: 0'])

    ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
    ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)

    bg = rltk.HashBlockGenerator()
    block = bg.generate(
        bg.block(ds1, property_=self.join_target_column_names[0]),
        bg.block(ds2, property_=self.join_target_column_names[1]))

    left_df = left_df.set_index('id')
    right_df = right_df.set_index('id')

    pairs = rltk.get_record_pairs(ds1, ds2, block=block)
    rows = []
    columns_new = []
    column_names_to_join = None
    matched_rows = None
    for r1, r2 in pairs:
        left_res = left_df.loc[r1.id]
        right_res = right_df.loc[r2.id]
        if column_names_to_join is None:
            # right-only columns get appended; shared columns are reported as matched
            column_names_to_join = right_res.index.difference(left_res.index)
            matched_rows = right_res.index.intersection(left_res.index)
            columns_new = left_res.index.tolist()
            columns_new.extend(column_names_to_join.tolist())
        rows.append(pd.concat([left_res, right_res[column_names_to_join]]))

    # DataFrame.append was removed in pandas 2.0, so build the frame in one go
    df_joined = pd.DataFrame(rows)
    if rows:
        # ensure that the original dataframe columns are the leftmost part
        df_joined = df_joined[columns_new].reset_index(drop=True)
    return JoinResult(df_joined, matched_rows)
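# A tiny, self-contained illustration of the column bookkeeping in join():
# right-only columns are appended to the left row, shared columns are the
# ones reported as matched (the column names here are invented):
import pandas as pd

left_res = pd.Series({'id': '0', 'name': 'a', 'year': 1999})
right_res = pd.Series({'id': '0', 'name': 'a', 'genre': 'x'})
print(right_res.index.difference(left_res.index))    # Index(['genre'], dtype='object')
print(right_res.index.intersection(left_res.index))  # Index(['id', 'name'], dtype='object')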
def create_block_reader(ds1: rltk.Dataset, ds2: rltk.Dataset):
    chunk_counts = {}

    def blocking_func(r):
        # hash each record into one of 11 chunks by the digit sum of its phone number
        chunk = sum(int(x) for x in r.phone if x.isdigit()) % 11
        # if chunk in chunk_counts:
        #     chunk_counts[chunk] += 1
        # else:
        #     chunk_counts[chunk] = 1
        return str(chunk)

    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(ds1, function_=blocking_func),
                        bg.block(ds2, function_=blocking_func))
    # print(chunk_counts)
    return block
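# Possible driver for create_block_reader; ds1/ds2 are assumed to be
# rltk.Datasets whose record classes expose a string `phone` attribute:
block = create_block_reader(ds1, ds2)
for r1, r2 in rltk.get_record_pairs(ds1, ds2, block=block):
    pass  # compare the candidate pair here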
class Record2(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['ident']

    @rltk.cached_property
    def first_name(self):
        return self.raw_object['name'].split(' ')[0]

    @rltk.cached_property
    def last_name(self):
        return self.raw_object['name'].split(' ')[1]

def blocking_function(r):
    # block on the first letter of the first name
    return r.first_name[:1]

# Record1 (for ds1.csv) is assumed to be defined elsewhere, analogously to Record2
ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1)
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2)

bg = rltk.HashBlockGenerator()
block = bg.generate(bg.block(ds1, function_=blocking_function),
                    bg.block(ds2, function_=blocking_function))

pairs = rltk.get_record_pairs(ds1, ds2, block=block)
for r1, r2 in pairs:
    print(r1.id, r1.first_name, '\t', r2.id, r2.first_name)
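# For completeness, one hypothetical Record1 for ds1.csv, mirroring Record2;
# it would need to precede the Dataset construction above, and the
# 'id'/'first name'/'last name' column names are invented:
class Record1(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def first_name(self):
        return self.raw_object['first name']

    @rltk.cached_property
    def last_name(self):
        return self.raw_object['last name']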
def main():
    with open("dblp_final_JSON.json", "r") as f:
        dblp_records = json.load(f)  # a list of {'person': ..., 'papers': [...]} objects

    professors = set()
    for entry in dblp_records:
        professors.add(entry['person'])

    # collect each professor's co-authors (excluding the professor themselves)
    coauthor_dict = defaultdict(list)
    for entry in dblp_records:
        author = entry['person']
        for paper in entry['papers']:
            co_authors = paper['co_authors']
            if author in co_authors:
                co_authors.remove(author)
            if co_authors:
                coauthor_dict[author].extend(co_authors)

    list_of_coauthors = []
    for key in coauthor_dict:
        list_of_coauthors.extend(coauthor_dict[key])

    # String matching for entity linking using RLTK: compare co-author names
    # against professor names and deduplicate near-matches
    df1 = pd.DataFrame(list(professors), columns=['name'])
    df2 = pd.DataFrame(list_of_coauthors, columns=['name'])
    for df in (df1, df2):
        df['first_name'] = df.apply(lambda x: x['name'].split()[0], axis=1)
        df['last_name'] = df.apply(lambda x: ' '.join(x['name'].split()[1:]), axis=1)
        df['id'] = (df.index + 1).astype(str)

    # Record1/Record2 (defined elsewhere) are assumed to expose fname/lname
    # properties built from the first_name/last_name columns
    ds1 = rltk.Dataset(reader=rltk.DataFrameReader(df1),
                       record_class=Record1,
                       adapter=rltk.MemoryKeyValueAdapter())
    ds2 = rltk.Dataset(reader=rltk.DataFrameReader(df2),
                       record_class=Record2,
                       adapter=rltk.MemoryKeyValueAdapter())

    # block on first name, then compare last names within each block
    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(ds1, property_='fname'),
                        bg.block(ds2, property_='fname'))
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)

    num_pairs = 0
    sim_pairs = []
    sim_dict = {}
    for r1, r2 in pairs:
        num_pairs += 1
        sim = rltk.jaro_winkler_similarity(r1.lname, r2.lname)
        if 0.9 < sim < 1:
            sim_pairs.append((r1.fname + ' ' + r1.lname,
                              r2.fname + ' ' + r2.lname))
            sim_dict[r1.fname + ' ' + r1.lname] = r2.fname + ' ' + r2.lname

    # replace co-author names found in sim_dict with their matched variants
    for key in coauthor_dict:
        lis = coauthor_dict[key]
        for ind in range(len(lis)):
            if lis[ind] in sim_dict:
                lis[ind] = sim_dict[lis[ind]]

    with open("co_authors.json", "w") as jf:
        json.dump(coauthor_dict, jf, indent=2)
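# The imports main() relies on, plus a standard entry-point guard, gathered
# here for completeness (they would sit at the top of the script):
import json
from collections import defaultdict

import pandas as pd
import rltk

if __name__ == '__main__':
    main()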