def worker():
    """Link every IMDB record to its best-matching AFI record.

    For each IMDB record, scans all AFI records and keeps the candidate
    with the highest confidence above MY_TRESH according to
    rule_based_method.  Writes the resulting match list (one entry per
    IMDB record, with ``afi_movie`` set to None when nothing clears the
    threshold) to ``result_file`` as JSON.
    """
    # Load both datasets into memory-backed rltk datasets.
    ds_imdb = rltk.Dataset(reader=rltk.JsonLinesReader(imdb_file),
                           record_class=IMDBRecord,
                           adapter=rltk.MemoryKeyValueAdapter())
    ds_afi = rltk.Dataset(reader=rltk.JsonLinesReader(afi_file),
                          record_class=AFIRecord,
                          adapter=rltk.MemoryKeyValueAdapter())
    # NOTE: the original built an rltk.CrfTokenizer() here but never
    # used it; the dead local has been removed.

    valid_match = []
    for r_imdb in ds_imdb:
        # Best AFI candidate so far as (record, confidence); seeding the
        # confidence with MY_TRESH makes the threshold a strict minimum.
        optimum = (None, MY_TRESH)
        for r_afi in ds_afi:
            result, confidence = rule_based_method(r_imdb, r_afi)
            if result and confidence > optimum[1]:
                optimum = (r_afi, confidence)

        best_afi = optimum[0]
        valid_match.append({
            'imdb_movie': r_imdb.raw_object['url'],
            'afi_movie': best_afi.raw_object['url'] if best_afi is not None else None,
        })

    # Context manager guarantees the handle is closed even if
    # serialization fails (original open/write/close leaked on error);
    # json.dump streams straight to the file.
    with open(result_file, 'w') as fout:
        json.dump(valid_match, fout, indent=4)
return set(self.genre_string.split(',')) @rltk.cached_property def year(self): if re.search("(\d{4})", self.date_string): return str(re.search("(\d{4})", self.date_string).group(0)) else: return '' imdb_file = 'imdb.jl' afi_file = 'afi.jl' ds_imdb = rltk.Dataset(reader=rltk.JsonLinesReader(imdb_file), record_class=IMDBRecord, adapter=rltk.MemoryKeyValueAdapter()) ds_afi = rltk.Dataset(reader=rltk.JsonLinesReader(afi_file), record_class=AFIRecord, adapter=rltk.MemoryKeyValueAdapter()) def name_similarity(r_imdb, r_afi): s1 = r_imdb.name_string s2 = r_afi.name_string return rltk.jaro_winkler_similarity(s1, s2) def genre_similarity(r_imdb, r_afi): s1 = r_imdb.genre_set s2 = r_afi.genre_set return rltk.jaccard_index_similarity(s1, s2)
def create_dataset(input_file: str, rcrd_class: rltk.Record) -> rltk.Dataset:
    """Create an in-memory rltk dataset from a JSON-Lines (.jl) file.

    Args:
        input_file: path to a file whose suffix must be ``.jl``.
        rcrd_class: rltk.Record subclass used to wrap each input line.

    Returns:
        An rltk.Dataset backed by a MemoryKeyValueAdapter.

    Raises:
        ValueError: if ``input_file`` does not end in ``.jl``.
    """
    # Explicit raise instead of `assert`: asserts are stripped under
    # `python -O`, which would silently skip this validation.
    if Path(input_file).suffix != ".jl":
        raise ValueError(f"expected a .jl file, got: {input_file}")
    return rltk.Dataset(reader=rltk.JsonLinesReader(input_file),
                        record_class=rcrd_class,
                        adapter=rltk.MemoryKeyValueAdapter())
def main():
    """Build a de-duplicated co-author map from the DBLP dump.

    Reads ``dblp_final_JSON.json`` (a list of {person, papers} objects),
    collects each professor's co-authors, then uses RLTK name matching to
    canonicalize near-duplicate co-author names against the professor
    list, and writes the result to ``co_authors.json``.
    """
    with open("dblp_final_JSON.json", "r") as f:
        dblp_dict = json.load(f)

    # Distinct professor names (one entry per record in the dump).
    professors = set()
    for key in dblp_dict:
        professors.add(key['person'])
    #print(professors)
    #print(len(professors))

    # professor name -> flat list of co-author names across all papers,
    # with the professor removed from their own co-author lists.
    # NOTE: `co_authors.remove(author)` mutates the paper dicts loaded
    # from JSON in place.
    coauthor_dict = defaultdict(list)
    for key in dblp_dict:
        author = key['person']
        for items in key['papers']:
            co_authors = items['co_authors']
            if author in co_authors:
                co_authors.remove(author)
            if co_authors:
                coauthor_dict[author].extend(co_authors)

    # All co-author names, flattened (duplicates intentionally kept so
    # every occurrence can be canonicalized later).
    list_of_coauthors = []
    for key in coauthor_dict:
        list_of_coauthors.extend(coauthor_dict[key])
    #print(len(list_of_coauthors))

    ### String / Data Matching for Entity linking using RLTK
    ### Remove duplicates in the coauthor_dict using String Matching
    ### Compare with professors and do entity linking / remove duplicates
    # Split each full name into first token (first_name) and the rest
    # (last_name); add a string id column for RLTK record construction.
    # NOTE(review): an empty name string would make split()[0] raise —
    # presumably the dump never contains one; confirm upstream.
    df1 = pd.DataFrame(list(professors), columns=['name'])
    #print(df1)
    df2 = pd.DataFrame(list_of_coauthors, columns=['name'])
    #print(len(df2))
    df1['first_name'] = df1.apply(lambda x: x['name'].split()[0], axis=1)
    df1['last_name'] = df1.apply(lambda x: ' '.join(x['name'].split()[1:]), axis=1)
    df1['id'] = (df1.index + 1).astype(str)
    #print(df1)
    df2['first_name'] = df2.apply(lambda x: x['name'].split()[0], axis=1)
    df2['last_name'] = df2.apply(lambda x: ' '.join(x['name'].split()[1:]), axis=1)
    df2['id'] = (df2.index + 1).astype(str)

    # Record1/Record2 are defined elsewhere in this file; they appear to
    # expose the DataFrame columns as `fname`/`lname` properties — verify
    # against their definitions.
    ds1 = rltk.Dataset(reader=rltk.DataFrameReader(df1),
                       record_class=Record1,
                       adapter=rltk.MemoryKeyValueAdapter())
    ds2 = rltk.Dataset(reader=rltk.DataFrameReader(df2),
                       record_class=Record2,
                       adapter=rltk.MemoryKeyValueAdapter())

    # Block on first name so only same-first-name pairs are compared.
    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(ds1, property_='fname'),
                        bg.block(ds2, property_='fname'))
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)

    num_pairs = 0
    sim_pairs = []
    # co-author full name -> canonical professor full name, for pairs
    # whose last names are very similar but not identical (exact matches
    # need no rewriting; the 0.9 floor filters unrelated names).
    sim_dict = {}
    for r1, r2 in pairs:
        num_pairs += 1
        sim = rltk.jaro_winkler_similarity(r1.lname, r2.lname)
        if 0.9 < sim < 1:
            sim_pairs.append(
                (r1.fname + ' ' + r1.lname, r2.fname + ' ' + r2.lname))
            sim_dict[r1.fname + ' ' + r1.lname] = r2.fname + ' ' + r2.lname
            #print(r1.lname,r2.lname,sim)
    #print(sim_pairs)
    #print("Blocking using Cuisine - Number of pairs:",num_pairs)

    # Rewrite every co-author occurrence to its canonical spelling.
    # NOTE(review): sim_dict maps r1 (professor-side) names to r2
    # (co-author-side) names, so the substitution direction depends on
    # which dataset feeds r1 — confirm Record1/Record2 wiring.
    for key in coauthor_dict:
        lis = coauthor_dict[key]
        for ind in range(len(lis)):
            if lis[ind] in sim_dict:
                lis[ind] = sim_dict[lis[ind]]

    with open("co_authors.json", "w") as jf:
        json.dump(coauthor_dict, jf, indent=2)