def cluster_sets_from_marktables():
    '''
    Build the initial cluster sets from the 100 and 700 mark tables.

    Every (table, ref, rec) triple found for the valid records is put in
    its own single-element cluster, and the clusters are grouped into one
    Cluster_set per last name cluster string.

    @return: list of Cluster_set objects; each has `clusters` (a list of
        one-element Cluster_set.Cluster objects) and `last_name` set.
    '''
    ref100 = get_bib10x()
    ref700 = get_bib70x()

    # { (100, 123) -> name }
    bibref_2_name = {}
    for table, refs in ((100, ref100), (700, ref700)):
        for ref, name in refs:
            bibref_2_name[(table, ref)] = generate_last_name_cluster_str(name)

    all_recs = get_all_valid_bibrecs()

    # get_bibrefrec_subset is unpacked as (rec, ref) pairs; the sets drop
    # duplicate triples within each table before the two are chained.
    all_bibrefrecs = chain(
        set((100, ref, rec) for rec, ref in
            get_bibrefrec_subset(100, all_recs, map(itemgetter(0), ref100))),
        set((700, ref, rec) for rec, ref in
            get_bibrefrec_subset(700, all_recs, map(itemgetter(0), ref700))))

    # { last name cluster string -> [(table, ref, rec), ...] }
    last_name_2_bibs = {}
    for bibrefrec in all_bibrefrecs:
        table, ref, unused = bibrefrec
        name = bibref_2_name[(table, ref)]
        # Append in place: rebuilding the list with `+` on every insertion
        # copies it each time and is quadratic in the size of the group.
        last_name_2_bibs.setdefault(name, []).append(bibrefrec)

    cluster_sets = []
    for name, bibrecrefs in last_name_2_bibs.items():
        new_cluster_set = Cluster_set()
        new_cluster_set.clusters = [Cluster_set.Cluster([bib]) for bib in bibrecrefs]
        new_cluster_set.last_name = name
        cluster_sets.append(new_cluster_set)

    return cluster_sets
def tortoise_last_name(name, pure=False):
    '''
    Disambiguate the last name cluster that `name` belongs to.

    The name is normalized with generate_last_name_cluster_str, the
    matching entry is looked up in the list produced by
    create_lastname_list_from_personid, and the person ids of that entry
    are handed to disambiguate_last_name. A message is printed whether or
    not a cluster was found.

    @param name: the name whose last name cluster should be processed
    @param pure: passed through unchanged to disambiguate_last_name
    '''
    lname = generate_last_name_cluster_str(name)

    # The entries are keyed by the output of generate_last_name_cluster_str,
    # so the lookup has to use the normalized form. Comparing against the
    # raw `name` misses the cluster whenever the two differ.
    pids = None
    for cluster in create_lastname_list_from_personid():
        if cluster[0] == lname:
            pids = cluster[1]
            break

    if pids is None:
        bibauthor_print("Sorry, %s(%s) not found in the last name clusters" % (name, lname))
        return

    bibauthor_print("Found %s(%s), %d pids" % (name, lname, len(pids)))
    disambiguate_last_name(pids, lname, pure, False)
def create_lastname_list_from_personid():
    '''
    Group all person ids by the last name cluster of their name.

    The last name cluster of a person is computed with
    generate_last_name_cluster_str from the first name found in that
    person's collection of names. Persons without any name are skipped.

    @return: a generator of (last_name, [personid, ...], Nbibs) tuples,
        ordered by last_name, where Nbibs is the sum of the Nbibs values
        of all persons in the group.
    '''
    # ((personid, [full Name1], Nbibs) ... )
    # ->  [(personid, last_name, Nbibs) ... ]
    keyed_rows = []
    for row in get_all_names_from_personid():
        full_names = list(row[1])
        if not full_names:
            # Taking the first element of an empty collection inside a
            # generator expression raises StopIteration, which silently
            # ends the generator and drops every remaining person.
            # Skip the row explicitly instead.
            continue
        keyed_rows.append((row[0],
                           generate_last_name_cluster_str(full_names[0]),
                           row[2]))

    # groupby only merges adjacent items, so sort by the same key first.
    keyed_rows.sort(key=itemgetter(1))

    # Each group is materialized because it is traversed twice below.
    grouped = ((last_name, list(rows))
               for last_name, rows in groupby(keyed_rows, key=itemgetter(1)))

    # ( (last_name, [(personid)... ], Nbibs) ... )
    return ((last_name,
             [row[0] for row in rows],
             sum(row[2] for row in rows))
            for last_name, rows in grouped)