def merge_static_classy(): ''' This function merges aidPERSONIDPAPERS with aidRESULTS. Use it after tortoise. This function is static: if aid* tables are changed while it's running, probably everything will crash and a black hole will open, eating all your data. NOTE: this is more elegant that merge_static but much slower. Will have to be improved before it can replace it. ''' class Sig(object): def __init__(self, bibrefrec, pid_flag): self.rejected = dict(filter(lambda p: p[1] <= -2, pid_flag)) self.assigned = filter(lambda p: -2 < p[1] and p[1] < 2, pid_flag) self.claimed = filter(lambda p: 2 <= p[1], pid_flag) self.bibrefrec = bibrefrec assert self.invariant() def invariant(self): return len(self.assigned) + len(self.claimed) <= 1 def empty(self): return not self.isclaimed and not self.isassigned def isclaimed(self): return len(self.claimed) == 1 def get_claimed(self): return self.claimed[0][0] def get_assigned(self): return self.assigned[0][0] def isassigned(self): return len(self.assigned) == 1 def isrejected(self, pid): return pid in self.rejected def change_pid(self, pid): assert self.invariant() assert self.isassigned() self.assigned = [(pid, 0)] move_signature(self.bibrefrec, pid) class Cluster(object): def __init__(self, pid, sigs): self.pid = pid self.sigs = dict( (sig.bibrefrec[2], sig) for sig in sigs if not sig.empty()) def send_sig(self, other, sig): paper = sig.bibrefrec[2] assert paper in self.sigs and paper not in other.sigs del self.sigs[paper] other.sigs[paper] = sig if sig.isassigned(): sig.change_pid(other.pid) last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names()) personid = get_paper_to_author_and_status_mapping() free_pids = backinterface_get_free_pids() for idx, last in enumerate(last_names): logger.update_status( float(idx) / len(last_names), "Merging, %d/%d current: %s" % (idx, len(last_names), last)) results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last)) # [(last name number, [bibrefrecs])] results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))] # List of dictionaries. # [{new_pid -> N}] matr = [] # Set of all old pids. old_pids = set() for k, ds in results: pids = [] for d in ds: pid_flag = filter(lambda x: x[1] > -2, personid.get(d, [])) if pid_flag: assert len(pid_flag) == 1 pid = pid_flag[0][0] pids.append(pid) old_pids.add(pid) matr.append( dict((k, len(list(d))) for k, d in groupby(sorted(pids)))) old_pids = list(old_pids) best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr]) # [[bibrefrecs] -> pid] matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, _ in best_match] not_matched_clusters = frozenset(xrange(len(results))) - frozenset( imap(itemgetter(0), best_match)) not_matched_clusters = izip( (results[i][1] for i in not_matched_clusters), free_pids) # pid -> Cluster clusters = dict( (pid, Cluster(pid, [Sig(bib, personid.get(bib, [])) for bib in sigs])) for sigs, pid in chain(matched_clusters, not_matched_clusters)) todo = clusters.items() for pid, clus in todo: assert clus.pid == pid for paper, sig in clus.sigs.items(): if sig.isclaimed(): if sig.get_claimed() != pid: target_clus = clusters[sig.get_claimed()] if paper in target_clus.sigs: new_clus = Cluster(free_pids.next(), []) target_clus.send_sig(new_clus, target_clus[paper]) todo.append(new_clus) clusters[new_clus.pid] = new_clus assert paper not in target_clus.sigs clus.send_sig(target_clus, sig) elif sig.get_assigned() != pid: if not sig.isrejected(pid): move_signature(sig.bibrefrec, pid) else: move_signature(sig.bibrefrec, free_pids.next()) else: assert not sig.isrejected(pid) logger.update_status_final("Merging done.") logger.update_status_final() remove_empty_authors() update_canonical_names_of_authors()
def merge_static_classy(): ''' This function merges aidPERSONIDPAPERS with aidRESULTS. Use it after tortoise. This function is static: if aid* tables are changed while it's running, probably everything will crash and a black hole will open, eating all your data. NOTE: this is more elegant that merge_static but much slower. Will have to be improved before it can replace it. ''' class Sig(object): def __init__(self, bibrefrec, pid_flag): self.rejected = dict(filter(lambda p: p[1] <= -2, pid_flag)) self.assigned = filter(lambda p:-2 < p[1] and p[1] < 2, pid_flag) self.claimed = filter(lambda p: 2 <= p[1], pid_flag) self.bibrefrec = bibrefrec assert self.invariant() def invariant(self): return len(self.assigned) + len(self.claimed) <= 1 def empty(self): return not self.isclaimed and not self.isassigned def isclaimed(self): return len(self.claimed) == 1 def get_claimed(self): return self.claimed[0][0] def get_assigned(self): return self.assigned[0][0] def isassigned(self): return len(self.assigned) == 1 def isrejected(self, pid): return pid in self.rejected def change_pid(self, pid): assert self.invariant() assert self.isassigned() self.assigned = [(pid, 0)] move_signature(self.bibrefrec, pid) class Cluster(object): def __init__(self, pid, sigs): self.pid = pid self.sigs = dict((sig.bibrefrec[2], sig) for sig in sigs if not sig.empty()) def send_sig(self, other, sig): paper = sig.bibrefrec[2] assert paper in self.sigs and paper not in other.sigs del self.sigs[paper] other.sigs[paper] = sig if sig.isassigned(): sig.change_pid(other.pid) last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names()) personid = get_paper_to_author_and_status_mapping() free_pids = backinterface_get_free_pids() for idx, last in enumerate(last_names): update_status(float(idx) / len(last_names), "Merging, %d/%d current: %s" % (idx, len(last_names), last)) results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last)) # [(last name number, [bibrefrecs])] results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))] # List of dictionaries. # [{new_pid -> N}] matr = [] # Set of all old pids. old_pids = set() for k, ds in results: pids = [] for d in ds: pid_flag = filter(lambda x: x[1] > -2, personid.get(d, [])) if pid_flag: assert len(pid_flag) == 1 pid = pid_flag[0][0] pids.append(pid) old_pids.add(pid) matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids)))) old_pids = list(old_pids) best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr]) # [[bibrefrecs] -> pid] matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, _ in best_match] not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match)) not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids) # pid -> Cluster clusters = dict((pid, Cluster(pid, [Sig(bib, personid.get(bib, [])) for bib in sigs])) for sigs, pid in chain(matched_clusters, not_matched_clusters)) todo = clusters.items() for pid, clus in todo: assert clus.pid == pid for paper, sig in clus.sigs.items(): if sig.isclaimed(): if sig.get_claimed() != pid: target_clus = clusters[sig.get_claimed()] if paper in target_clus.sigs: new_clus = Cluster(free_pids.next(), []) target_clus.send_sig(new_clus, target_clus[paper]) todo.append(new_clus) clusters[new_clus.pid] = new_clus assert paper not in target_clus.sigs clus.send_sig(target_clus, sig) elif sig.get_assigned() != pid: if not sig.isrejected(pid): move_signature(sig.bibrefrec, pid) else: move_signature(sig.bibrefrec, free_pids.next()) else: assert not sig.isrejected(pid) update_status_final("Merging done.") update_status_final() remove_empty_authors() update_canonical_names_of_authors()
def merge_static(): ''' This function merges aidPERSONIDPAPERS with aidRESULTS. Use it after tortoise. This function is static: if aid* tables are changed while it's running, probably everything will crash and a black hole will open, eating all your data. ''' last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names()) def get_free_pids(): while True: yield get_free_author_id() free_pids = get_free_pids() current_mapping = get_paper_to_author_and_status_mapping() def move_sig_and_update_mapping(sig, old_pid_flag, new_pid_flag): move_signature(sig, new_pid_flag[0]) current_mapping[sig].remove(old_pid_flag) current_mapping[sig].append(new_pid_flag) def try_move_signature(sig, target_pid): """ """ paps = current_mapping[sig] rejected = filter(lambda p: p[1] <= -2, paps) assigned = filter(lambda p: -2 < p[1] and p[1] < 2, paps) claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps) if claimed or not assigned or assigned[0] == target_pid or rejected: return assert len(assigned) == 1 conflicts = get_signatures_of_paper_and_author(sig, target_pid) if not conflicts: move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1])) else: assert len(conflicts) == 1 if conflicts[0][3] == 2: newpid = free_pids.next() move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1])) else: newpid = free_pids.next() csig = tuple(conflicts[0][:3]) move_sig_and_update_mapping(csig, (target_pid, conflicts[0][3]), (newpid, conflicts[0][3])) move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1])) for idx, last in enumerate(last_names): logger.update_status( float(idx) / len(last_names), "%d/%d current: %s" % (idx, len(last_names), last)) results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last)) # [(last name number, [bibrefrecs])] results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))] # List of dictionaries. # [{new_pid -> N}] matr = [] # Set of all old pids. old_pids = set() for k, ds in results: pids = [] claim = [] for d in ds: pid_flag = current_mapping.get(d, []) if pid_flag: pid, flag = pid_flag[0] pids.append(pid) old_pids.add(pid) if flag > 1: claim.append((d, pid)) matr.append( dict((k, len(list(d))) for k, d in groupby(sorted(pids)))) # We cast it to list in order to ensure the order persistence. old_pids = list(old_pids) best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr]) matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, _ in best_match] not_matched_clusters = frozenset(xrange(len(results))) - frozenset( imap(itemgetter(0), best_match)) not_matched_clusters = izip( (results[i][1] for i in not_matched_clusters), free_pids) for sigs, pid in chain(matched_clusters, not_matched_clusters): for sig in sigs: if sig in current_mapping: if not pid in map( itemgetter(0), filter(lambda x: x[1] > -2, current_mapping[sig])): try_move_signature(sig, pid) logger.update_status_final() remove_empty_authors() update_canonical_names_of_authors()
def merge_static(): ''' This function merges aidPERSONIDPAPERS with aidRESULTS. Use it after tortoise. This function is static: if aid* tables are changed while it's running, probably everything will crash and a black hole will open, eating all your data. ''' last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names()) def get_free_pids(): while True: yield get_free_author_id() free_pids = get_free_pids() current_mapping = get_paper_to_author_and_status_mapping() def move_sig_and_update_mapping(sig, old_pid_flag, new_pid_flag): move_signature(sig, new_pid_flag[0]) current_mapping[sig].remove(old_pid_flag) current_mapping[sig].append(new_pid_flag) def try_move_signature(sig, target_pid): """ """ paps = current_mapping[sig] rejected = filter(lambda p: p[1] <= -2, paps) assigned = filter(lambda p:-2 < p[1] and p[1] < 2, paps) claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps) if claimed or not assigned or assigned[0] == target_pid: return assert len(assigned) == 1 if rejected: newpid = free_pids.next() move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1])) else: conflicts = get_signatures_of_paper_and_author(sig, target_pid) if not conflicts: move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1])) else: assert len(conflicts) == 1 if conflicts[0][3] == 2: newpid = free_pids.next() move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1])) else: newpid = free_pids.next() csig = tuple(conflicts[0][:3]) move_sig_and_update_mapping(csig, (target_pid, conflicts[0][3]), (newpid, conflicts[0][3])) move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1])) for idx, last in enumerate(last_names): update_status(float(idx) / len(last_names), "%d/%d current: %s" % (idx, len(last_names), last)) results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last)) # [(last name number, [bibrefrecs])] results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))] # List of dictionaries. # [{new_pid -> N}] matr = [] # Set of all old pids. old_pids = set() for k, ds in results: pids = [] claim = [] for d in ds: pid_flag = current_mapping.get(d, []) if pid_flag: pid, flag = pid_flag[0] pids.append(pid) old_pids.add(pid) if flag > 1: claim.append((d, pid)) matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids)))) # We cast it to list in order to ensure the order persistence. old_pids = list(old_pids) best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr]) matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, _ in best_match] not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match)) not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids) for sigs, pid in chain(matched_clusters, not_matched_clusters): for sig in sigs: if sig in current_mapping: if not pid in map(itemgetter(0), filter(lambda x: x[1] > -2, current_mapping[sig])): try_move_signature(sig, pid) update_status_final() remove_empty_authors() update_canonical_names_of_authors()