def check_paper(self):
    '''
    Drop the personid rows of one paper whose bibref is no longer attached
    to the record, then refresh the names of the persons involved.

    self.paper is read as a pair (bibrec, rows). Every row is indexed as
    (id, personid, tag, data, flag, lcul); row[3] is compared against
    strings of the form "100:<ref>,<bibrec>" and "700:<ref>,<bibrec>" built
    from the current authors / coauthors of the paper.

    For each stale row:
      - the row is deleted through dbinter.delete_personid_by_id;
      - if the same person has exactly one other row for this paper, the
        flag and lcul of the deleted row are copied onto that row.

    Afterwards the canonical names and the names string set are updated
    for every person found in the rows, and close_connection() is called.
    '''
    bibrec = self.paper[0]

    if bconfig.TABLES_UTILS_DEBUG:
        print(" -> processing paper = %s" % (bibrec,))

    bibrefs100 = dbinter.get_authors_from_paper(bibrec)
    bibrefs700 = dbinter.get_coauthors_from_paper(bibrec)
    bibrecreflist = frozenset(
        ["100:%s,%s" % (str(i[0]), bibrec) for i in bibrefs100] +
        ["700:%s,%s" % (str(i[0]), bibrec) for i in bibrefs700])

    # Fetched on first use only: most papers have no stale rows at all.
    pid_rows_lazy = None

    # Finally, if a bibrec/ref pair is in the authornames table but not in
    # this list, that name no longer exists on the paper and must be removed
    # from the table. The new one will be added by the update procedure in
    # the future; keeping this entry would be risky because the garbage
    # collector may decide to kill the bibref in the bibX0x table.
    for row in self.paper[1]:
        if row[3] in bibrecreflist:
            continue

        # Compare against None rather than truthiness: an empty result is a
        # valid cached answer and must not trigger the query again for
        # every following stale row.
        if pid_rows_lazy is None:
            pid_rows_lazy = dbinter.collect_personid_papers(paper=(bibrec,),
                                                            person=personid_q)

        other_bibrefs = [b[0] for b in pid_rows_lazy
                         if b[1] == row[1] and b[3] != row[3]]
        dbinter.delete_personid_by_id(int(row[0]))

        if bconfig.TABLES_UTILS_DEBUG:
            # Adjacent literals are concatenated by the compiler; a
            # backslash continuation inside the literal would leak the
            # source indentation into the message.
            print("* deleting record with missing bibref: "
                  "id = %s, personid = %s, tag = %s, data = %s, flag = %s, lcul = %s" % row)
            print("found %d other records with the same personid and bibrec" % len(other_bibrefs))

        if len(other_bibrefs) == 1:
            # We have one and only one substitute, we can switch them!
            dbinter.update_flags_in_personid(row[4], row[5], other_bibrefs[0])
            if bconfig.TABLES_UTILS_DEBUG:
                print("updating id=%d with flag=%d,lcul=%d" % (other_bibrefs[0], row[4], row[5]))

    # Person ids are passed on as 1-tuples, deduplicated.
    persons_to_update = set([(p[1],) for p in self.paper[1]])
    dbinter.update_personID_canonical_names(persons_to_update)
    dbinter.update_personID_names_string_set(persons_to_update,
                                             single_threaded=True,
                                             wait_finished=True)
    close_connection()
def personid_remove_automatically_assigned_papers(pids=None):
    '''
    Part of the person repair facility.
    Cleans up person entities according to the human interaction recorded
    for them. When pids is None or empty, every person returned by
    get_all_person_ids() is processed.

    Per person, the RT tickets, the claimed papers and the rejected papers
    are looked up (in this order) and then:
      - at least one claimed paper: del_person_not_manually_claimed_papers
        is called with the person tuple;
      - no claimed papers, no rejected papers and no tickets:
        delete_personid_by_id is called with the person ID;
      - any other combination: the person is left untouched.

    @param pids: List of tuples of person IDs
    @type pids: list of tuples
    '''
    if not pids:
        pids = get_all_person_ids()

    for pid in pids:
        person_id = pid[0]

        has_tickets = len(get_person_rt_tickets(person_id)) > 0
        has_claims = len(get_person_claimed_papers(person_id)) > 0
        has_rejections = len(get_person_rejected_papers(person_id)) > 0

        if has_claims:
            # Claimed papers take precedence over tickets and rejections.
            del_person_not_manually_claimed_papers(pid)
        elif not has_rejections and not has_tickets:
            # Nothing ties a human to this person.
            delete_personid_by_id(person_id)
        # Remaining cases (only tickets and/or only rejections): keep as is.
def check_paper(self):
    '''
    Drop the personid rows of one paper whose bibref is no longer attached
    to the record, then refresh the names of the persons involved.

    self.paper is read as a pair (bibrec, rows). Every row is indexed as
    (id, personid, tag, data, flag, lcul); row[3] is compared against
    strings of the form "100:<ref>,<bibrec>" and "700:<ref>,<bibrec>" built
    from the current authors / coauthors of the paper.

    For each stale row:
      - the row is deleted through dbinter.delete_personid_by_id;
      - if the same person has exactly one other row for this paper, the
        flag and lcul of the deleted row are copied onto that row.

    Afterwards the canonical names and the names string set are updated
    for every person found in the rows, and close_connection() is called.
    '''
    bibrec = self.paper[0]

    if bconfig.TABLES_UTILS_DEBUG:
        print(" -> processing paper = %s" % (bibrec,))

    bibrefs100 = dbinter.get_authors_from_paper(bibrec)
    bibrefs700 = dbinter.get_coauthors_from_paper(bibrec)
    bibrecreflist = frozenset(
        ["100:%s,%s" % (str(i[0]), bibrec) for i in bibrefs100] +
        ["700:%s,%s" % (str(i[0]), bibrec) for i in bibrefs700])

    # Fetched on first use only: most papers have no stale rows at all.
    pid_rows_lazy = None

    # Finally, if a bibrec/ref pair is in the authornames table but not in
    # this list, that name no longer exists on the paper and must be removed
    # from the table. The new one will be added by the update procedure in
    # the future; keeping this entry would be risky because the garbage
    # collector may decide to kill the bibref in the bibX0x table.
    for row in self.paper[1]:
        if row[3] in bibrecreflist:
            continue

        # Compare against None rather than truthiness: an empty result is a
        # valid cached answer and must not trigger the query again for
        # every following stale row.
        if pid_rows_lazy is None:
            pid_rows_lazy = dbinter.collect_personid_papers(paper=(bibrec,),
                                                            person=personid_q)

        other_bibrefs = [b[0] for b in pid_rows_lazy
                         if b[1] == row[1] and b[3] != row[3]]
        dbinter.delete_personid_by_id(int(row[0]))

        if bconfig.TABLES_UTILS_DEBUG:
            # Adjacent literals are concatenated by the compiler; a
            # backslash continuation inside the literal would leak the
            # source indentation into the message.
            print("* deleting record with missing bibref: "
                  "id = %s, personid = %s, tag = %s, data = %s, flag = %s, lcul = %s" % row)
            print("found %d other records with the same personid and bibrec" % len(other_bibrefs))

        if len(other_bibrefs) == 1:
            # We have one and only one substitute, we can switch them!
            dbinter.update_flags_in_personid(row[4], row[5], other_bibrefs[0])
            if bconfig.TABLES_UTILS_DEBUG:
                print("updating id=%d with flag=%d,lcul=%d" % (other_bibrefs[0], row[4], row[5]))

    # Person ids are passed on as 1-tuples, deduplicated.
    persons_to_update = set([(p[1],) for p in self.paper[1]])
    dbinter.update_personID_canonical_names(persons_to_update)
    dbinter.update_personID_names_string_set(persons_to_update,
                                             single_threaded=True,
                                             wait_finished=True)
    close_connection()
else: end_loop = True papers_data = tuple((extract_bibrec(p[3]), p) for p in papers_data) to_remove = set() jobs = dict() for p in papers_data: if int(p[0]) in deleted_recs: to_remove.add(p[1][0]) elif not papers_list or int(p[0]) in papers_list: jobs[p[0]] = jobs.get(p[0], []) + [p[1]] del(papers_data) if len(to_remove) > 0: task_sleep_now_if_required(True) delta = dbinter.delete_personid_by_id(to_remove) counter -= delta if bconfig.TABLES_UTILS_DEBUG: print "* deleting %d papers, from %d, marked as deleted" % (delta, len(to_remove)) jobslist = Queue() for p in jobs.items(): jobslist.put(p) del(jobs) max_processes = bconfig.CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS while not jobslist.empty(): workers = [] checker = status_checker() for i in range(max_processes): w = Worker(jobslist, checker)
else: end_loop = True papers_data = tuple((extract_bibrec(p[3]), p) for p in papers_data) to_remove = set() jobs = dict() for p in papers_data: if int(p[0]) in deleted_recs: to_remove.add(p[1][0]) elif not papers_list or int(p[0]) in papers_list: jobs[p[0]] = jobs.get(p[0], []) + [p[1]] del(papers_data) if len(to_remove) > 0: task_sleep_now_if_required(can_stop_too=False) delta = dbinter.delete_personid_by_id(to_remove) counter -= delta if bconfig.TABLES_UTILS_DEBUG: print "* deleting %d papers, from %d, marked as deleted" % (delta, len(to_remove)) jobslist = Queue() for p in jobs.items(): jobslist.put(p) del(jobs) max_processes = bconfig.CFG_BIBAUTHORID_PERSONID_SQL_MAX_THREADS while not jobslist.empty(): workers = [] checker = status_checker() for i in range(max_processes): w = Worker(jobslist, checker)