def check_paper(self):
    """Drop personid rows of one paper whose bibref is no longer on the paper.

    ``self.paper`` is read as a pair ``(paper id, rows)``.  Going by the debug
    message printed below, each row is laid out as
    ``(id, personid, tag, data, flag, lcul)``; ``data`` (index 3) is compared
    against strings of the form ``"100:<ref>,<paper id>"`` and
    ``"700:<ref>,<paper id>"``.

    A row whose ``data`` is not among the references currently returned for
    the paper by ``dbinter.get_authors_from_paper`` (tag 100) and
    ``dbinter.get_coauthors_from_paper`` (tag 700) is deleted.  If exactly one
    other row exists for the same person on this paper, the deleted row's
    ``flag`` and ``lcul`` are copied onto it.  Afterwards the canonical names
    and name string sets of every person listed in ``self.paper[1]`` are
    refreshed.  The database connection is closed on the way out, also when
    one of the database calls raises.

    NOTE(review): ``personid_q`` is not defined in this method; it has to be
    supplied by an enclosing scope -- confirm against the surrounding code.
    """
    try:
        paper_id = self.paper[0]
        if bconfig.TABLES_UTILS_DEBUG:
            print(" -> processing paper = %s" % (paper_id,))

        # Every "tag:ref,paper" string that is still valid for this paper.
        valid_bibrefs = frozenset(
            ["100:%s,%s" % (str(ref[0]), paper_id)
             for ref in dbinter.get_authors_from_paper(paper_id)] +
            ["700:%s,%s" % (str(ref[0]), paper_id)
             for ref in dbinter.get_coauthors_from_paper(paper_id)])

        # Fetched on first use only; most papers have no stale row at all.
        pid_rows_lazy = None

        # If a bibrec/ref pair is stored for this paper but is not in the list
        # above, that name no longer exists on the paper and must be removed
        # from the table.  The new one will be added by the update procedure
        # later; keeping the stale entry is risky because the garbage
        # collector may decide to remove the bibref from the bibX0x table.
        for row in self.paper[1]:
            if row[3] in valid_bibrefs:
                continue

            # "is None" rather than truthiness: an empty result is a valid
            # cached answer and must not trigger the query again per row.
            if pid_rows_lazy is None:
                pid_rows_lazy = dbinter.collect_personid_papers(
                    paper=(paper_id,), person=personid_q)

            # Other rows of the same person on this paper (different bibref).
            other_bibrefs = [other[0] for other in pid_rows_lazy
                             if other[1] == row[1] and other[3] != row[3]]

            dbinter.delete_personid_by_id(int(row[0]))
            if bconfig.TABLES_UTILS_DEBUG:
                # Adjacent literals instead of a backslash continuation, so no
                # source indentation leaks into the message.  "% row" relies
                # on row being a 6-item tuple.
                print(("* deleting record with missing bibref: "
                       "id = %s, personid = %s, tag = %s, data = %s, "
                       "flag = %s, lcul = %s") % row)
                print("found %d other records with the same personid and bibrec"
                      % len(other_bibrefs))

            if len(other_bibrefs) == 1:
                # Exactly one substitute: hand the flags over to it.
                dbinter.update_flags_in_personid(row[4], row[5], other_bibrefs[0])
                if bconfig.TABLES_UTILS_DEBUG:
                    print("updating id=%d with flag=%d,lcul=%d"
                          % (other_bibrefs[0], row[4], row[5]))

        # One-element tuples, as passed to the dbinter update helpers.
        persons_to_update = set((p[1],) for p in self.paper[1])
        dbinter.update_personID_canonical_names(persons_to_update)
        dbinter.update_personID_names_string_set(
            persons_to_update, single_threaded=True, wait_finished=True)
    finally:
        close_connection()
def check_paper(self):
    """Drop personid rows of one paper whose bibref is no longer on the paper.

    ``self.paper`` is read as a pair ``(paper id, rows)``.  Going by the debug
    message printed below, each row is laid out as
    ``(id, personid, tag, data, flag, lcul)``; ``data`` (index 3) is compared
    against strings of the form ``"100:<ref>,<paper id>"`` and
    ``"700:<ref>,<paper id>"``.

    A row whose ``data`` is not among the references currently returned for
    the paper by ``dbinter.get_authors_from_paper`` (tag 100) and
    ``dbinter.get_coauthors_from_paper`` (tag 700) is deleted.  If exactly one
    other row exists for the same person on this paper, the deleted row's
    ``flag`` and ``lcul`` are copied onto it.  Afterwards the canonical names
    and name string sets of every person listed in ``self.paper[1]`` are
    refreshed.  The database connection is closed on the way out, also when
    one of the database calls raises.

    NOTE(review): ``personid_q`` is not defined in this method; it has to be
    supplied by an enclosing scope -- confirm against the surrounding code.
    """
    try:
        paper_id = self.paper[0]
        if bconfig.TABLES_UTILS_DEBUG:
            print(" -> processing paper = %s" % (paper_id,))

        # Every "tag:ref,paper" string that is still valid for this paper.
        valid_bibrefs = frozenset(
            ["100:%s,%s" % (str(ref[0]), paper_id)
             for ref in dbinter.get_authors_from_paper(paper_id)] +
            ["700:%s,%s" % (str(ref[0]), paper_id)
             for ref in dbinter.get_coauthors_from_paper(paper_id)])

        # Fetched on first use only; most papers have no stale row at all.
        pid_rows_lazy = None

        # If a bibrec/ref pair is stored for this paper but is not in the list
        # above, that name no longer exists on the paper and must be removed
        # from the table.  The new one will be added by the update procedure
        # later; keeping the stale entry is risky because the garbage
        # collector may decide to remove the bibref from the bibX0x table.
        for row in self.paper[1]:
            if row[3] in valid_bibrefs:
                continue

            # "is None" rather than truthiness: an empty result is a valid
            # cached answer and must not trigger the query again per row.
            if pid_rows_lazy is None:
                pid_rows_lazy = dbinter.collect_personid_papers(
                    paper=(paper_id,), person=personid_q)

            # Other rows of the same person on this paper (different bibref).
            other_bibrefs = [other[0] for other in pid_rows_lazy
                             if other[1] == row[1] and other[3] != row[3]]

            dbinter.delete_personid_by_id(int(row[0]))
            if bconfig.TABLES_UTILS_DEBUG:
                # Adjacent literals instead of a backslash continuation, so no
                # source indentation leaks into the message.  "% row" relies
                # on row being a 6-item tuple.
                print(("* deleting record with missing bibref: "
                       "id = %s, personid = %s, tag = %s, data = %s, "
                       "flag = %s, lcul = %s") % row)
                print("found %d other records with the same personid and bibrec"
                      % len(other_bibrefs))

            if len(other_bibrefs) == 1:
                # Exactly one substitute: hand the flags over to it.
                dbinter.update_flags_in_personid(row[4], row[5], other_bibrefs[0])
                if bconfig.TABLES_UTILS_DEBUG:
                    print("updating id=%d with flag=%d,lcul=%d"
                          % (other_bibrefs[0], row[4], row[5]))

        # One-element tuples, as passed to the dbinter update helpers.
        persons_to_update = set((p[1],) for p in self.paper[1])
        dbinter.update_personID_canonical_names(persons_to_update)
        dbinter.update_personID_names_string_set(
            persons_to_update, single_threaded=True, wait_finished=True)
    finally:
        close_connection()
# Fragment of a larger routine: the enclosing ``def`` is not part of this
# excerpt, and the original line breaks/indentation were lost, so the nesting
# described here is inferred from the statement order -- TODO confirm.
#
# What the statements below do, in order:
#   * read the deleted papers via dbinter.get_deleted_papers() and keep the
#     first field of every returned row in a frozenset (``deleted_recs``);
#   * build ``personid_q`` with dbinter.list_2_SQL_str from the first field of
#     each ``personid`` entry, or leave it as None when ``personid`` is falsy;
#   * fetch personid/paper rows page by page through
#     dbinter.collect_personid_papers(limit=(counter, rows_limit)), advancing
#     ``counter`` by ``rows_limit`` after every full page and ending the loop
#     on the first page whose length differs from ``rows_limit``;
#   * call task_sleep_now_if_required(True) before each page is fetched
#     (NOTE(review): the positional True is presumably a "may also stop here"
#     switch -- check the helper's signature);
#   * pair every fetched row with extract_bibrec(row[3]) and add the row's
#     first field to ``to_remove`` when int() of that extracted value is in
#     ``deleted_recs``.
# ``jobs`` is created empty and not used within this excerpt; ``to_remove`` is
# only filled here -- presumably both are consumed by code that follows.
deleted_recs = dbinter.get_deleted_papers() deleted_recs = frozenset(x[0] for x in deleted_recs) if bconfig.TABLES_UTILS_DEBUG: print "%d total deleted papers" % (len(deleted_recs),) if personid: personid_q = dbinter.list_2_SQL_str(personid, lambda x: str(x[0])) else: personid_q = None counter = 0 rows_limit = 10000000 end_loop = False while not end_loop: task_sleep_now_if_required(True) papers_data = dbinter.collect_personid_papers(person=personid_q, limit=(counter, rows_limit,)) if bconfig.TABLES_UTILS_DEBUG: print "query with limit %d %d" % (counter, rows_limit) if len(papers_data) == rows_limit: counter += rows_limit else: end_loop = True papers_data = tuple((extract_bibrec(p[3]), p) for p in papers_data) to_remove = set() jobs = dict() for p in papers_data: if int(p[0]) in deleted_recs: to_remove.add(p[1][0])
# Fragment of a larger routine: the enclosing ``def`` is not part of this
# excerpt, and the original line breaks/indentation were lost, so the nesting
# described here is inferred from the statement order -- TODO confirm.
#
# What the statements below do, in order:
#   * read the deleted papers via dbinter.get_deleted_papers() and keep the
#     first field of every returned row in a frozenset (``deleted_recs``);
#   * build ``personid_q`` with dbinter.list_2_SQL_str from the first field of
#     each ``personid`` entry, or leave it as None when ``personid`` is falsy;
#   * fetch personid/paper rows page by page through
#     dbinter.collect_personid_papers(limit=(counter, rows_limit)), advancing
#     ``counter`` by ``rows_limit`` after every full page and ending the loop
#     on the first page whose length differs from ``rows_limit``;
#   * call task_sleep_now_if_required(can_stop_too=False) before each page is
#     fetched (NOTE(review): going by the keyword name this allows a pause but
#     not a stop at this point -- check the helper's documentation);
#   * pair every fetched row with extract_bibrec(row[3]) and add the row's
#     first field to ``to_remove`` when int() of that extracted value is in
#     ``deleted_recs``.
# ``jobs`` is created empty and not used within this excerpt; ``to_remove`` is
# only filled here -- presumably both are consumed by code that follows.
deleted_recs = dbinter.get_deleted_papers() deleted_recs = frozenset(x[0] for x in deleted_recs) if bconfig.TABLES_UTILS_DEBUG: print "%d total deleted papers" % (len(deleted_recs),) if personid: personid_q = dbinter.list_2_SQL_str(personid, lambda x: str(x[0])) else: personid_q = None counter = 0 rows_limit = 10000000 end_loop = False while not end_loop: task_sleep_now_if_required(can_stop_too=False) papers_data = dbinter.collect_personid_papers(person=personid_q, limit=(counter, rows_limit,)) if bconfig.TABLES_UTILS_DEBUG: print "query with limit %d %d" % (counter, rows_limit) if len(papers_data) == rows_limit: counter += rows_limit else: end_loop = True papers_data = tuple((extract_bibrec(p[3]), p) for p in papers_data) to_remove = set() jobs = dict() for p in papers_data: if int(p[0]) in deleted_recs: to_remove.add(p[1][0])