def run(self):
    """Feed persons to the extractor until all of them have been walked.

    Every pass builds a walker (restricted to ``self.idList`` when that list
    is non-empty, otherwise ordered by publication count), lets it walk, and
    then sleeps 10 seconds. The loop stops once the person updater reports
    the current generation as finished or ``self.settings.byid`` is set;
    afterwards ``extractor.waiting_to_finish`` is raised.

    Any exception is reported and ends the run through ``sys.exit()``.
    """
    try:
        finished = False
        while not finished:
            generation = self.extractor.generation
            has_ids = self.idList is not None and len(self.idList) != 0
            if has_ids:
                walker = PersonWalkThroughByGivenIDList(
                    generation,
                    processer=self.person_processer,
                    pids=self.idList,
                    fix_person_ext=True)
            else:
                walker = PersonWalkThroughOrderByPubcount(
                    generation,
                    processer=self.person_processer,
                    fetch_size=100,
                    fix_person_ext=True)
            walker.walk()

            all_done = self.personUpdater.isAllFinished(self.extractor.generation)
            if all_done or self.settings.byid:
                print("All data finished. Ended Provider.")
                finished = True
            print("All person walked, reload all")
            time.sleep(10)

        # Reached only after every person has been loaded.
        self.extractor.waiting_to_finish = True
        print("$mgr/provider:> All person added to Queue, waiting for stop.")
    except Exception as e:
        separator = '-' * 100
        ExceptionHelper.print_exec(e)
        print(separator)
        print('BIG Exception, and can\'t continue.')
        print(separator)
        sys.exit()
def walk(self):
    """Walk through all persons in the database.

    Runs ``self.sql`` with ``(update_generation, -1, fetch_size)`` over and
    over, turning every returned ``(id, names, pubcount)`` row into a
    ``Person`` for ``self.processer``, until a query returns no rows.
    When ``fix_person_ext`` is set, rows whose pubcount is ``None`` get a
    ``person_update_ext`` record first. ``MySQLdb.Error`` is reported and
    the pass is retried.

    Returns:
        The (empty) row set of the final query.
    """
    from contextlib import closing  # local import: file header not visible here

    # NOTE(review): every pass queries from id -1 again; presumably handled
    # rows drop out of the query through the generation filter -- confirm.
    start_id = -1
    while True:
        try:
            # closing() releases cursor and connection on the break below
            # and on errors; both paths used to skip the close() calls.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(
                        self.sql,
                        (self.update_generation, start_id, self.fetch_size))
                    data = cursor.fetchall()
                    print("&[Walker]> walk through na_person, %s items" % cursor.rowcount)
                    if cursor.rowcount == 0:
                        break
                    for person_id, names, pubcount in data:
                        if self.fix_person_ext and pubcount is None:
                            self.person_update_tool.insertPersonExt(
                                person_id, self.update_generation, pubcount)
                        # Strip each name; the former loop rebound a local
                        # and left the list untouched.
                        namelist = [name.strip() for name in names.split(",")]
                        self.processer(Person(person_id, namelist, pubcount))
        except MySQLdb.Error as e:  # @UndefinedVariable
            ExceptionHelper.print_exec(e)
    return data
def walk(self):
    """Walk through all persons in the db.

    Runs ``self.sql`` with ``(gen, -1, fetch_size)`` repeatedly, passing a
    ``Person`` built from every ``(id, fullname, pubcount)`` row to
    ``self.processer``, and sleeps 20 seconds between passes. The walk ends
    when a query returns no rows. ``MySQLdb.Error`` is reported and the
    pass is retried.

    Returns:
        The (empty) row set of the final query.
    """
    from contextlib import closing  # local import: file header not visible here

    # NOTE(review): every pass queries from id -1 again -- confirm intended.
    start_id = -1
    while True:
        try:
            # closing() releases cursor and connection on the break below
            # and on errors; both paths used to skip the close() calls.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(self.sql, (self.gen, start_id, self.fetch_size))
                    data = cursor.fetchall()
                    print("&-walker-:> walk person(citation) %s items" % cursor.rowcount)
                    if cursor.rowcount == 0:
                        break
                    for person_id, fullname, pubcount in data:
                        self.processer(Person(person_id, fullname, pubcount))
            # Pause 20 seconds before the next pass.
            time.sleep(20)
        except MySQLdb.Error as e:  # @UndefinedVariable
            ExceptionHelper.print_exec(e)
    return data
def execute(self, sql, *args):
    """Execute one SQL statement with positional parameters.

    Args:
        sql: statement text with driver placeholders.
        *args: values bound to the placeholders.

    Returns:
        ``cursor.rowcount`` of the statement, or ``None`` when any exception
        occurred (the exception is reported, not raised).
    """
    from contextlib import closing  # local import: file header not visible here

    try:
        # closing() releases cursor and connection even when execute() fails.
        with closing(DB.pool().getConnection()) as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute(sql, args)
                return cursor.rowcount
    except Exception as e:
        ExceptionHelper.print_exec(e)
    return None
def getPersonTotalCount(self):
    """Return the number of rows in ``na_person``.

    The query is retried until it succeeds; every ``MySQLdb.Error`` is
    reported through ``ExceptionHelper.print_exec`` before the next attempt.

    Returns:
        The ``count(*)`` value of the first result row.
    """
    from contextlib import closing  # local import: file header not visible here

    while True:
        try:
            # closing() releases cursor and connection even when the query
            # fails, so a failed attempt no longer leaks them.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute("select count(*) from na_person")
                    return cursor.fetchone()[0]
        except MySQLdb.Error as e:  # @UndefinedVariable
            ExceptionHelper.print_exec(e)
def getTotalCount(self):
    """Return the number of rows in ``publication``.

    The query is retried until it succeeds; every ``MySQLdb.Error`` is
    reported through ``ExceptionHelper.print_exec`` before the next attempt.

    Returns:
        The ``count(*)`` value of the first result row.
    """
    from contextlib import closing  # local import: file header not visible here

    while True:
        try:
            # closing() releases cursor and connection even when the query
            # fails, so a failed attempt no longer leaks them.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute("select count(*) from publication")
                    return cursor.fetchone()[0]
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
def insertPersonExt(self, person_id, update_generation=0, pubcount=0):
    """Insert a ``person_update_ext`` row for one person.

    Args:
        person_id: id of the person.
        update_generation: value stored in ``u_citation_gen``.
        pubcount: value stored in ``pubcount``; ``None`` is stored as 0.
            (The argument used to be ignored and 0 was always written.)

    Returns:
        ``cursor.rowcount`` of the insert.

    Every ``MySQLdb.Error`` is reported and the insert is retried.
    NOTE(review): a deterministic failure such as a duplicate key therefore
    retries forever -- confirm this is acceptable.
    """
    from contextlib import closing  # local import: file header not visible here

    stored_pubcount = 0 if pubcount is None else pubcount
    while True:
        try:
            # closing() releases cursor and connection even when the insert fails.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(
                        "insert into person_update_ext(id,u_citation_gen,pubcount) values(%s,%s,%s)",
                        (person_id, update_generation, stored_pubcount))
                    return cursor.rowcount
        except MySQLdb.Error as e:  # @UndefinedVariable
            ExceptionHelper.print_exec(e)
def getMinGenerationInDB(self):
    """Return ``min(u_citation_gen)`` over the ``publication`` table.

    The query is retried until it succeeds; every ``MySQLdb.Error`` is
    reported through ``ExceptionHelper.print_exec`` before the next attempt.

    Returns:
        The first column of the single result row.
    """
    from contextlib import closing  # local import: file header not visible here

    while True:
        try:
            # closing() releases cursor and connection even when the query
            # fails, so a failed attempt no longer leaks them.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute("select min(u_citation_gen) from publication")
                    return cursor.fetchone()[0]
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
def getPersonTotalCount(self):
    """Count the rows of the ``na_person`` table.

    Failed attempts (``MySQLdb.Error``) are reported via
    ``ExceptionHelper.print_exec`` and the query is issued again until it
    succeeds.

    Returns:
        The ``count(*)`` value of the first result row.
    """
    from contextlib import closing  # local import: file header not visible here

    count_sql = "select count(*) from na_person"
    while True:
        try:
            # Cursor and connection are closed on every exit path, including
            # errors, which previously skipped both close() calls.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(count_sql)
                    row = cursor.fetchone()
                    return row[0]
        except MySQLdb.Error as e:  # @UndefinedVariable
            ExceptionHelper.print_exec(e)
def setGoogleValue(self, id, google_value, links=''):
    """Store the google value and result links of one ``GoogleResult_mark`` row.

    Args:
        id: key of the row to update.
        google_value: value written to the ``google`` column.
        links: iterable of link strings, joined with newlines into the
            ``url`` column; ``None`` stores an empty string.

    Exceptions are reported through ``ExceptionHelper.print_exec`` and not
    raised. No commit is issued here.
    NOTE(review): presumably the connection runs in autocommit mode -- confirm.
    """
    from contextlib import closing  # local import: file header not visible here

    sql = 'update GoogleResult_mark set google = %s, url = %s where id = %s limit 1'
    linkstr = '' if links is None else '\n'.join(links)
    try:
        # The cursor and connection were never closed before; closing()
        # releases both on success and on failure.
        with closing(DB.pool().getConnection()) as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute(sql, (google_value, linkstr, id,))
    except Exception as e:
        ExceptionHelper.print_exec(e)
def resetPersonPublicationUpdateGen(self, personId):
    """Set ``u_citation_gen`` to 0 for every publication of one person.

    Args:
        personId: author id matched against ``na_author2pub.aid``.

    Returns:
        ``cursor.rowcount`` of the update.

    Every ``MySQLdb.Error`` is reported and the update is retried.
    """
    from contextlib import closing  # local import: file header not visible here

    sql = ("update publication p "
           "left join na_author2pub a2p on p.id = a2p.pid "
           "set p.u_citation_gen=0 where a2p.aid=%s")
    while True:
        try:
            # closing() releases cursor and connection even when the update fails.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    # Parameters must be a sequence: "(personId)" was a bare
                    # scalar, which drivers that iterate the args mishandle.
                    cursor.execute(sql, (personId,))
                    return cursor.rowcount
        except MySQLdb.Error as e:  # @UndefinedVariable
            ExceptionHelper.print_exec(e)
def resetPersonPublicationUpdateGen(self, personId):
    """Reset the update generation of one person's publications to 0.

    Args:
        personId: author id matched against ``na_author2pub.aid``.

    Returns:
        ``cursor.rowcount`` of the update.

    Every ``MySQLdb.Error`` is reported and the update is retried.
    """
    from contextlib import closing  # local import: file header not visible here

    reset_sql = ("update publication p "
                 "left join na_author2pub a2p on p.id = a2p.pid "
                 "set p.u_citation_gen=0 where a2p.aid=%s")
    while True:
        try:
            # Cursor and connection are closed on every exit path, including
            # errors, which previously skipped both close() calls.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    # One-element tuple: "(personId)" was just a scalar.
                    cursor.execute(reset_sql, (personId,))
                    return cursor.rowcount
        except MySQLdb.Error as e:  # @UndefinedVariable
            ExceptionHelper.print_exec(e)
def getLeftCount(self, generation):
    """Count publications not yet updated to ``generation``.

    Args:
        generation: publications whose ``u_citation_gen`` is NULL or lower
            than this value are counted.

    Returns:
        The ``count(*)`` value of the first result row.

    Every ``MySQLdb.Error`` is reported and the query is retried.
    """
    from contextlib import closing  # local import: file header not visible here

    sql = ("select count(*) from publication p "
           "where p.u_citation_gen is null or p.u_citation_gen < %s")
    while True:
        try:
            # closing() releases cursor and connection even when the query fails.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    # Pass the parameter as a one-element tuple, not a scalar.
                    cursor.execute(sql, (generation,))
                    return cursor.fetchone()[0]
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
def markPersonUpdateCitationFinished(self, personId, gen):
    """Record that a person's citation update reached generation ``gen``.

    Args:
        personId: id of the ``person_update_ext`` row to update.
        gen: value written to ``u_citation_gen``.

    Returns:
        ``cursor.rowcount`` of the update.

    Any exception is reported and the update is retried until it succeeds.
    NOTE(review): non-database errors are retried as well -- confirm intended.
    """
    from contextlib import closing  # local import: file header not visible here

    while True:
        try:
            # closing() releases cursor and connection even when the update fails.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(
                        "update person_update_ext set u_citation_gen=%s where id=%s;",
                        (gen, personId))
                    return cursor.rowcount
        except Exception as e:
            ExceptionHelper.print_exec(e)
def savePerson(self, personId, personName):
    """Insert one row into the ``person`` table.

    Args:
        personId: value for the ``id`` column.
        personName: value for the ``fullname`` column.

    Returns:
        ``cursor.rowcount`` of the insert.

    Every ``MySQLdb.Error`` is reported and the insert is retried.
    NOTE(review): a deterministic failure such as a duplicate key therefore
    retries forever -- confirm this is acceptable.
    """
    from contextlib import closing  # local import: file header not visible here

    while True:
        try:
            # closing() releases cursor and connection even when the insert fails.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(
                        "insert into person (id, fullname) values (%s,%s)",
                        (personId, personName))
                    return cursor.rowcount
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
def getPersonLeftCount(self, generation):
    """Count persons not yet updated to ``generation``.

    Args:
        generation: persons whose ``person_update_ext.u_citation_gen`` is
            NULL (or missing) or lower than this value are counted.

    Returns:
        The ``count(*)`` value of the first result row.

    Every ``MySQLdb.Error`` is reported and the query is retried.
    """
    from contextlib import closing  # local import: file header not visible here

    sql = ("select count(*) from na_person p "
           "left join person_update_ext pe on p.id=pe.id "
           "where pe.u_citation_gen is null or pe.u_citation_gen < %s")
    while True:
        try:
            # closing() releases cursor and connection even when the query fails.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    # Pass the parameter as a one-element tuple, not a scalar.
                    cursor.execute(sql, (generation,))
                    return cursor.fetchone()[0]
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
def popTitlesFromDB(self):
    """Claim the next unprocessed title pair from ``GoogleResult_mark``.

    Selects the lowest-id row whose ``google`` and ``updatetime`` are both
    NULL, stamps its ``updatetime`` with the current timestamp and returns it.

    Returns:
        ``(id, titleA, titleB)`` of the claimed row, or ``None`` when no row
        is available or an exception occurred (exceptions are reported, not
        raised).
    """
    from contextlib import closing  # local import: file header not visible here

    sql = ('select id,titleA,titleB from GoogleResult_mark '
           'where google is null and updatetime is null order by id limit 1')
    updateSQL = 'update GoogleResult_mark set updatetime = CURRENT_TIMESTAMP where id = %s'
    try:
        # The cursor and connection were never closed before; closing()
        # releases both on every exit path.
        with closing(DB.pool().getConnection()) as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute(sql)
                data = cursor.fetchall()
                if data and len(data) == 1:
                    (id, title1, title2) = data[0]
                    cursor.execute(updateSQL, (id,))
                    return id, title1, title2
    except Exception as e:
        ExceptionHelper.print_exec(e)
    return None
def save(self, person):
    """Insert ``person`` into the ``person`` table.

    Args:
        person: object whose ``id`` and ``name`` attributes supply the
            ``id`` and ``fullname`` columns.

    Returns:
        ``cursor.rowcount`` of the insert.

    Every ``MySQLdb.Error`` is reported and the insert is retried.
    NOTE(review): a deterministic failure such as a duplicate key therefore
    retries forever -- confirm this is acceptable.
    """
    from contextlib import closing  # local import: file header not visible here

    while True:
        try:
            # closing() releases cursor and connection even when the insert fails.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(
                        "insert into person (id, fullname) values (%s,%s)",
                        (person.id, person.name))
                    return cursor.rowcount
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
def markPersonUpdateCitationFinished(self, personId, gen):
    """Store ``gen`` as the citation-update generation of one person.

    Args:
        personId: id of the ``person_update_ext`` row to update.
        gen: value written to ``u_citation_gen``.

    Returns:
        ``cursor.rowcount`` of the update.

    Any exception is reported and the update is retried until it succeeds.
    NOTE(review): non-database errors are retried as well -- confirm intended.
    """
    from contextlib import closing  # local import: file header not visible here

    update_sql = "update person_update_ext set u_citation_gen=%s where id=%s;"
    while True:
        try:
            # Cursor and connection are closed on every exit path, including
            # errors, which previously skipped both close() calls.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(update_sql, (gen, personId))
                    return cursor.rowcount
        except Exception as e:
            ExceptionHelper.print_exec(e)
def getLeftCount(self, generation):
    """Return how many publications still lag behind ``generation``.

    Args:
        generation: publications whose ``u_citation_gen`` is NULL or lower
            than this value are counted.

    Returns:
        The ``count(*)`` value of the first result row.

    Every ``MySQLdb.Error`` is reported and the query is retried.
    """
    from contextlib import closing  # local import: file header not visible here

    count_sql = ("select count(*) from publication p "
                 "where p.u_citation_gen is null or p.u_citation_gen < %s")
    while True:
        try:
            # Cursor and connection are closed on every exit path, including
            # errors, which previously skipped both close() calls.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    # One-element tuple; the old call passed a bare scalar.
                    cursor.execute(count_sql, (generation,))
                    row = cursor.fetchone()
                    return row[0]
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
def getPersonLeftCount(self, generation):
    """Return how many persons still lag behind ``generation``.

    Args:
        generation: persons whose ``person_update_ext.u_citation_gen`` is
            NULL (or missing) or lower than this value are counted.

    Returns:
        The ``count(*)`` value of the first result row.

    Every ``MySQLdb.Error`` is reported and the query is retried.
    """
    from contextlib import closing  # local import: file header not visible here

    count_sql = ("select count(*) from na_person p "
                 "left join person_update_ext pe on p.id=pe.id "
                 "where pe.u_citation_gen is null or pe.u_citation_gen < %s")
    while True:
        try:
            # Cursor and connection are closed on every exit path, including
            # errors, which previously skipped both close() calls.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    # One-element tuple; the old call passed a bare scalar.
                    cursor.execute(count_sql, (generation,))
                    row = cursor.fetchone()
                    return row[0]
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
def save(self, aid, pid, position):
    """Insert one author-to-publication link into ``author2pub``.

    Args:
        aid: author id.
        pid: publication id.
        position: value for the ``position`` column.

    Returns:
        ``cursor.rowcount`` of the insert.

    Every ``MySQLdb.Error`` is reported and the insert is retried.
    NOTE(review): a deterministic failure such as a duplicate key therefore
    retries forever -- confirm this is acceptable.
    """
    from contextlib import closing  # local import: file header not visible here

    while True:
        try:
            # closing() releases cursor and connection even when the insert fails.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(
                        "insert into author2pub (aid,pid,position) values(%s,%s,%s)",
                        (aid, pid, position))
                    return cursor.rowcount
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
def putToPubCache(self, person, pub):
    """Put ``pub`` into the cache so it waits for another search pass.

    A pub that is not cached yet is stored in ``self.pubmap`` under its id,
    and its id is appended to the person's list in ``self.person_pub_map``.
    Nothing happens when ``person`` or ``pub`` is ``None`` or when the pub
    is already cached.

    Args:
        person: owner of the pub; only ``person.id`` is read.
        pub: publication to cache; only ``pub.id`` is read.

    Exceptions are reported through ``ExceptionHelper.print_exec``.
    """
    if person is None or pub is None:
        return
    try:
        # Check and insert under the same lock: the membership test used to
        # run before the lock was taken, so two threads could both pass it.
        with self.pub_lock:
            if pub.id in self.pubmap:
                return
            self.pubmap[pub.id] = pub
            self.person_pub_map.setdefault(person.id, []).append(pub.id)
    except Exception as e:
        ExceptionHelper.print_exec(e)
def walk(self):
    """Fetch the persons listed in ``self.pids`` and pass them to the callback.

    The ids are queried in pages of 300, keeping only persons whose
    ``u_citation_gen`` is NULL or below ``self.update_generation``, with a
    one-second pause after each page. Every row becomes a ``Person`` handed
    to ``self.processer``. When ``fix_person_ext`` is set, persons without a
    ``person_update_ext`` row get one inserted first.

    Changes from the earlier version:
      * a page with no matching rows no longer ends the paging early, so
        later pages are still fetched;
      * ids are bound as query parameters instead of being pasted into the
        SQL text (``self.sql`` now holds the placeholder form);
      * names are stripped (the old strip loop had no effect);
      * every cursor and the connection are closed, also on errors.

    A ``MySQLdb.Error`` is reported and ends the walk; the rows collected so
    far are still returned, as before.

    Returns:
        list of the fetched ``(id, names, ext_id, pubcount)`` rows.
    """
    from contextlib import closing  # local import: file header not visible here

    print("&[Walker]> walk through na_person, BY_ID_LIST: %s items" % len(self.pids))
    page_size = 300
    data = []
    try:
        with closing(DB.pool().getConnection()) as conn:
            for page, offset in enumerate(range(0, len(self.pids), page_size)):
                print("Getting People of Page %s" % page)
                chunk = list(self.pids[offset:offset + page_size])
                placeholders = ",".join(["%s"] * len(chunk))
                self.sql = (
                    "select p.id, p.names, pe.id, pe.pubcount "
                    "from na_person p left join person_update_ext pe on p.id=pe.id "
                    "where (pe.u_citation_gen is null or pe.u_citation_gen < %s) "
                    "and p.id in (" + placeholders + ")")
                with closing(conn.cursor()) as cursor:
                    cursor.execute(self.sql, [self.update_generation] + chunk)
                    data.extend(cursor.fetchall())
                time.sleep(1)

            for pid, names, peid, pubcount in data:
                if self.fix_person_ext and peid is None:
                    self.person_update_tool.insertPersonExt(
                        pid, self.update_generation, pubcount)
                namelist = [name.strip() for name in names.split(",")]
                self.processer(Person(pid, namelist, pubcount))
    except MySQLdb.Error as e:  # @UndefinedVariable
        ExceptionHelper.print_exec(e)
    return data
def flushDBCache(self):
    """Batch-write the crawled pubs cached in memory to the database.

    Drains ``self.pub_db_cache`` (at most as many entries as it held when
    the flush started) and passes the truthy values to
    ``self.pub_dao.batchUpdate(self.gen, pubs)``. Falsy values are dropped.

    If the batch update raises, the drained entries are put back into the
    cache (without overwriting newer entries for the same key) so they are
    not lost. Exceptions are reported through ``ExceptionHelper.print_exec``.
    """
    try:
        if len(self.pub_db_cache) == 0:
            return
        print('start flush to db.')
        pending = []  # (key, pub) pairs taken out of the cache
        for _ in range(len(self.pub_db_cache)):
            try:
                key, pub = self.pub_db_cache.popitem()
            except KeyError:
                # Cache emptied in the meantime; nothing left to drain.
                break
            if pub:
                pending.append((key, pub))
                print("%s updated to db." % pub)
        if pending:
            try:
                self.pub_dao.batchUpdate(self.gen, [pub for _, pub in pending])
            except Exception:
                # Keep the data: restore what was drained, then report.
                for key, pub in pending:
                    self.pub_db_cache.setdefault(key, pub)
                raise
        print('end flush to db.')
    except Exception as e:
        ExceptionHelper.print_exec(e)
def runOriginal(self): while self.extractor.running and not self.ask_to_stop: self.mark() self.extractor.wait_for_pause() # wait if paused # url, url_without_author, pubs_in_url = store.getFromPubQueue() # get url and pubs query, used_pubs = self.store.getFromPubQueue() # get url and pubs if used_pubs is None or len(used_pubs) == 0: print "[ERROR][t_pub_process:%s] Queue is Empty.(%s,%s)" % ( self.name, query, used_pubs) time.sleep(10) continue self.extractor.wait_for_pause() # wait again with self.extractor.busy_semaphore_lock: self.extractor.busy_semaphore += 1 self.extractor.busy_pub_semaphore += 1 pubs_found = None pubs_notfound = None try: all_models = Extractor.getInstance().getNodesByPubs(used_pubs) if all_models is not None: (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) if pubs_found is None or pubs_notfound is None: print '[ERROR][-/-] some pubs, pubs_found is None or pubs_notfound is None, return' return print "{+P}[%s/%s] [found/notfound] pub, query[%s]." % ( len(pubs_found), len(pubs_notfound), query) else: pubs_notfound = used_pubs except Exception, e: ExceptionHelper.print_exec(e) print '-------------------------------------------------------' print 'query:', query print 'all_models', all_models print 'used_pubs', used_pubs print '-------------------------------------------------------' return finally:
def batchUpdate(self, gen, pubs):
    """Batch-update citation count and generation of publications.

    Args:
        gen: generation written to ``u_citation_gen`` of every pub.
        pubs: iterable of objects with ``ncitation`` and ``id`` attributes.

    Returns:
        The value returned by ``executemany`` for the update, or 0 when
        ``pubs`` is empty (no database access happens then). The earlier
        version returned the rowcount of the trailing ``SET`` statement.

    The update runs with autocommit switched off and is committed as one
    unit; on a ``MySQLdb.Error`` it is rolled back, autocommit is switched
    on again, the error is reported and the whole batch is retried.
    """
    from contextlib import closing  # local import: file header not visible here

    params = [(pub.ncitation, gen, pub.id) for pub in pubs]
    if not params:
        return 0
    while True:
        try:
            # closing() releases cursor and connection even when the batch fails.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute("Set AUTOCOMMIT = 0")
                    try:
                        affected = cursor.executemany(
                            "update publication set ncitation=%s, u_citation_gen=%s where id=%s",
                            params)
                        conn.commit()
                    except MySQLdb.Error:
                        conn.rollback()
                        raise
                    finally:
                        # Never hand back a connection with autocommit off.
                        cursor.execute("Set AUTOCOMMIT = 1")
                    return affected
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
def get_author(self, aid, generation):
    """Load one person that still needs updating for ``generation``.

    Args:
        aid: id of the person in ``na_person``.
        generation: the person is returned only when its
            ``u_citation_gen`` is NULL or lower than this value.

    Returns:
        A ``Person`` built from the first matching row (names split on
        commas and stripped), or ``None`` when no row matches or a
        ``MySQLdb.Error`` occurred (the error is reported, not raised).

    ``self.sql`` is set to the statement text; the values are now bound as
    query parameters instead of being formatted into the SQL string.
    """
    from contextlib import closing  # local import: file header not visible here

    print("%s %s" % (aid, generation))
    self.sql = ("select p.id, p.names, pe.pubcount "
                "from na_person p left join person_update_ext pe on p.id=pe.id "
                "where (pe.u_citation_gen is null or pe.u_citation_gen < %s) "
                "and p.id=%s")
    try:
        # Both return paths used to skip close(); closing() covers them.
        with closing(DB.pool().getConnection()) as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute(self.sql, (generation, aid))
                data = cursor.fetchall()
                if cursor.rowcount == 0:
                    return None
                for person_id, names, pubcount in data:
                    namelist = [name.strip() for name in names.split(",")]
                    print('get author:  %s' % (namelist,))
                    return Person(person_id, namelist, pubcount)
    except MySQLdb.Error as e:
        ExceptionHelper.print_exec(e)
    return None
def getPublicationByPerson(self, personId, gen):
    """Get all publications of a person that still need updating.

    Args:
        personId: author id matched against ``na_author2pub.aid``.
        gen: only publications whose ``u_citation_gen`` is NULL or lower
            than this value are returned.

    Returns:
        list of ``Publication`` objects built from the matching rows.

    Any exception is reported and the query is retried until it succeeds.
    NOTE(review): non-database errors are retried as well -- confirm intended.
    """
    from contextlib import closing  # local import: file header not visible here

    sql = ("select p.id, p.year, p.title, p.pubkey, p.jconf, p.authors, "
           "p.ncitation, p.u_citation_gen "
           "from publication p left join na_author2pub a2p on p.id=a2p.pid "
           "where (p.u_citation_gen < %s or p.u_citation_gen is null) "
           "and a2p.aid=%s")
    while True:
        try:
            # closing() releases cursor and connection even when the query fails.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(sql, (gen, personId))
                    pubs = []
                    # The row's generation column is not needed; it gets its
                    # own name so the ``gen`` argument is not overwritten.
                    for (pub_id, year, title, pubkey, jconf, authors,
                         ncitation, _row_gen) in cursor.fetchall():
                        pubs.append(Publication(
                            pub_id, year, title, pubkey, jconf, authors, ncitation))
                    return pubs
        except Exception as e:
            ExceptionHelper.print_exec(e)
def getPublicationByPerson(self, personId, gen):
    """Return the publications of one person that lag behind ``gen``.

    Args:
        personId: author id matched against ``na_author2pub.aid``.
        gen: only publications whose ``u_citation_gen`` is NULL or lower
            than this value are returned.

    Returns:
        list of ``Publication`` objects built from the matching rows.

    Any exception is reported and the query is retried until it succeeds.
    NOTE(review): non-database errors are retried as well -- confirm intended.
    """
    from contextlib import closing  # local import: file header not visible here

    select_sql = (
        "select p.id, p.year, p.title, p.pubkey, p.jconf, p.authors, "
        "p.ncitation, p.u_citation_gen "
        "from publication p left join na_author2pub a2p on p.id=a2p.pid "
        "where (p.u_citation_gen < %s or p.u_citation_gen is null) "
        "and a2p.aid=%s")
    while True:
        try:
            # Cursor and connection are closed on every exit path, including
            # errors, which previously skipped both close() calls.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(select_sql, (gen, personId))
                    rows = cursor.fetchall()
                    # Unpack all eight columns; the trailing generation
                    # column is ignored and no longer shadows ``gen``.
                    return [
                        Publication(pub_id, year, title, pubkey, jconf, authors, ncitation)
                        for (pub_id, year, title, pubkey, jconf, authors,
                             ncitation, _row_gen) in rows
                    ]
        except Exception as e:
            ExceptionHelper.print_exec(e)
def runOriginal(self): while self.extractor.running and not self.ask_to_stop: self.mark() self.extractor.wait_for_pause() # wait if paused # url, url_without_author, pubs_in_url = store.getFromPubQueue() # get url and pubs query, used_pubs = self.store.getFromPubQueue() # get url and pubs if used_pubs is None or len(used_pubs) == 0: print "[ERROR][t_pub_process:%s] Queue is Empty.(%s,%s)" % (self.name, query, used_pubs) time.sleep(10) continue self.extractor.wait_for_pause() # wait again with self.extractor.busy_semaphore_lock: self.extractor.busy_semaphore += 1 self.extractor.busy_pub_semaphore += 1 pubs_found = None pubs_notfound = None try: all_models = Extractor.getInstance().getNodesByPubs(used_pubs) if all_models is not None: (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) if pubs_found is None or pubs_notfound is None: print '[ERROR][-/-] some pubs, pubs_found is None or pubs_notfound is None, return' return print "{+P}[%s/%s] [found/notfound] pub, query[%s]." % (len(pubs_found), len(pubs_notfound), query) else: pubs_notfound = used_pubs except Exception, e: ExceptionHelper.print_exec(e) print '-------------------------------------------------------' print 'query:', query print 'all_models', all_models print 'used_pubs', used_pubs print '-------------------------------------------------------' return finally:
def run(self): while self.extractor.running and not self.ask_to_stop: # acquire person self.mark() self.person = None self.extractor.wait_for_pause() # wait if paused self.person = self.store.getFromPersonQueue(timeout=self.extractor.mgr_interval) self.extractor.wait_for_pause() # wait again if self.person is not None: # process person. with self.extractor.busy_semaphore_lock: self.extractor.busy_semaphore += 1 self.extractor.busy_person_semaphore += 1 try: self.process_person() except Exception, e: ExceptionHelper.print_exec(e) # raise if self.person is not None: print "[ERROR] < pub back person %s:" % self.person self.store.addToPersonQueue(self.person) finally: with self.extractor.busy_semaphore_lock:
def batchUpdate(self, gen, pubs):
    """Write citation counts and the generation of many pubs in one batch.

    Args:
        gen: generation written to ``u_citation_gen`` of every pub.
        pubs: iterable of objects with ``ncitation`` and ``id`` attributes.

    Returns:
        The value returned by ``executemany`` for the update, or 0 when
        ``pubs`` is empty (no database access happens then). The earlier
        version returned the rowcount of the trailing ``SET`` statement.

    The batch runs with autocommit off and is committed as one unit. On a
    ``MySQLdb.Error`` it is rolled back, autocommit is restored, the error
    is reported and the whole batch is retried.
    """
    from contextlib import closing  # local import: file header not visible here

    update_sql = "update publication set ncitation=%s, u_citation_gen=%s where id=%s"
    params = [(pub.ncitation, gen, pub.id) for pub in pubs]
    if not params:
        return 0
    while True:
        try:
            # Cursor and connection are closed on every exit path, including
            # errors, which previously skipped both close() calls.
            with closing(DB.pool().getConnection()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute("Set AUTOCOMMIT = 0")
                    try:
                        updated = cursor.executemany(update_sql, params)
                        conn.commit()
                    except MySQLdb.Error:
                        conn.rollback()
                        raise
                    finally:
                        # Restore autocommit before the connection is released.
                        cursor.execute("Set AUTOCOMMIT = 1")
                    return updated
        except MySQLdb.Error as e:
            ExceptionHelper.print_exec(e)
def run(self): while self.extractor.running and not self.ask_to_stop: # acquire person self.mark() self.person = None self.extractor.wait_for_pause() # wait if paused self.person = self.store.getFromPersonQueue( timeout=self.extractor.mgr_interval) self.extractor.wait_for_pause() # wait again if self.person is not None: # process person. with self.extractor.busy_semaphore_lock: self.extractor.busy_semaphore += 1 self.extractor.busy_person_semaphore += 1 try: self.process_person() except Exception, e: ExceptionHelper.print_exec(e) # raise if self.person is not None: print "[ERROR] < pub back person %s:" % self.person self.store.addToPersonQueue(self.person) finally: with self.extractor.busy_semaphore_lock:
def getFromPubQueue(self):
    """Take the next pub to crawl out of the loose pubs held by the store.

    Waits (polling every ``self.mgr_interval`` seconds) until
    ``self.person_pub_map`` is non-empty or the store stops running. Then,
    under ``self.pub_lock``, it collects the cached pubs of the first person
    that has any, drops persons without valid pubs (together with their
    remaining ``pubmap`` entries), and builds a query from the first
    candidate via ``Extractor.pinMaxQuery``. Pubs used by the query are
    removed from ``self.pubmap``; all other candidates go to
    ``self.putToPubdbcache``.

    Returns:
        ``(query, used_pubs)``, or ``(None, None)`` when no candidate is
        available or an exception occurred. The error path used to return a
        bare ``None``, which broke callers that unpack two values.
    """
    print_verbose = False
    try:
        # Poll until some person has queued pubs (or the store stops).
        while self.running and len(self.person_pub_map) == 0:
            time.sleep(self.mgr_interval)

        self.blocked_pub_t += 1
        with self.pub_lock:
            self.blocked_pub_t -= 1
            pub_candidates = []   # pubs that may go into the next query
            person_invalid = []   # persons to delete after the scan
            for personId, ids in self.person_pub_map.iteritems():
                # A person without ids has nothing to crawl.
                if ids is None or len(ids) == 0:
                    person_invalid.append(personId)
                    continue
                valid_ids = 0
                for pubId in ids:
                    if print_verbose:
                        print('\tcandidate pub %s' % pubId)
                    if pubId in self.pubmap:
                        _pub = self.pubmap[pubId]
                        if _pub:
                            pub_candidates.append(_pub)
                            valid_ids += 1
                            if print_verbose:
                                print('\tcandidate pub %s of person %s.' % (_pub.title, personId))
                if len(pub_candidates) > 0:  # enough
                    if print_verbose:
                        print('\tcandidates enough, length %s ' % len(pub_candidates))
                    break
                if valid_ids == 0:
                    # None of this person's pubs is cached any more.
                    person_invalid.append(personId)

            for personId in person_invalid:
                # The id list may be None (checked above); treat it as empty
                # instead of failing on the iteration.
                for pubId in self.person_pub_map[personId] or ():
                    if pubId in self.pubmap:
                        del self.pubmap[pubId]
                del self.person_pub_map[personId]

            if not pub_candidates:
                print('\t[store] Cannot be here. empty candidates. return null.')
                return None, None

            # Build the query from the first candidate only.
            query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates[:1])
            for pub in used_pubs:
                del self.pubmap[pub.id]

            # Unused pubs wait in the db cache to be written to the database.
            nouse_pubs += pub_candidates[1:]
            if nouse_pubs:
                for pub in nouse_pubs:
                    self.putToPubdbcache(pub)
            return query, used_pubs
    except Exception as e:
        ExceptionHelper.print_exec(e)
        print('Exception occurred: %s. ' % e)
    # Keep the two-value shape callers unpack, also after an error.
    return None, None
def getFromPubQueueBack(self):
    '''
    Take the next combination of pubs to crawl from the loose pubs held in
    the store (several pubs are joined into one longest-possible query
    string). May return None if an error occurs.

    Besides selecting candidates, this variant accumulates the time spent
    waiting for pubs in self.ppt_wait and the time spent acquiring the lock
    in self.ppt_getlock, and prints verbose progress output.

    @return: (url, pubs[]); (None, None) when there are no candidates;
             None (implicitly) when an exception was caught.
    '''
    print_verbose = True
    try:
        # block if no pub items.
        start = time.time()
        while self.running and len(self.person_pub_map) == 0:
            time.sleep(self.mgr_interval)
        dur = (time.time() - start)
        # NOTE(review): dur is a time.time() difference (seconds), although
        # the message below labels it "ms" -- confirm the intended unit.
        if print_verbose: print('TimeUsed:%.4s ms, ' % dur)
        start = time.time()
        self.blocked_pub_t += 1
        with self.pub_lock:  # lock
            self.blocked_pub_t -= 1
            # accumulate wait time and lock-acquisition time
            self.ppt_wait += dur
            self.ppt_getlock += (time.time() - start)
            start = time.time()
            # select candidates
            pub_candidates = []  # pubs that may go into the next query
            person_invalid = []  # persons that are not valid, deleted after the scan
            for personId, ids in self.person_pub_map.iteritems():
                # a person with no ids is scheduled for deletion
                if ids is None or len(ids) == 0:
                    person_invalid.append(personId)
                else:
                    valid_ids = 0
                    for pubId in ids:
                        if print_verbose: print('\tcandidate pub %s' % pubId)
                        if pubId in self.pubmap:
                            _pub = self.pubmap[pubId]
                            if _pub is not None:
                                pub_candidates.append(_pub)
                                valid_ids = valid_ids + 1
                                if print_verbose: print('\tcandidate pub %s of person %s.' % (_pub.title, personId))
                    if len(pub_candidates) > 0:  # enough
                        if print_verbose: print('\tcandidates enough, length %s ' % len(pub_candidates))
                        break
                    if valid_ids == 0:
                        # none of this person's pubs is in pubmap; delete the person
                        person_invalid.append(personId)
            # deletion happens after the scan so the dict is not changed
            # while it is being iterated
            for personId in person_invalid:
                del self.person_pub_map[personId]
            # return (None, None) if nothing is available
            if pub_candidates is None or len(pub_candidates) == 0:
                print('\t[ERR] Cannot be here. empty candidates. return null.')
                return None, None
            # build the query from the first candidate only
            query, used_pubs, nouse_pubs = Extractor.pinMaxQuery(pub_candidates[:1])
            for pub in used_pubs:
                del self.pubmap[pub.id]  # delete pub.
            # Save nouse_pubs to dbcache, waiting to be written to the db.
            nouse_pubs += pub_candidates[1:]
            if nouse_pubs:
                for pub in nouse_pubs:
                    self.putToPubdbcache(pub);
            return query, used_pubs
    except Exception, e:
        # Reported only; the function then falls through and returns None.
        ExceptionHelper.print_exec(e)
        print ('Exception occurred: %s. ' % e)