def test_matchPub(self):
    '''Match one person's stored publications against models parsed from saved pages.

    Reads the saved pages ``<person_name>_page_<n>.html`` (n = 0, 1, 2) from
    ``self.settings.source_dir``, merges the models extracted from each page
    into one map and passes that map, together with the publications returned
    by ``PublicationDao.getPublicationByPerson``, to ``self.matchPub``.

    Returns the publications that ``self.matchPub`` reported as not matched.
    '''
    self.extractor = Extractor().getInstance()
    pubdao = PublicationDao()
    person_id = 13419
    person_name = 'jie tang'

    # Read sources from files.
    all_models = {}
    for page in range(0, 3):
        filename = "".join((person_name, '_page_', str(page), '.html'))
        # "with" guarantees the handle is closed; the previous code leaked it.
        with open(os.path.join(self.settings.source_dir, filename), 'r') as f:
            html = f.read()
        models = self.extractor.extract_from_source(html)
        if models is not None:
            # Name-mangled call of the private Extractor.__merge_into_extractedmap.
            self.extractor._Extractor__merge_into_extractedmap(all_models, models)
    print('Total found DEBUG %s items.' % len(all_models))

    # part 2
    pubs = pubdao.getPublicationByPerson(person_id, self.settings.generation)
    printout = False  # switch to True to dump the models and publications
    if printout:
        for key, models in all_models.items():
            print("%s %s %s" % (key, " --> ", models))
        print('===================')
        for pub in pubs:
            print(pub)

    (pubs_matched, pubs_not_matched) = self.matchPub(pubs, all_models)
    print('%s %s %s' % ('- test done -', len(pubs_matched), len(pubs_not_matched)))
    return pubs_not_matched
def test_matchPub(self):
    '''Match one person's stored publications against models parsed from saved pages.

    Reads the saved pages ``<person_name>_page_<n>.html`` (n = 0, 1, 2) from
    ``self.settings.source_dir``, merges the models extracted from each page
    into one map and passes that map, together with the publications returned
    by ``PublicationDao.getPublicationByPerson``, to ``self.matchPub``.

    Returns the publications that ``self.matchPub`` reported as not matched.
    '''
    self.extractor = Extractor().getInstance()
    pubdao = PublicationDao()
    person_id = 13419
    person_name = 'jie tang'

    # Read sources from files.
    all_models = {}
    for page in range(0, 3):
        filename = "".join((person_name, '_page_', str(page), '.html'))
        # "with" guarantees the handle is closed; the previous code leaked it.
        with open(os.path.join(self.settings.source_dir, filename), 'r') as f:
            html = f.read()
        models = self.extractor.extract_from_source(html)
        if models is not None:
            # Name-mangled call of the private Extractor.__merge_into_extractedmap.
            self.extractor._Extractor__merge_into_extractedmap(all_models, models)
    print('Total found DEBUG %s items.' % len(all_models))

    # part 2
    pubs = pubdao.getPublicationByPerson(person_id, self.settings.generation)
    printout = False  # switch to True to dump the models and publications
    if printout:
        for key, models in all_models.items():
            print("%s %s %s" % (key, " --> ", models))
        print('===================')
        for pub in pubs:
            print(pub)

    (pubs_matched, pubs_not_matched) = self.matchPub(pubs, all_models)
    print('%s %s %s' % ('- test done -', len(pubs_matched), len(pubs_not_matched)))
    return pubs_not_matched
def test_getpublications(self):
    '''Test get all publications from database.'''
    # NOTE: the docstring above is echoed by the first print, so it is output text.
    banner = TestCase.test_getpublications.__doc__
    print('%s %s' % ('-TEST-:', banner))

    # 13423 is the id for jie tang; the second argument is the current generation.
    publications = PublicationDao().getPublicationByPerson(
        13423, self.settings.generation)
    for publication in publications:
        print(publication)

    print('-END TEST-')
def __init__(self, generation, mgr_interval=5):
    '''Create the queue, maps, locks, timing counters and DAOs held by this object.

    generation   -- kept as ``self.gen``.
    mgr_interval -- kept as ``self.mgr_interval``; defaults to 5.
    '''
    config = Settings.getInstance()
    self.settings = config
    self.debug = config.debug
    self.gen = generation
    self.mgr_interval = mgr_interval

    # Person side: bounded queue plus an id set for quick membership checks.
    self.person_queue = Queue.Queue(maxsize=config.person_cache_size)
    self.person_id_set = set()  # sync with queue, quick contains using id.

    # Publication side.
    self.pubmap = {}  # {id -> pub}
    self.person_pub_map = {}  # {person_id -> [pub_id_list]} - person to pub_ids
    self.pub_db_cache = {}

    # Locks guarding the publication structures.
    self.pub_lock = threading.Lock()
    self.pub_dbcache_lock = threading.RLock()

    # Synced with the main running flag in the mgr interval thread.
    self.running = True

    # Timing accumulators.
    self.blocked_pub_t = 0  # time sum
    self.ppt_wait = 0
    self.ppt_getlock = 0
    self.ppt_get = 0

    # Data-access objects.
    self.person_dao = PersonDao()
    self.pub_dao = PublicationDao()
def __init__(self):
    '''Initialise configuration, thread bookkeeping, flags and DAOs, then
    resolve the generation to work on via ``determineGereration``.
    '''
    # Fixed typo in the banner: the target site is scholar.google.com.
    print("Task: extract paper's citation from scholar.google.com.\n")
    self.settings = Settings.getInstance()
    self.debug = self.settings.debug

    # Configs
    self.mgr_interval = 10  # seconds
    self.max_person_thread = 2  # max threads used to extract person,
    self.max_pub_thread = 2  # these 2 values can be modified on the fly. diff in day or night

    # Threads and configurations
    self.t_mgr = None  # management thread, created later
    self.t_provider = None
    self.person_thread_pool = []  # = Queue.Queue(maxsize=self.max_person_thread)
    self.pub_thread_pool = []  # = Queue.Queue(maxsize=self.max_pub_thread)
    self.busy_semaphore = 0  # used to monitor whether all threads are idle
    self.busy_semaphore_lock = threading.Lock()  # guards busy_semaphore

    # utils
    self.store = None

    # switchers & flags
    self.running = True  # If False, threads will stop after current task.
    self.stopped = False  # If MGRThread can stop.
    self.pause = False  # All works paused.
    self.waiting_to_finish = False  # No additional data. all added to queue.

    self.num_report = 0
    self.last_report_time = datetime.datetime.now()  # time of the previous interval
    self.restart_all_thread = False
    # Right after leaving pause mode there are many failed tasks, which would
    # immediately trigger waiting again.
    self.detect_exit_wait = 0

    self.generation = 0
    self.dao = dbs()
    self.personDao = PersonDao()
    self.pubDao = PublicationDao()

    # Always define the attribute so it exists even when pdf-link saving is off.
    self.pdfcache = None
    if self.settings.save_pdflink:
        self.pdfcache = PDFLinkSaver.getInstance()

    # start
    self.determineGereration()
def __init__(self, extractorInstance):
    '''Prepare a worker thread bound to the given extractor.

    extractorInstance -- owning extractor; its ``store`` attribute is reused here.
    '''
    threading.Thread.__init__(self)

    # Collaborators.
    self.extractor = extractorInstance
    self.store = extractorInstance.store
    self.pubdao = PublicationDao()

    # Per-thread state.
    self.person = None  # set this and start.
    self.ask_to_stop = False
    self.last_action = datetime.datetime.now()
def __init__(self):
    '''Initialise configuration, thread bookkeeping, flags and DAOs, then
    resolve the generation to work on via ``determineGereration``.
    '''
    # Fixed typo in the banner: the target site is scholar.google.com.
    print("Task: extract paper's citation from scholar.google.com.\n")
    self.settings = Settings.getInstance()
    self.debug = self.settings.debug

    # Configs
    self.mgr_interval = 10  # seconds
    self.max_person_thread = 2  # max threads used to extract person,
    self.max_pub_thread = 2  # these 2 values can be modified on the fly. diff in day or night

    # Threads and configurations
    self.t_mgr = None  # management thread, created later
    self.t_provider = None
    self.person_thread_pool = []  # = Queue.Queue(maxsize=self.max_person_thread)
    self.pub_thread_pool = []  # = Queue.Queue(maxsize=self.max_pub_thread)
    self.busy_semaphore = 0  # used to monitor whether all threads are idle
    self.busy_semaphore_lock = threading.Lock()  # guards busy_semaphore

    # utils
    self.store = None

    # switchers & flags
    self.running = True  # If False, threads will stop after current task.
    self.stopped = False  # If MGRThread can stop.
    self.pause = False  # All works paused.
    self.waiting_to_finish = False  # No additional data. all added to queue.

    self.num_report = 0
    self.last_report_time = datetime.datetime.now()  # time of the previous interval
    self.restart_all_thread = False
    # Right after leaving pause mode there are many failed tasks, which would
    # immediately trigger waiting again.
    self.detect_exit_wait = 0

    self.generation = 0
    self.dao = dbs()
    self.personDao = PersonDao()
    self.pubDao = PublicationDao()

    # Always define the attribute so it exists even when pdf-link saving is off.
    self.pdfcache = None
    if self.settings.save_pdflink:
        self.pdfcache = PDFLinkSaver.getInstance()

    # start
    self.determineGereration()
class DebugSuit():
    '''Manual debugging helpers that run extraction + matching and print the outcome.'''

    def __init__(self):
        self.extractor = Extractor.getInstance()
        self.matcher = PubMatcher.getInstance()
        self.pubdao = PublicationDao()

    def _print_match_result(self, pubs_found, pubs_notfound):
        '''Print the matched publications, a separator line, then the unmatched ones.'''
        for pub in pubs_found:
            print('pubs found %s' % (pub,))
        print('-' * 100)
        for pub in pubs_notfound:
            print('not found %s' % (pub,))

    def debug_person(self, person_id, person_name, generation):
        '''Run the two-stage matching for one person and print the results.

        Stage 1 matches the person's publications against the nodes fetched by
        person name; stage 2 builds a query from the still-unmatched
        publications (``Extractor.pinMaxQuery``) and matches again.
        '''
        print('- DEBUG Person "%s" -:' % person_name)
        pubs = self.pubdao.getPublicationByPerson(person_id, generation)
        all_models = self.extractor.getNodesByPersonName(person_name)

        # if True:#print all all_models
        #     print '-' * 100, 'This is all_models'
        #     for key, models in all_models.items():
        #         print key, ':'
        #         for model in models:
        #             print '\t', model.readable_title, '(', model, ')'
        #     print '=' * 100 , 'all_models print done'

        (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models)
        self._print_match_result(pubs_found, pubs_notfound)

        print('|||||||||||||||||||||||||||| get by pubs ')
        # todo here should be a while
        query, used_pubs = Extractor.pinMaxQuery(pubs_notfound)
        print('%s pub, query: %s' % (len(used_pubs), query))

        all_models = self.extractor.getNodesByPubs(used_pubs)
        (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
        self._print_match_result(pubs_found, pubs_notfound)
        print('- END DEBUG -')

    def debug_pubs(self):
        '''Debug get by pub'''
        # NOTE: the docstring above is printed below, so it is output text.
        print('-TEST-: %s' % self.debug_pubs.__doc__.strip())

        #----------------------------------------------------
        pub_candidates = []
        # group 1
        # pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5))
        # pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5))
        # pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5))
        # pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5))
        # group 2
        # pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5))
        # pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5))
        # group 3
        # pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5))
        # group 4
        pub_candidates.append(Publication(-1, 2000, 'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices', "pubkey", -1, "Dusan Guller", -5))

        extractor = Extractor.getInstance()
        query, used_pubs = Extractor.pinMaxQuery(pub_candidates)
        print('%s pub, query: %s' % (len(used_pubs), query))

        #
        # Get WEB PAGE
        #
        use_web = True  # False: parse the saved page debug_pubs.txt instead
        if use_web:
            all_models = extractor.getNodesByPubs(used_pubs)
        else:
            # "with" closes the handle; the previous code never closed it.
            with open('debug_pubs.txt', 'r') as f:
                html = f.read()
            models = self.extractor.extract_from_source(html)
            all_models = self.extractor._Extractor__merge_into_extractedmap(None, models)

        print('\n- all_models ----------------------')
        if all_models is not None:
            for key, models in all_models.items():
                print(key)
                for model in models:
                    # The Python 2 print statement adds no space after "\t".
                    print("\t%s" % (model,))
        else:
            print('all_models is None')
        print('- all_models end ----------------------\n')

        (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models)
        self._print_match_result(pubs_found, pubs_notfound)
        print('- test done -')
def __init__(self):
    '''Fetch the extractor and matcher instances and create the publication DAO.'''
    extractor = Extractor.getInstance()
    matcher = PubMatcher.getInstance()
    publication_dao = PublicationDao()
    self.extractor, self.matcher, self.pubdao = extractor, matcher, publication_dao
class GoogleScholarExtractor:
    '''Author gb<elivoa[AT]gmail.com> v0.4.0'''

    def __init__(self):
        '''Initialise counters, thread pools, flags and DAOs, then resolve the
        generation to work on via ``determineGereration``.
        '''
        print("Task: extract paper's citation from scholar.google.com.\n")
        self.settings = Settings.getInstance()
        self.debug = self.settings.debug
        self.threadChildren = 0
        self.PersonThreadActive = 0
        self.PubThreadActive = 0
        self.mgr_interval = 8  # seconds
        self.t_mgr = None  # management thread, created in start()
        self.t_provider = None
        self.person_thread_pool = []  # = Queue.Queue(maxsize=self.settings.max_person_thread)
        self.pub_thread_pool = []  # = Queue.Queue(maxsize=self.settings.max_pub_thread)
        self.busy_semaphore = 0  # used to monitor whether all threads are idle
        self.busy_semaphore_lock = threading.Lock()  # guards the counters above
        self.busy_person_semaphore = 0
        self.busy_pub_semaphore = 0
        self.store = None
        self.running = True  # If False, threads will stop after current task.
        self.stopped = False  # If MGRThread can stop.
        self.pause = False  # All works paused.
        self.waiting_to_finish = False  # No additional data. all added to queue.
        self.num_report = 0
        self.last_report_time = datetime.datetime.now()  # time of the previous interval
        self.restart_all_thread = False
        # Right after leaving pause mode there are many failed tasks, which
        # would immediately trigger waiting again.
        self.detect_exit_wait = 0
        self.generation = 0
        self.dao = dbs()
        self.personDao = PersonDao()
        self.pubDao = PublicationDao()
        # Always define the attribute so it exists even when pdf-link saving is off.
        self.pdfcache = None
        if self.settings.save_pdflink:
            self.pdfcache = PDFLinkSaver.getInstance()
        self.determineGereration()

    def determineGereration(self):
        '''Determine if the program could run or wait, or load continue status.

        Reads the required generation and the min/max generation from
        ``self.dao``, decides whether to continue the current generation or
        start a new one, and prints the current progress.
        '''
        self.generation = self.dao.getGeneration()
        currentMinGen = self.dao.getMinGenerationInDB()
        currentMaxGen = self.dao.getMaxGenerationInDB()
        print('=====================================================================')
        print(" * Required update_generation is: [ %s ]." % (self.generation))
        print(" * Current min update_generation is: [ %s ]." % (currentMinGen))

        # process generation
        if currentMinGen < self.generation == currentMaxGen or self.generation > currentMaxGen:
            print(" * Not finished task, continue to finish current generation.")
        elif self.generation == currentMinGen == currentMaxGen:
            print(" * Just start new generation")
            self.generation = self.generation + 1
            self.dao.setGeneration(self.generation)
        else:
            print("=== Error: generation(%s) bigger than currentMinGen(%s)" % (self.generation, currentMinGen))
            self.generation = currentMaxGen
            self.dao.setGeneration(self.generation)

        # count task progress
        print(" * Process NA Persons : %s." % self.reportPersonProgress(self.generation))
        print(" * Process Publication: %s." % self.reportPublicationProgress(self.generation))
        print('=====================================================================')

    def _formatProgress(self, total, left):
        '''Format "[ pct%] done/total"; an empty total is reported as 0% instead of dividing by zero.'''
        done = total - left
        progress = float(done) / total * 100.0 if total else 0.0
        return "[%6.2f%%] %s/%s" % (progress, done, total)

    def reportPersonProgress(self, udpate_generation):
        ''' Return String that report progress of person. '''
        total = self.personDao.getPersonTotalCount()
        left = self.personDao.getPersonLeftCount(udpate_generation)
        return self._formatProgress(total, left)

    def reportPublicationProgress(self, udpate_generation):
        ''' Return String that report progress of publication. '''
        total = self.pubDao.getTotalCount()
        left = self.pubDao.getLeftCount(udpate_generation)
        return self._formatProgress(total, left)

    def start(self):
        '''Extract Citation Multithread - Start main threads...
            - Manager Threads
            - Person Provider Thread
            - Publication Download Thread
            - ...
        Blocks until the management thread exits.
        '''
        self.store = Store(self.generation, self.mgr_interval)
        self.t_mgr = threading.Thread(target=self.mgrThreadBody, args=(), name='thread-mgr')  # use method mgr.
        self.t_mgr.start()
        self.t_provider = ProviderThread(self, None)
        self.t_provider.start()

        # waiting to finish
        self.t_mgr.join()
        print("============ ALL END ============")

    def wait_for_pause(self):
        '''Sleep in ``mgr_interval`` steps for as long as ``self.pause`` is set.'''
        while self.pause:
            time.sleep(self.mgr_interval)

    #
    # Management Thread
    #
    def mgrThreadBody(self):
        '''Management Thread.

        Every ``mgr_interval`` seconds: counts workers, flushes the pdf link
        cache, maintains the thread pools, checks the finish condition, prints
        (and periodically mails) a status report and flushes the DB cache.
        Each step is best-effort: a failure is printed and the loop continues.
        '''
        print("#init:> start mgr & provider.")
        getter = HtmlRetriever.getInstance(self.settings.use_proxy)

        # Unknown until the first successful maintenance pass. Previously these
        # were unbound if that pass raised, which killed this thread.
        num_persont_alive = None
        num_pubt_alive = None

        while self.running or not self.stopped:
            # interval seconds passed.
            interval_seconds = (datetime.datetime.now() - self.last_report_time).seconds
            if interval_seconds == 0:
                interval_seconds = 1
            self.last_report_time = datetime.datetime.now()

            try:
                self.PersonThreadActive = 0
                self.PubThreadActive = 0
                for x in self.person_thread_pool:
                    if x.check_idle():
                        self.PersonThreadActive += 1
                for y in self.pub_thread_pool:
                    if y.check_idle():
                        self.PubThreadActive += 1
            except Exception as e:
                # Print the caught error itself (the old code printed the Exception class).
                print("ERROR:count errer")
                print(e)

            try:  # save pdf link
                if self.settings.save_pdflink:
                    self.pdfcache.flush()
            except Exception as e:
                print("ERROR: pdf link")
                print(e)

            message = None

            # When to restart all threads & processes.
            reload_all_thread = False
            if self.num_report % 1000 == 0:
                reload_all_thread = True
                message = "Kill & Restart All Thread."

            try:
                # Maintain Threads and get worker threads status.
                # Reload is deliberately passed as False: it is disabled in
                # _maintainThreadPool as well.
                (num_persont_alive, num_pubt_alive) = self._maintainThreadPool(reload_all_thread=False)
            except Exception as e:
                print("ERROR: maintain threads and worker")
                print(e)

            try:
                # Finish Condition.
                if self._checkFinishCondition():
                    self.running = False  # -> tell all threads finish.
                    message = "MESSAGE! Send terminal signal to all worker thread."
            except Exception as e:
                print("ERROR: condition check")
                print(e)

            # if all worker threads stopped, mgrThread can stop.
            if num_persont_alive == 0 and num_pubt_alive == 0:
                self.stopped = True
                message = "Send terminal signal to mgr_thread."

            # check network and count
            period_success_connection = getter.success_connection_count - getter.last_success_connection_count
            period_bad_connection = getter.bad_connection_count - getter.last_bad_connection_count
            total_connections = period_success_connection + period_bad_connection
            getter.last_success_connection_count = getter.success_connection_count
            getter.last_bad_connection_count = getter.bad_connection_count
            average_success_persecond = period_success_connection / float(interval_seconds)
            average_bad_persecond = period_bad_connection / float(interval_seconds)

            if False:  # Block mode (pause the entire program); currently disabled.
                if getter.detect_mode:
                    if getter.detect_success_count > 3:
                        getter.leave_detect_mode()
                        # Just left: do not re-enter block mode right away.
                        self.detect_exit_wait = 1
                else:
                    if total_connections * 0.9 < period_bad_connection:
                        if self.detect_exit_wait > 0:
                            print("---- waiting %s rounds ----" % self.detect_exit_wait)
                            self.detect_exit_wait -= 1
                        else:
                            getter.enter_detect_mode()

            ################ print interval string ################
            try:  # print report
                if not getter.detect_mode:
                    str_report = None
                    if not self.pause:
                        self.num_report += 1
                        str_report = self.num_report
                    else:
                        str_report = "paused"

                    report_strs = []
                    report_strs.append("-" * 100)
                    report_strs.append("\n")
                    report_strs.append("$&mgr:%s(%s):> " % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), str_report))
                    report_strs.append("Person(%sT on %s), " % (num_persont_alive, self.store.person_queue.qsize()))
                    report_strs.append("Pub(%sT on %s, %s items), " % (num_pubt_alive, len(self.store.pubmap), len(self.store.person_pub_map)))
                    report_strs.append("DBCache({{{ %s }}}), " % len(self.store.pub_db_cache))
                    report_strs.append("T(busy/idle)(%s/%s), " % (self.busy_semaphore, self.settings.max_person_thread + self.settings.max_pub_thread - self.busy_semaphore))
                    report_strs.append("\n")
                    report_strs.append("Person(busy/idle)(%s/%s), Pub(busy/idle)(%s/%s)" % (
                        self.busy_person_semaphore,
                        self.settings.max_person_thread - self.busy_person_semaphore,
                        self.busy_pub_semaphore,
                        self.settings.max_pub_thread - self.busy_pub_semaphore))

                    g = getter.success_connection_count
                    b = getter.bad_connection_count
                    t = g + b
                    rate = 0
                    if t > 0:
                        rate = g / float(t)
                    report_strs.append("network(g+b=t)=(%s+%s=%s),rate=%.2f " % (g, b, t, rate))
                    report_strs.append("interval-network(g+b=t)=(%s+%s=%s), " % (period_success_connection, period_bad_connection, total_connections))
                    report_strs.append("avg:(g%.1f b%.1f in %s seconds.), " % (average_success_persecond, average_bad_persecond, interval_seconds))
                    report_strs.append("\n")
                    report_strs.append("now have %s child threads, " % self.threadChildren)
                    report_strs.append("active threads (%s person, %s pub) , " % (self.PersonThreadActive, self.PubThreadActive))
                    report_strs.append("\n")
                    report_strs.append("time:(wait=%.2f, getlock=%.2f, get=%.2f)" % (self.store.ppt_wait, self.store.ppt_getlock, self.store.ppt_get))
                    if message is not None:
                        report_strs.append("\n")
                        report_strs.append(message)
                    report_strs.append("\n")
                    report_strs.append(" * Process NA Persons : %s.\n" % self.reportPersonProgress(self.generation))
                    report_strs.append(" * Process Publication: %s.\n" % self.reportPublicationProgress(self.generation))
                    report_strs.append("-" * 100)
                    report_strs.append("\n")
                    print("".join(report_strs))

                    # Mail the report whenever the report counter is a multiple of 100.
                    if self.num_report % 100 == 0:
                        mr = MailReporter()
                        mr.report(report_strs)
            except Exception as e:
                print("ERROR: report error")
                print(e)

            try:
                self.store.flushDBCache()  # last flush cache to db.
                self.store.running = self.running  # pass main running thread to Store object.
            except Exception as e:
                print("ERROR: flush db cache")
                print(e)

            time.sleep(self.mgr_interval)  # interval

        print("$mgr:> exit.")

    def _checkFinishCondition(self):
        '''@return: true if all can stop.'''
        # Provider must have reported finish, and we must not be paused.
        if not self.waiting_to_finish or self.pause:
            return False
        # All threads' status must be idle.
        if self.busy_semaphore != 0:
            return False
        # Task queue and caches must be empty.
        if self.store.person_queue.empty() \
                and len(self.store.pubmap) == 0 \
                and len(self.store.pub_db_cache) == 0:
            left = self.pubDao.getLeftCount(self.generation)
            if left == 0:  # really finished.
                return True
        return False

    def _adjustThreadNum(self):
        '''Set the thread limits in ``self.settings`` according to the hour of day.'''
        # strength by period of day.
        hour = datetime.datetime.now().hour
        if hour <= 9:
            self.settings.max_person_thread = 25
            self.settings.max_pub_thread = 75
        elif 22 <= hour:
            self.settings.max_person_thread = 16
            self.settings.max_pub_thread = 40
        else:
            self.settings.max_person_thread = 22
            self.settings.max_pub_thread = 60

    def _superviseWorkers(self, pool, limit, factory, name_prefix):
        '''Start missing workers and replace dead or non-idle-checked ones in the
        first ``limit`` slots of ``pool``; return how many were counted alive.
        '''
        # Pad the pool with empty slots up to the limit.
        while len(pool) < limit:
            pool.append(None)

        alive = 0
        for idx in range(0, limit):
            t = pool[idx]
            if t is not None and t.is_alive() and t.check_idle():
                alive += 1
                continue
            if not self.running:
                continue  # shutting down: neither start nor replace workers
            if t is not None:
                # Existing worker is dead or failed check_idle(): stop it first.
                killedname = t.name
                t.stop()
                print("$mgr/thread:> kill thread %s" % killedname)
            t = factory()
            t.name = name_prefix + str(idx)
            pool[idx] = t
            t.start()
            with self.busy_semaphore_lock:
                self.threadChildren += 1
            alive += 1
        return alive

    def _maintainThreadPool(self, reload_all_thread):
        ''' Maintain ThreadPool, detect and restart, and set running threads on the fly.

        ``reload_all_thread`` is accepted but ignored: reloading is disabled
        (see the ``if False`` block below).
        Returns ``(num_persont_alive, num_pubt_alive)``.
        '''
        # if reload_all_thread: # kill all thread first.
        if False:  # we don't want reload function until fix the bug : pub thread idle
            for idx_pub_t in range(0, self.settings.max_pub_thread):
                t = None
                if len(self.pub_thread_pool) <= idx_pub_t:
                    self.pub_thread_pool.append(t)
                else:
                    t = self.pub_thread_pool[idx_pub_t]
                if t is not None:
                    t.ask_to_stop = True
            self.pub_thread_pool = []

        # Factories are lambdas so the worker classes are only looked up when
        # a thread actually has to be created.
        num_persont_alive = self._superviseWorkers(
            self.person_thread_pool, self.settings.max_person_thread,
            lambda: PersonProcessThread(self), 'person-thread-')
        num_pubt_alive = self._superviseWorkers(
            self.pub_thread_pool, self.settings.max_pub_thread,
            lambda: PubProcessThread(self), 'pub-thread-')
        return (num_persont_alive, num_pubt_alive)
class GoogleScholarExtractor: '''Author gb<elivoa[AT]gmail.com> v0.4.0''' def __init__(self): print "Task: extract paper's citation from schooler.google.com.\n" self.settings = Settings.getInstance() self.debug = self.settings.debug # Configs self.mgr_interval = 10 # seconds self.max_person_thread = 2 # max threads used to extract person, self.max_pub_thread = 2 # these 2 values can modified on the fly. diff in day or night # Threads and configurations self.t_mgr = None # MgrThread(self) # management thread, create self.t_provider = None self.person_thread_pool = [ ] #= Queue.Queue(maxsize=self.max_person_thread) self.pub_thread_pool = [] #= Queue.Queue(maxsize=self.max_pub_thread) self.busy_semaphore = 0 # 用来监视是否所有的线程都处于Idle状态 self.busy_semaphore_lock = threading.Lock() # 用来监视是否所有的线程都处于Idle状态 # utils self.store = None # switchers & flags self.running = True # If False, threads will stop after current task. self.stopped = False # If MGRThread can stop. self.pause = False # All works paused. self.waiting_to_finish = False # No additional data. all added to queue. self.num_report = 0 self.last_report_time = datetime.datetime.now() # 上次Interval的时间 self.restart_all_thread = False self.detect_exit_wait = 0 # 当刚刚从pause模式退出来时,会有大量failed的任务,会导致立刻再次等待 self.generation = 0 self.dao = dbs() self.personDao = PersonDao() self.pubDao = PublicationDao() if self.settings.save_pdflink: self.pdfcache = PDFLinkSaver.getInstance() # start self.determineGereration() # determine if the program could run or wait, or load continue status. def determineGereration(self): self.generation = self.dao.getGeneration() currentMinGen = self.dao.getMinGenerationInDB() currentMaxGen = self.dao.getMaxGenerationInDB() print '=====================================================================' print " * Required update_generation is: [ %s ]." % (self.generation) print " * Current min update_generation is: [ %s ]." 
% (currentMinGen) # process generation if currentMinGen < self.generation == currentMaxGen or self.generation > currentMaxGen: print " * Not finished task, continue to finish current generation." elif self.generation == currentMinGen == currentMaxGen: print " * Just start new generation" self.generation = self.generation + 1 self.dao.setGeneration(self.generation) else: print "=== Error: generation(%s) bigger than currentMinGen(%s)" % ( self.generation, currentMinGen) self.generation = currentMaxGen self.dao.setGeneration(self.generation) # count task progress print " * Process NA Persons : %s." % self.reportPersonProgress( self.generation) print " * Process Publication: %s." % self.reportPublicationProgress( self.generation) print '=====================================================================' def reportPersonProgress(self, udpate_generation): ''' Return String that report progress of person. ''' total = self.personDao.getPersonTotalCount() left = self.personDao.getPersonLeftCount(udpate_generation) progress = float(total - left) / total * 100.0 return "[%6.2f%%] %s/%s" % (progress, total - left, total) def reportPublicationProgress(self, udpate_generation): ''' Return String that report progress of person. ''' total = self.pubDao.getTotalCount() left = self.pubDao.getLeftCount(udpate_generation) progress = float(total - left) / total * 100.0 return "[%6.2f%%] %s/%s" % (progress, total - left, total) def start(self): '''Extract Citation Multithread - Start main threads... - Manager Threads - Person Provider Thread - Publication Download Thread - ... ''' self.store = Store(self.generation, self.mgr_interval) self.t_mgr = threading.Thread(target=self.mgrThreadBody, args=(), name='thread-mgr') # use method mgr. 
self.t_mgr.start() self.t_provider = ProviderThread(self, None) self.t_provider.start() # waiting to finish self.t_mgr.join() print "============ ALL END ============" def wait_for_pause(self): while self.pause: time.sleep(self.mgr_interval) # # Management Thread # def mgrThreadBody(self): '''Management Thread ''' print "$init:> start mgr & provider." getter = HtmlRetriever.getInstance(self.settings.use_proxy) while self.running or not self.stopped: # interval seconds passed. interval_seconds = (datetime.datetime.now() - self.last_report_time).seconds if interval_seconds == 0: interval_seconds = 1 self.last_report_time = datetime.datetime.now() # -------------------------------------------------------- # strength by period of day. hour = datetime.datetime.now().hour if hour <= 9: # 12h-9h self.max_person_thread = 25 self.max_pub_thread = 75 elif 22 <= hour: # 9h-22h self.max_person_thread = 16 self.max_pub_thread = 40 else: # 22h-24h self.max_person_thread = 22 self.max_pub_thread = 60 self.max_person_thread = 2 self.max_pub_thread = 2 # -------------------------------------------------------- try: # save pdf link if self.settings.save_pdflink: self.pdfcache.flush() except e: print "ERROR: pdf link" print e # message message = None # 什么时候重启所有线程&进程。 reload_all_thread = False if self.num_report % 1000 == 0: reload_all_thread = True message = "Kill & Restart All Thread." try: # Maintain Threads and get worker threads status. (num_persont_alive, num_pubt_alive) = self._maintainThreadPool(reload_all_thread) except e: print "ERROR: maintain threads and worker" print e try: # Finish Condition. if self._checkFinishCondition(): self.running = False # -> tell all threads finish. message = "MESSAGE! Send terminal signal to all worker thread." except e: print "ERROR: condition check" print e # if all worker threads stopped, mgrThread can stop. if num_persont_alive == 0 and num_pubt_alive == 0: self.stopped = True message = "Send terminal signal to mgr_thread." 
# check network and count period_success_connection = getter.success_connection_count - getter.last_success_connection_count period_bad_connection = getter.bad_connection_count - getter.last_bad_connection_count total_connections = period_success_connection + period_bad_connection getter.last_success_connection_count = getter.success_connection_count getter.last_bad_connection_count = getter.bad_connection_count average_success_persecond = period_success_connection / float( interval_seconds) average_bad_persecond = period_bad_connection / float( interval_seconds) if False: # 是否Block模式,就是暂停整个程序 if getter.detect_mode: if getter.detect_success_count > 3: getter.leave_detect_mode() self.detect_exit_wait = 1 # 刚出来时,下两轮都不要再进入block模式了。 else: if total_connections * 0.9 < period_bad_connection: if self.detect_exit_wait > 0: print "---- waiting %s rounds ----" % self.detect_exit_wait self.detect_exit_wait -= 1 else: getter.enter_detect_mode() try: # print report if not getter.detect_mode: str_report = None if not self.pause: self.num_report += 1 str_report = self.num_report else: str_report = "paused" #-------------------------------------------------------------------------------- # print interval string. 
report_strs = [] report_strs.append("-" * 100) report_strs.append("\n") report_strs.append( "$&mgr:%s(%s):> " % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), str_report)) report_strs.append( "Person(%sT on %s), " % (num_persont_alive, self.store.person_queue.qsize())) report_strs.append( "Pub(%sT on %s), " % (num_pubt_alive, len(self.store.pubmap))) report_strs.append("DBCache({{{ %s }}}), " % len(self.store.pub_db_cache)) report_strs.append( "T(busy/idle)(%s/%s), " % (self.busy_semaphore, self.max_person_thread + self.max_pub_thread - self.busy_semaphore)) report_strs.append("\n") g = getter.success_connection_count b = getter.bad_connection_count t = g + b rate = 0 if (t > 0): rate = g / float(t) report_strs.append("network(g+b=t)=(%s+%s=%s),rate=%.2f " % (g, b, t, rate)) report_strs.append( "interval-network(g+b=t)=(%s+%s=%s), " % (period_success_connection, period_bad_connection, total_connections)) report_strs.append( "avg:(g%.1f b%.1f in %s seconds.), " % (average_success_persecond, average_bad_persecond, interval_seconds)) report_strs.append("\n") report_strs.append( "time:(wait=%.2f, getlock=%.2f, get=%.2f)" % (self.store.ppt_wait, self.store.ppt_getlock, self.store.ppt_get)) if message is not None: report_strs.append("\n") report_strs.append(message) report_strs.append("\n") report_strs.append( " * Process NA Persons : %s.\n" % self.reportPersonProgress(self.generation)) report_strs.append( " * Process Publication: %s.\n" % self.reportPublicationProgress(self.generation)) report_strs.append("-" * 100) report_strs.append("\n") print "".join(report_strs) #-------------------------------------------------------------------------------- except e: print "ERROR: report error" print e try: # flush db cache self.store.flushDBCache() # last flush cache to db. self.store.running = self.running # pass main running thread to Store object. except e: print "ERROR: flush db cache" print e time.sleep(self.mgr_interval) # interval print "$mgr:> exit." 
def _checkFinishCondition(self):
    '''
    Tell the manager whether everything may stop.

    @return: True only when waiting_to_finish is set, the manager is not
        paused, busy_semaphore is 0, the person queue, the pub map and the
        pub DB cache are all empty, and the publication DAO reports zero
        items left for the current generation. False otherwise.
    '''
    # The finish flag must be set and the manager must not be paused.
    if not self.waiting_to_finish or self.pause:
        return False
    # Every worker thread must be idle.
    if self.busy_semaphore != 0:
        return False
    # Nothing may still be pending in memory.
    store = self.store
    if not store.person_queue.empty():
        return False
    if len(store.pubmap) != 0 or len(store.pub_db_cache) != 0:
        return False
    # Only query the DAO once all in-memory work is drained.
    return self.pubDao.getLeftCount(self.generation) == 0


def _maintainThreadPool(self, reload_all_thread):
    '''
    Maintain both worker pools: restart dead workers, start missing ones and
    trim workers beyond the configured limits, so limits can change on the fly.

    @param reload_all_thread: when true, every pub worker is first asked to
        stop (ask_to_stop = True) and the pub pool is rebuilt from scratch.
    @return: tuple (num_persont_alive, num_pubt_alive) counting the workers
        inside each limit that are alive or were just started.
    '''
    if reload_all_thread:
        # Flag *every* pub worker, including those beyond the current limit;
        # otherwise surplus workers would be forgotten while still running.
        for worker in self.pub_thread_pool:
            if worker is not None:
                worker.ask_to_stop = True
        self.pub_thread_pool = []

    def supervise(pool, limit, spawn, name_prefix):
        # Fill/restart slots [0, limit) of pool, trim the rest, return alive count.
        alive = 0
        for slot in range(0, limit):
            if len(pool) <= slot:
                pool.append(None)  # grow the pool with an empty slot.
            worker = pool[slot]
            if worker is not None and worker.is_alive():
                alive += 1
            elif self.running:
                # Empty or dead slot: start a replacement (only while running).
                worker = spawn()
                worker.name = name_prefix + str(slot)
                pool[slot] = worker
                worker.start()
                alive += 1
        # Kill surplus workers. Pop from the tail so only entries beyond the
        # limit are removed; workers inside the limit were just counted alive.
        keep = max(limit, 0)
        while len(pool) > keep:
            surplus = pool.pop()
            if surplus is None:
                continue  # slot was never started, nothing to stop.
            surplus.stop()
            print("$mgr/thread:> kill thread %s" % surplus.name)
        return alive

    # The lambdas keep the thread classes lazily resolved: they are only
    # looked up when a worker really has to be created.
    num_persont_alive = supervise(
        self.person_thread_pool, self.max_person_thread,
        lambda: PersonProcessThread(self), 'person-thread-')
    num_pubt_alive = supervise(
        self.pub_thread_pool, self.max_pub_thread,
        lambda: PubProcessThread(self), 'pub-thread-')
    return (num_persont_alive, num_pubt_alive)
class DebugSuit(): def __init__(self): self.extractor = Extractor.getInstance() self.matcher = PubMatcher.getInstance() self.pubdao = PublicationDao() def debug_person(self, person_id, person_name, generation): '''Test method extract_from_source.''' print '- DEBUG Person "%s" -:' % person_name pubs = self.pubdao.getPublicationByPerson(person_id, generation) all_models = self.extractor.getNodesByPersonName(person_name) # if True:#print all all_models # print '-' * 100, 'This is all_models' # for key, models in all_models.items(): # print key, ':' # for model in models: # print '\t', model.readable_title, '(', model, ')' # print '=' * 100 , 'all_models print done' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '|||||||||||||||||||||||||||| get by pubs ' # todo here should be a while query, used_pubs = Extractor.pinMaxQuery(pubs_notfound) print '%s pub, query: %s' % (len(used_pubs), query) all_models = self.extractor.getNodesByPubs(used_pubs) (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- END DEBUG -' def debug_pubs(self): '''Debug get by pub''' print '-TEST-:', self.debug_pubs.__doc__.strip() #---------------------------------------------------- pub_candidates = [] # group 1 # pub_candidates.append(Publication(-1, 2000, 'Some Reflections on Proof Transformations', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Theorem Proving via General Mappings', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'Connections and Higher-Order Logic', "pubkey", -1, "Peter B. Andrews", -5)) # pub_candidates.append(Publication(-1, 2000, 'The TPS Theorem Proving System', "pubkey", -1, "Peter B. 
Andrews,Sunil Issar,Dan Nesmith,Frank Pfenning", -5)) # group 2 # pub_candidates.append(Publication(-1, 2000, 'Linearizable concurrent objects', "pubkey", -1, "MP Herlihy, JM Wing", -5)) # pub_candidates.append(Publication(-1, 2000, 'Protein structure prediction using a combination of sequence homology and global energy minimization I. Global energy minimization of surface loops', "pubkey", -1, "MJ Dudek, HA Scheraga", -5)) # group 3 # pub_candidates.append(Publication(-1, 2000, 'Implementation of Prolog databases and database operation builtins in the WAM-Plus model', "pubkey", -1, "Z Chenxi, C Yungui, L Bo", -5)) # group 4 pub_candidates.append( Publication( -1, 2000, 'Procedural Semantics for Fuzzy Disjunctive Programs on Residuated Lattices', "pubkey", -1, "Dusan Guller", -5)) extractor = Extractor.getInstance() query, used_pubs = Extractor.pinMaxQuery(pub_candidates) print '%s pub, query: %s' % (len(used_pubs), query) # # Get WEB PAGE # use_web = True # *************** if use_web: all_models = extractor.getNodesByPubs(used_pubs) else: f = file('debug_pubs.txt', 'r') html = f.read() models = self.extractor.extract_from_source(html) all_models = self.extractor._Extractor__merge_into_extractedmap( None, models) print '\n- all_models ----------------------' if all_models is not None: for key, models in all_models.items(): print key for model in models: print "\t", model else: print 'all_models is None' print '- all_models end ----------------------\n' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -'
def __init__(self, aid, generation):
    '''
    Store the author id and generation, then resolve the author and create
    the publication DAO.

    @param aid: author id; kept as self.aid and passed to self.get_author.
    @param generation: generation; kept as self.generation and passed to
        self.get_author.
    '''
    self.aid, self.generation = aid, generation
    # self.person holds whatever get_author returns for this id/generation.
    # The lookup runs before the DAO is created, so statement order is kept.
    self.person = self.get_author(aid, generation)
    self.pubdao = PublicationDao()