def __init__(self):
    """Load settings, obtain the HTML retriever and compile the parsing regexes.

    Reads the ``Settings`` singleton, copies its ``debug`` flag, and fetches
    the ``HtmlRetriever`` singleton (passing ``settings.use_proxy``).  When
    ``settings.save_pdflink`` is true the ``PDFLinkSaver`` singleton is
    stored as ``self.pdfcache``; otherwise that attribute is not created.
    """
    self.settings = Settings.getInstance()
    self.debug = self.settings.debug
    self.htmlRetriever = HtmlRetriever.getInstance(self.settings.use_proxy)

    # ``pdfcache`` only exists when PDF-link saving is switched on, so
    # users of it must check ``settings.save_pdflink`` first.
    if self.settings.save_pdflink:
        self.pdfcache = PDFLinkSaver.getInstance()

    # The patterns are raw string literals so that regex escapes such as
    # ``\s`` and ``\[`` reach the ``re`` module untouched instead of relying
    # on Python passing unknown string escapes through.  The compiled
    # pattern text is unchanged.  All patterns are case-insensitive.

    # Captures the text inside a ``gs_a`` div up to the first " - ".
    # ``[^\x00]`` matches any character except NUL, newlines included.
    self.author = re.compile(r'<div class="?gs_a"?>([^\x00]+?) - ', re.I)

    # Captures the href of the anchor that wraps a "[PDF]" ``gs_ctg2`` span
    # inside a ``gs_ggs gs_fl`` div.  The href group is optional.
    self.pdf_block = re.compile(
        r'<div class="?gs_ggs gs_fl"?><a href="?([^\s"]+)?"?[^>]+?>'
        r'<span class="?gs_ctg2"?>\[PDF\]</span>',
        re.I)

    # Matches a whole ``gs_fl`` div up to its first closing tag.  ``.`` does
    # not cross newlines because re.DOTALL is not set.
    self.citation_block = re.compile(r'<div class="?gs_fl"?>.*?</div>', re.I)
def __init__(self):
    """Load settings, obtain the HTML retriever and compile the parsing regexes.

    Reads the ``Settings`` singleton, copies its ``debug`` flag, and fetches
    the ``HtmlRetriever`` singleton (passing ``settings.use_proxy``).  When
    ``settings.save_pdflink`` is true the ``PDFLinkSaver`` singleton is
    stored as ``self.pdfcache``; otherwise that attribute is not created.
    """
    self.settings = Settings.getInstance()
    self.debug = self.settings.debug
    self.htmlRetriever = HtmlRetriever.getInstance(self.settings.use_proxy)

    # ``pdfcache`` only exists when PDF-link saving is switched on, so
    # users of it must check ``settings.save_pdflink`` first.
    if self.settings.save_pdflink:
        self.pdfcache = PDFLinkSaver.getInstance()

    # The patterns are raw string literals so that regex escapes such as
    # ``\s`` and ``\[`` reach the ``re`` module untouched instead of relying
    # on Python passing unknown string escapes through.  The compiled
    # pattern text is unchanged.  All patterns are case-insensitive.

    # Captures the text inside a ``gs_a`` div up to the first " - ".
    # ``[^\x00]`` matches any character except NUL, newlines included.
    self.author = re.compile(r'<div class="?gs_a"?>([^\x00]+?) - ', re.I)

    # Captures the href of the anchor that wraps a "[PDF]" ``gs_ctg2`` span
    # inside a ``gs_ggs gs_fl`` div.  The href group is optional.
    self.pdf_block = re.compile(
        r'<div class="?gs_ggs gs_fl"?><a href="?([^\s"]+)?"?[^>]+?>'
        r'<span class="?gs_ctg2"?>\[PDF\]</span>',
        re.I)

    # Matches a whole ``gs_fl`` div up to its first closing tag.  ``.`` does
    # not cross newlines because re.DOTALL is not set.
    self.citation_block = re.compile(r'<div class="?gs_fl"?>.*?</div>', re.I)
def __init__(self):
    """Print the task banner and initialise all manager state.

    Sets the tunables, thread slots, control flags and data-access
    objects, then finishes by calling ``determineGereration()``.
    """
    # A single parenthesised argument prints exactly as the bare print
    # statement does.
    print("Task: extract paper's citation from schooler.google.com.\n")

    settings = Settings.getInstance()
    self.settings = settings
    self.debug = settings.debug

    # --- tunables -------------------------------------------------------
    self.mgr_interval = 10  # seconds
    # Thread caps for person extraction and publication extraction.  Both
    # may be changed on the fly (e.g. different values for day and night).
    self.max_person_thread = 2
    self.max_pub_thread = 2

    # --- threads ----------------------------------------------------------
    self.t_mgr = None  # management thread slot (MgrThread(self)), not created here
    self.t_provider = None
    # Plain lists; bounded Queue.Queue(maxsize=...) instances were sketched
    # here in commented-out code but are not used.
    self.person_thread_pool = []
    self.pub_thread_pool = []

    # Counter and its lock, used to monitor whether all threads are idle.
    self.busy_semaphore = 0
    self.busy_semaphore_lock = threading.Lock()

    # --- utils ------------------------------------------------------------
    self.store = None

    # --- switches & flags -------------------------------------------------
    self.running = True  # if False, threads stop after their current task
    self.stopped = False  # whether the manager thread can stop
    self.pause = False  # all work paused
    self.waiting_to_finish = False  # no more data: everything is queued
    self.num_report = 0
    self.last_report_time = datetime.datetime.now()  # time of the last interval
    self.restart_all_thread = False
    # Right after leaving pause mode there are many failed tasks, which
    # would immediately trigger another wait.
    self.detect_exit_wait = 0
    self.generation = 0

    # --- data access ------------------------------------------------------
    self.dao = dbs()
    self.personDao = PersonDao()
    self.pubDao = PublicationDao()
    # ``pdfcache`` is only created when PDF-link saving is enabled.
    if settings.save_pdflink:
        self.pdfcache = PDFLinkSaver.getInstance()

    # --- start --------------------------------------------------------------
    self.determineGereration()
def __init__(self):
    """Print the task banner and initialise all manager state.

    Sets the tunables, thread slots, control flags and data-access
    objects, then finishes by calling ``determineGereration()``.
    """
    # A single parenthesised argument prints exactly as the bare print
    # statement does.
    print("Task: extract paper's citation from schooler.google.com.\n")

    settings = Settings.getInstance()
    self.settings = settings
    self.debug = settings.debug

    # --- tunables -------------------------------------------------------
    self.mgr_interval = 10  # seconds
    # Thread caps for person extraction and publication extraction.  Both
    # may be changed on the fly (e.g. different values for day and night).
    self.max_person_thread = 2
    self.max_pub_thread = 2

    # --- threads ----------------------------------------------------------
    self.t_mgr = None  # management thread slot (MgrThread(self)), not created here
    self.t_provider = None
    # Plain lists; bounded Queue.Queue(maxsize=...) instances were sketched
    # here in commented-out code but are not used.
    self.person_thread_pool = []
    self.pub_thread_pool = []

    # Counter and its lock, used to monitor whether all threads are idle.
    self.busy_semaphore = 0
    self.busy_semaphore_lock = threading.Lock()

    # --- utils ------------------------------------------------------------
    self.store = None

    # --- switches & flags -------------------------------------------------
    self.running = True  # if False, threads stop after their current task
    self.stopped = False  # whether the manager thread can stop
    self.pause = False  # all work paused
    self.waiting_to_finish = False  # no more data: everything is queued
    self.num_report = 0
    self.last_report_time = datetime.datetime.now()  # time of the last interval
    self.restart_all_thread = False
    # Right after leaving pause mode there are many failed tasks, which
    # would immediately trigger another wait.
    self.detect_exit_wait = 0
    self.generation = 0

    # --- data access ------------------------------------------------------
    self.dao = dbs()
    self.personDao = PersonDao()
    self.pubDao = PublicationDao()
    # ``pdfcache`` is only created when PDF-link saving is enabled.
    if settings.save_pdflink:
        self.pdfcache = PDFLinkSaver.getInstance()

    # --- start --------------------------------------------------------------
    self.determineGereration()
for key, models in all_models.items(): print key for model in models: print "\t", model else: print 'all_models is None' print '- all_models end ----------------------\n' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub(used_pubs, all_models) for pub in pubs_found: print 'pubs found' , pub print '-' * 100 for pub in pubs_notfound: print 'not found' , pub print '- test done -' if __name__ == '__main__': ''' top pub person, this is in local database. ''' debug = DebugSuit() # debug.debug_person(29463, 'Reihaneh Safavi-Naini', 4) debug.debug_pubs() # end if Settings.getInstance().save_pdflink: PDFLinkSaver.getInstance().flush()
print '\n- all_models ----------------------' if all_models is not None: for key, models in all_models.items(): print key for model in models: print "\t", model else: print 'all_models is None' print '- all_models end ----------------------\n' (pubs_found, pubs_notfound) = PubMatcher.getInstance().matchPub( used_pubs, all_models) for pub in pubs_found: print 'pubs found', pub print '-' * 100 for pub in pubs_notfound: print 'not found', pub print '- test done -' if __name__ == '__main__': ''' top pub person, this is in local database. ''' debug = DebugSuit() # debug.debug_person(29463, 'Reihaneh Safavi-Naini', 4) debug.debug_pubs() # end if Settings.getInstance().save_pdflink: PDFLinkSaver.getInstance().flush()