def test_retrieve_html(self): print 'Test1: test_retrieve_html()' url = '''allintitle:"Augmenting Branching Temporal Logics with Existential Quantification over Atomic Propositions" OR "Branching-Depth Hierarchies" OR "On the Relative Succinctness of Nondeterministic Buchi and co-Buchi Word Automata"''' url2 = "http://scholar.google.com/scholar?hl=en&num=100&q=%s" % url url2 = URLCleaner.encodeUrlForDownload(url2) url2 = '''http://scholar.google.com/scholar?hl=en&num=100&as_subj=eng&q=%22Finding%20the%20Number%20of%20Factors%20of%20a%20Polynomial%22OR%22Probabilistic%20Models%20of%20Database%20Locking:%20Solutions,%20Computational%20Algorithms,%20and%20Asymptotics%22OR%22The%20AWK%20Programming%20Language%22OR%22Factoring%20Polynomials%20Over%20Algebraic%20Number%20Fields%22''' getter = HtmlRetriever(use_proxy=False) print getter.getHtmlRetry(url2, 1)
class TestCase(): def __init__(self): self.settings = Settings.getInstance() self.parsegoogle = GoogleResultParser() self.htmlRetriever = HtmlRetriever(self.settings.use_proxy) self.checker = checker() def test_parse_google_result(self, title1, title2): '''Test method extract_from_source.''' print '-TEST-:', self.test_parse_google_result.__doc__.strip() url = self.checker.pinQuery(title1, title2); print '> url', '-' * 100 print url html = self.htmlRetriever.getHtmlRetry(url, 3, False); print '> html', '-' * 100 print html[0:100] print '\n' print '> blocks', '-' * 100 models = self.parsegoogle.extract_from_source(html) for model in models: print model print '-END TEST-'
class TestCase(): def __init__(self): self.settings = Settings.getInstance() self.parsegoogle = GoogleResultParser() self.htmlRetriever = HtmlRetriever(self.settings.use_proxy) self.checker = checker() def test_parse_google_result(self, title1, title2): '''Test method extract_from_source.''' print '-TEST-:', self.test_parse_google_result.__doc__.strip() url = self.checker.pinQuery(title1, title2) print '> url', '-' * 100 print url html = self.htmlRetriever.getHtmlRetry(url, 3, False) print '> html', '-' * 100 print html[0:100] print '\n' print '> blocks', '-' * 100 models = self.parsegoogle.extract_from_source(html) for model in models: print model print '-END TEST-'
def __init__(self):
    """Set up shared services and pre-compile the Scholar scraping regexes."""
    self.settings = Settings.getInstance()
    self.debug = self.settings.debug
    # Shared (singleton) downloader; proxy usage comes from settings.
    self.htmlRetriever = HtmlRetriever.getInstance(self.settings.use_proxy)
    if self.settings.save_pdflink:
        # Only created when PDF-link persistence is enabled, so any code
        # touching self.pdfcache must also check settings.save_pdflink.
        self.pdfcache = PDFLinkSaver.getInstance()
    # Author segment of a result: text inside <div class="gs_a"> up to the
    # first " - " separator ([^\x00]+? lazily matches any character).
    self.author = re.compile('<div class="?gs_a"?>([^\\x00]+?) - ', re.I)
    # Direct [PDF] link URL captured from the gs_ggs/gs_fl block.
    self.pdf_block = re.compile('<div class="?gs_ggs gs_fl"?><a href="?([^\s"]+)?"?[^>]+?><span class="?gs_ctg2"?>\[PDF\]</span>', re.I)
    # Footer/citation line (<div class="gs_fl">...</div>) of a result entry.
    self.citation_block = re.compile('<div class="?gs_fl"?>.*?</div>', re.I)
def __init__(self):
    """Initialize settings, the shared retriever, and the parsing regexes."""
    self.settings = Settings.getInstance()
    self.debug = self.settings.debug
    # Singleton HTML downloader; proxy behavior driven by settings.
    self.htmlRetriever = HtmlRetriever.getInstance(self.settings.use_proxy)
    if self.settings.save_pdflink:
        # pdfcache exists only when PDF-link saving is turned on.
        self.pdfcache = PDFLinkSaver.getInstance()
    # Captures the author text inside <div class="gs_a"> up to " - ".
    self.author = re.compile('<div class="?gs_a"?>([^\\x00]+?) - ', re.I)
    # Captures the URL of the [PDF] link from the gs_ggs/gs_fl block.
    self.pdf_block = re.compile(
        '<div class="?gs_ggs gs_fl"?><a href="?([^\s"]+)?"?[^>]+?><span class="?gs_ctg2"?>\[PDF\]</span>', re.I)
    # Matches a result's citation/footer div (non-greedy to stop at the
    # first closing </div>).
    self.citation_block = re.compile('<div class="?gs_fl"?>.*?</div>', re.I)
def mgrThreadBody(self): "Management Thread" print "#init:> start mgr & provider." getter = HtmlRetriever.getInstance(self.settings.use_proxy) while self.running or not self.stopped: # interval seconds passed. interval_seconds = (datetime.datetime.now() - self.last_report_time).seconds if interval_seconds == 0: interval_seconds = 1 self.last_report_time = datetime.datetime.now(); try: self.PersonThreadActive = 0 self.PubThreadActive = 0 for x in self.person_thread_pool: if x.check_idle(): self.PersonThreadActive += 1 for y in self.pub_thread_pool: if y.check_idle(): self.PubThreadActive += 1 except Exception: print "ERROR:count errer" print Exception try: # save pdf link if self.settings.save_pdflink: self.pdfcache.flush() except Exception: print "ERROR: pdf link" print Exception message = None # 什么时候重启所有线程&进程 reload_all_thread = False if self.num_report % 1000 == 0: reload_all_thread = True message = "Kill & Restart All Thread." try: # Maintain Threads and get worker threads status. (num_persont_alive, num_pubt_alive) = self._maintainThreadPool(reload_all_thread=False) except Exception: print "ERROR: maintain threads and worker" print Exception try: # Finish Condition. if self._checkFinishCondition(): self.running = False # -> tell all threads finish. message = "MESSAGE! Send terminal signal to all worker thread." except Exception: print "ERROR: condition check" print Exception # if all worker threads stopped, mgrThread can stop. if num_persont_alive == 0 and num_pubt_alive == 0: self.stopped = True message = "Send terminal signal to mgr_thread." 
# check network and count period_success_connection = getter.success_connection_count - getter.last_success_connection_count period_bad_connection = getter.bad_connection_count - getter.last_bad_connection_count total_connections = period_success_connection + period_bad_connection getter.last_success_connection_count = getter.success_connection_count getter.last_bad_connection_count = getter.bad_connection_count average_success_persecond = period_success_connection / float(interval_seconds) average_bad_persecond = period_bad_connection / float(interval_seconds) if False: # 是否Block模式,就是暂停整个程序 if getter.detect_mode: if getter.detect_success_count > 3: getter.leave_detect_mode() self.detect_exit_wait = 1 # 刚出来时,下两轮都不要再进入block模式了。 else: if total_connections * 0.9 < period_bad_connection: if self.detect_exit_wait > 0: print "---- waiting %s rounds ----" % self.detect_exit_wait self.detect_exit_wait -= 1 else: getter.enter_detect_mode() ################ print interval string ################ try: # print report if not getter.detect_mode: str_report = None if not self.pause: self.num_report += 1 str_report = self.num_report else: str_report = "paused" report_strs = [] report_strs.append("-" * 100) report_strs.append("\n") report_strs.append("$&mgr:%s(%s):> " % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), str_report)) report_strs.append("Person(%sT on %s), " % (num_persont_alive, self.store.person_queue.qsize())) report_strs.append("Pub(%sT on %s, %s items), " % (num_pubt_alive, len(self.store.pubmap), len(self.store.person_pub_map))) report_strs.append("DBCache({{{ %s }}}), " % len(self.store.pub_db_cache)) report_strs.append("T(busy/idle)(%s/%s), " % (self.busy_semaphore, self.settings.max_person_thread + self.settings.max_pub_thread - self.busy_semaphore)) report_strs += '\n' report_strs.append("Person(busy/idle)(%s/%s), Pub(busy/idle)(%s/%s)" % (self.busy_person_semaphore, self.settings.max_person_thread-self.busy_person_semaphore, self.busy_pub_semaphore, 
self.settings.max_pub_thread-self.busy_pub_semaphore)) g = getter.success_connection_count b = getter.bad_connection_count t = g + b rate = 0 if(t > 0): rate = g / float(t) report_strs.append("network(g+b=t)=(%s+%s=%s),rate=%.2f " % (g, b , t, rate)) report_strs.append("interval-network(g+b=t)=(%s+%s=%s), " % (period_success_connection, period_bad_connection, total_connections)) report_strs.append("avg:(g%.1f b%.1f in %s seconds.), " % (average_success_persecond, average_bad_persecond, interval_seconds)) report_strs.append("\n") report_strs.append("now have %s child threads, " % self.threadChildren) report_strs.append("active threads (%s person, %s pub) , " % (self.PersonThreadActive, self.PubThreadActive)) report_strs.append("\n") report_strs.append("time:(wait=%.2f, getlock=%.2f, get=%.2f)" % (self.store.ppt_wait, self.store.ppt_getlock, self.store.ppt_get)) if message is not None: report_strs.append("\n") report_strs.append(message) report_strs.append("\n") report_strs.append(" * Process NA Persons : %s.\n" % self.reportPersonProgress(self.generation)) report_strs.append(" * Process Publication: %s.\n" % self.reportPublicationProgress(self.generation)) report_strs.append("-" * 100) report_strs.append("\n") print "".join(report_strs) if (self.num_report%100 == 0): mr = MailReporter() mr.report(report_strs) except Exception: print "ERROR: report error" print Exception try: self.store.flushDBCache() # last flush cache to db. self.store.running = self.running # pass main running thread to Store object. except Exception: print "ERROR: flush db cache" print Exception time.sleep(self.mgr_interval) # interval print "$mgr:> exit."
def __init__(self):
    """Cache global settings and grab the shared HtmlRetriever singleton."""
    settings = Settings.getInstance()
    self.settings = settings
    self.debug = settings.debug
    self.htmlRetriever = HtmlRetriever.getInstance(settings.use_proxy)
def __init__(self):
    """Build a fresh retriever/parser pair; verbose printing enabled."""
    self.debug_print = True
    cfg = Settings.getInstance()
    self.settings = cfg
    self.htmlRetriever = HtmlRetriever(cfg.use_proxy)
    # NOTE: an HTML-validation callback hook was here but is disabled:
    #self.htmlRetriever.validate_html_callback = self.validate_html_callback
    self.parsegoogle = GoogleResultParser()
def mgrThreadBody(self):
    '''Management thread: per round, re-tunes worker pool sizes by time of
    day, flushes the pdf-link cache, maintains the worker pools, checks
    the finish condition, prints a status report, and flushes the DB
    cache.  Loops until self.running is cleared AND every worker thread
    has stopped.

    NOTE(review): unlike other variants of this loop, nothing here is
    wrapped in try/except, so any failure kills the management thread.
    '''
    print "$init:> start mgr & provider."
    getter = HtmlRetriever.getInstance(self.settings.use_proxy)
    while self.running or not self.stopped:
        # Seconds since the previous round; clamped to 1 so the per-second
        # averages below never divide by zero.
        interval_seconds = (datetime.datetime.now() - self.last_report_time).seconds
        if interval_seconds == 0:
            interval_seconds = 1
        self.last_report_time = datetime.datetime.now();
        # --------------------------------------------------------
        # Pool sizing by period of day (conditions are on the current
        # hour; the original inline labels were mislabeled).
        hour = datetime.datetime.now().hour
        if hour <= 9:  # 00h-09h
            self.max_person_thread = 25
            self.max_pub_thread = 75
        elif 22 <= hour:  # 22h-24h
            self.max_person_thread = 16
            self.max_pub_thread = 40
        else:  # 10h-21h
            self.max_person_thread = 22
            self.max_pub_thread = 60
        # NOTE(review): these two lines override the schedule above with a
        # hard-coded 2/2 — looks like a leftover debug throttle; confirm.
        self.max_person_thread = 2
        self.max_pub_thread = 2
        # --------------------------------------------------------
        # save pdf link
        if self.settings.save_pdflink:
            self.pdfcache.flush()
        # message
        message = None
        # Restart all worker threads/processes every 1000 reports.
        reload_all_thread = False
        if self.num_report % 1000 == 0:
            reload_all_thread = True
            message = "Kill & Restart All Thread."
        # Maintain Threads and get worker threads status.
        (num_persont_alive, num_pubt_alive) = self._maintainThreadPool(reload_all_thread)
        # Finish Condition.
        if self._checkFinishCondition():
            self.running = False  # -> tell all threads finish.
            message = "MESSAGE! Send terminal signal to all worker thread."
        # if all worker threads stopped, mgrThread can stop.
        if num_persont_alive == 0 and num_pubt_alive == 0:
            self.stopped = True
            message = "Send terminal signal to mgr_thread."
        # check network and count (per-interval deltas of the retriever's
        # success/failure counters).
        period_success_connection = getter.success_connection_count - getter.last_success_connection_count
        period_bad_connection = getter.bad_connection_count - getter.last_bad_connection_count
        total_connections = period_success_connection + period_bad_connection
        getter.last_success_connection_count = getter.success_connection_count
        getter.last_bad_connection_count = getter.bad_connection_count
        average_success_persecond = period_success_connection / float(interval_seconds)
        average_bad_persecond = period_bad_connection / float(interval_seconds)
        if False:  # Block mode (pauses the whole program) — disabled.
            if getter.detect_mode:
                if getter.detect_success_count > 3:
                    getter.leave_detect_mode()
                    # Just left block mode: skip re-entry for the next rounds.
                    self.detect_exit_wait = 1
            else:
                if total_connections * 0.9 < period_bad_connection:
                    if self.detect_exit_wait > 0:
                        print "---- waiting %s rounds ----" % self.detect_exit_wait
                        self.detect_exit_wait -= 1
                    else:
                        getter.enter_detect_mode()
        # print report (skipped while in detect/block mode).
        if not getter.detect_mode:
            str_report = None
            if not self.pause:
                self.num_report += 1
                str_report = self.num_report
            else:
                str_report = "paused"
            #--------------------------------------------------------------------------------
            # print interval string.
            report_strs = []
            report_strs.append("-" * 100)
            report_strs.append("\n")
            report_strs.append("$&mgr:%s(%s):> " % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), str_report))
            report_strs.append("Person(%sT on %s), " % (num_persont_alive, self.store.person_queue.qsize()))
            report_strs.append("Pub(%sT on %s), " % (num_pubt_alive, len(self.store.pubmap)))
            report_strs.append("DBCache({{{ %s }}}), " % len(self.store.pub_db_cache))
            report_strs.append("T(busy/idle)(%s/%s), " % (self.busy_semaphore, self.max_person_thread + self.max_pub_thread - self.busy_semaphore))
            report_strs.append("\n")
            g = getter.success_connection_count
            b = getter.bad_connection_count
            t = g + b
            rate = 0
            if(t > 0):
                rate = g / float(t)
            report_strs.append("network(g+b=t)=(%s+%s=%s),rate=%.2f " % (g, b , t, rate))
            report_strs.append("interval-network(g+b=t)=(%s+%s=%s), " % (period_success_connection, period_bad_connection, total_connections))
            report_strs.append("avg:(g%.1f b%.1f in %s seconds.), " % (average_success_persecond, average_bad_persecond, interval_seconds))
            report_strs.append("\n")
            report_strs.append("time:(wait=%.2f, getlock=%.2f, get=%.2f)" % (self.store.ppt_wait, self.store.ppt_getlock, self.store.ppt_get))
            if message is not None:
                report_strs.append("\n")
                report_strs.append(message)
            report_strs.append("\n")
            report_strs.append(" * Process NA Persons : %s.\n" % self.reportPersonProgress(self.generation))
            report_strs.append(" * Process Publication: %s.\n" % self.reportPublicationProgress(self.generation))
            report_strs.append("-" * 100)
            report_strs.append("\n")
            print "".join(report_strs)
            #--------------------------------------------------------------------------------
        # flush db cache
        self.store.flushDBCache()  # last flush cache to db.
        self.store.running = self.running  # pass main running thread to Store object.
        time.sleep(self.mgr_interval)  # interval
    print "$mgr:> exit."
def test_retrieve_html2(self): url = '''allintitle:"Augmenting Branching Temporal Logics with Existential Quantification over Atomic Propositions" OR "Branching-Depth Hierarchies" OR "On the Relative Succinctness of Nondeterministic Buchi and co-Buchi Word Automata"''' url2 = "http://scholar.google.com/scholar?hl=en&num=100&q=%s" % url url2 = URLCleaner.encodeUrlForDownload(url2) getter = HtmlRetriever(use_proxy=True) print getter.getHtmlRetry(url2, 1)
class checker():
    """Checks whether two paper titles co-occur in search results, via the
    Google AJAX search API, filtering against the EXCLUDE_NOISE_SITE set."""

    def __init__(self):
        self.debug_print = True
        self.settings = Settings.getInstance()
        self.htmlRetriever = HtmlRetriever(self.settings.use_proxy)
        # Validation hook intentionally disabled:
        #self.htmlRetriever.validate_html_callback = self.validate_html_callback
        self.parsegoogle = GoogleResultParser()

    def pinAjaxQuery(self, title1, title2):
        """Build the AJAX-search URL for the query '"title1" AND "title2"'.

        Only spaces and double quotes are percent-encoded; other reserved
        characters in the titles pass through unescaped.
        """
        query = "".join(('"', title1, '" AND "', title2 , '"'))
        query = query.replace(" ", "%20").replace("\"", "%22")
        url = self.settings.ajaxtemplate % query
        return url

    def isInSamePageAjax(self, title1, title2, withProxy=False):
        """Return True iff any hit's visible domain is in EXCLUDE_NOISE_SITE.

        Walks up to 4 result pages (the AJAX API returns 4 hits per page).
        NOTE(review): the retry loops below spin until getHTMLByGoogleAjax
        returns non-None — there is no retry cap, so a permanently failing
        request hangs this call.
        """
        url = self.pinAjaxQuery(title1, title2)
        print '[GET]:%s' % url
        json_result = None
        while json_result is None:
            json_result = self.htmlRetriever.getHTMLByGoogleAjax(url, 3, withProxy)
        max_pages = 4 # 4 pages * 4 hits = up to 16 results examined
        pages = 0
        if 'pages' in json_result['responseData']['cursor']:
            pages = len(json_result['responseData']['cursor']['pages'])
        if max_pages > pages:
            # Never request more pages than Google reports available.
            max_pages = pages
        current_page = 0
        while(current_page < max_pages):
            # NOTE: this local shadows the stdlib module name `json`.
            json = None
            if current_page == 0:
                json = json_result  # first page was already fetched above
            else:
                start = current_page * 4  # API pages by 4 results
                page_url = "".join((url, "&start=%s" % start))
                print '[GET]:%s' % page_url
                while json is None:
                    json = self.htmlRetriever.getHTMLByGoogleAjax(page_url, 3, withProxy)
            # process json
            google_results = json['responseData']['results']
            if len(google_results):
                for gr in google_results:
                    print "--- ", gr
                    if gr['visibleUrl'] in EXCLUDE_NOISE_SITE:
                        print '+ found domain:', gr['visibleUrl']
                        return True
#                    else:
#                        print '- found domain:', gr['visibleUrl']
#                        return False
            current_page += 1
        return False

    def foundInSamePage(self, url):
        """Return (True, domain) if url's leading host part is a noise site.

        Assumes `url` carries no scheme prefix: the domain is everything
        before the first '/'.  With no '/', domain stays '' and the result
        is (False, '').
        """
        idx = url.find("/");
        domain = ''
        if idx > 0:
            domain = url[0:idx]
        if domain in EXCLUDE_NOISE_SITE:
            return True, domain
        else:
            return False, domain
def __init__(self):
    """Wire up settings, parser, retriever, and the co-occurrence checker."""
    conf = Settings.getInstance()
    self.settings = conf
    self.parsegoogle = GoogleResultParser()
    self.htmlRetriever = HtmlRetriever(conf.use_proxy)
    self.checker = checker()
class checker():
    """Tests whether two titles land on the same (noise-site) result page
    using the Google AJAX search API and the EXCLUDE_NOISE_SITE domain set."""

    def __init__(self):
        self.debug_print = True
        self.settings = Settings.getInstance()
        self.htmlRetriever = HtmlRetriever(self.settings.use_proxy)
        # HTML-validation hook deliberately left disabled:
        #self.htmlRetriever.validate_html_callback = self.validate_html_callback
        self.parsegoogle = GoogleResultParser()

    def pinAjaxQuery(self, title1, title2):
        """Compose the AJAX-search URL for '"title1" AND "title2"'.

        Percent-encodes spaces and double quotes only; anything else in
        the titles is left as-is.
        """
        query = "".join(('"', title1, '" AND "', title2, '"'))
        query = query.replace(" ", "%20").replace("\"", "%22")
        url = self.settings.ajaxtemplate % query
        return url

    def isInSamePageAjax(self, title1, title2, withProxy=False):
        """Return True iff some hit's visible domain is in EXCLUDE_NOISE_SITE.

        Scans at most 4 result pages of 4 hits each.  NOTE(review): both
        inner fetch loops retry until a non-None response arrives, with no
        upper bound — a persistently failing endpoint blocks forever.
        """
        url = self.pinAjaxQuery(title1, title2)
        print '[GET]:%s' % url
        json_result = None
        while json_result is None:
            json_result = self.htmlRetriever.getHTMLByGoogleAjax(
                url, 3, withProxy)
        max_pages = 4  # 4 pages * 4 hits = up to 16 results examined
        pages = 0
        if 'pages' in json_result['responseData']['cursor']:
            pages = len(json_result['responseData']['cursor']['pages'])
        if max_pages > pages:
            # Cap at the page count Google actually reports.
            max_pages = pages
        current_page = 0
        while (current_page < max_pages):
            # NOTE: this local shadows the stdlib module name `json`.
            json = None
            if current_page == 0:
                json = json_result  # first page fetched above
            else:
                start = current_page * 4  # API paginates by 4 results
                page_url = "".join((url, "&start=%s" % start))
                print '[GET]:%s' % page_url
                while json is None:
                    json = self.htmlRetriever.getHTMLByGoogleAjax(
                        page_url, 3, withProxy)
            # process json
            google_results = json['responseData']['results']
            if len(google_results):
                for gr in google_results:
                    print "--- ", gr
                    if gr['visibleUrl'] in EXCLUDE_NOISE_SITE:
                        print '+ found domain:', gr['visibleUrl']
                        return True
#                    else:
#                        print '- found domain:', gr['visibleUrl']
#                        return False
            current_page += 1
        return False

    def foundInSamePage(self, url):
        """Return (True, domain) when url's host part is a noise site.

        Assumes no scheme prefix: domain = text before the first '/'.
        Without a '/', domain stays '' and the result is (False, '').
        """
        idx = url.find("/")
        domain = ''
        if idx > 0:
            domain = url[0:idx]
        if domain in EXCLUDE_NOISE_SITE:
            return True, domain
        else:
            return False, domain
''' Created on Nov 2, 2011 @author: bogao ''' from com.lish.ajia.util.web import HtmlRetriever class TestWebGet: pass if __name__ == '__main__': url = 'http://scholar.google.com/scholar?hl=en&num=100&q=%22A%20fast%20algorithm%20for%20computing%20distance%20spectrum%20of%20convolutional%20codes.%22OR%22A%20new%20upper%20bound%20on%20the%20first-event%20error%20probability%20for%20maximum-likelihood%20decoding%20of%20fixed%20binary%20convolutional%20codes.%22' htmlRetriever = HtmlRetriever.getInstance(False) html = htmlRetriever.getHtmlRetry(url) print html[0:300], "..."
def mgrThreadBody(self): '''Management Thread ''' print "$init:> start mgr & provider." getter = HtmlRetriever.getInstance(self.settings.use_proxy) while self.running or not self.stopped: # interval seconds passed. interval_seconds = (datetime.datetime.now() - self.last_report_time).seconds if interval_seconds == 0: interval_seconds = 1 self.last_report_time = datetime.datetime.now() # -------------------------------------------------------- # strength by period of day. hour = datetime.datetime.now().hour if hour <= 9: # 12h-9h self.max_person_thread = 25 self.max_pub_thread = 75 elif 22 <= hour: # 9h-22h self.max_person_thread = 16 self.max_pub_thread = 40 else: # 22h-24h self.max_person_thread = 22 self.max_pub_thread = 60 self.max_person_thread = 2 self.max_pub_thread = 2 # -------------------------------------------------------- try: # save pdf link if self.settings.save_pdflink: self.pdfcache.flush() except e: print "ERROR: pdf link" print e # message message = None # 什么时候重启所有线程&进程。 reload_all_thread = False if self.num_report % 1000 == 0: reload_all_thread = True message = "Kill & Restart All Thread." try: # Maintain Threads and get worker threads status. (num_persont_alive, num_pubt_alive) = self._maintainThreadPool(reload_all_thread) except e: print "ERROR: maintain threads and worker" print e try: # Finish Condition. if self._checkFinishCondition(): self.running = False # -> tell all threads finish. message = "MESSAGE! Send terminal signal to all worker thread." except e: print "ERROR: condition check" print e # if all worker threads stopped, mgrThread can stop. if num_persont_alive == 0 and num_pubt_alive == 0: self.stopped = True message = "Send terminal signal to mgr_thread." 
# check network and count period_success_connection = getter.success_connection_count - getter.last_success_connection_count period_bad_connection = getter.bad_connection_count - getter.last_bad_connection_count total_connections = period_success_connection + period_bad_connection getter.last_success_connection_count = getter.success_connection_count getter.last_bad_connection_count = getter.bad_connection_count average_success_persecond = period_success_connection / float( interval_seconds) average_bad_persecond = period_bad_connection / float( interval_seconds) if False: # 是否Block模式,就是暂停整个程序 if getter.detect_mode: if getter.detect_success_count > 3: getter.leave_detect_mode() self.detect_exit_wait = 1 # 刚出来时,下两轮都不要再进入block模式了。 else: if total_connections * 0.9 < period_bad_connection: if self.detect_exit_wait > 0: print "---- waiting %s rounds ----" % self.detect_exit_wait self.detect_exit_wait -= 1 else: getter.enter_detect_mode() try: # print report if not getter.detect_mode: str_report = None if not self.pause: self.num_report += 1 str_report = self.num_report else: str_report = "paused" #-------------------------------------------------------------------------------- # print interval string. 
report_strs = [] report_strs.append("-" * 100) report_strs.append("\n") report_strs.append( "$&mgr:%s(%s):> " % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), str_report)) report_strs.append( "Person(%sT on %s), " % (num_persont_alive, self.store.person_queue.qsize())) report_strs.append( "Pub(%sT on %s), " % (num_pubt_alive, len(self.store.pubmap))) report_strs.append("DBCache({{{ %s }}}), " % len(self.store.pub_db_cache)) report_strs.append( "T(busy/idle)(%s/%s), " % (self.busy_semaphore, self.max_person_thread + self.max_pub_thread - self.busy_semaphore)) report_strs.append("\n") g = getter.success_connection_count b = getter.bad_connection_count t = g + b rate = 0 if (t > 0): rate = g / float(t) report_strs.append("network(g+b=t)=(%s+%s=%s),rate=%.2f " % (g, b, t, rate)) report_strs.append( "interval-network(g+b=t)=(%s+%s=%s), " % (period_success_connection, period_bad_connection, total_connections)) report_strs.append( "avg:(g%.1f b%.1f in %s seconds.), " % (average_success_persecond, average_bad_persecond, interval_seconds)) report_strs.append("\n") report_strs.append( "time:(wait=%.2f, getlock=%.2f, get=%.2f)" % (self.store.ppt_wait, self.store.ppt_getlock, self.store.ppt_get)) if message is not None: report_strs.append("\n") report_strs.append(message) report_strs.append("\n") report_strs.append( " * Process NA Persons : %s.\n" % self.reportPersonProgress(self.generation)) report_strs.append( " * Process Publication: %s.\n" % self.reportPublicationProgress(self.generation)) report_strs.append("-" * 100) report_strs.append("\n") print "".join(report_strs) #-------------------------------------------------------------------------------- except e: print "ERROR: report error" print e try: # flush db cache self.store.flushDBCache() # last flush cache to db. self.store.running = self.running # pass main running thread to Store object. except e: print "ERROR: flush db cache" print e time.sleep(self.mgr_interval) # interval print "$mgr:> exit."
def __init__(self):
    """Set up the shared retriever (with HTML validation hook) and parser."""
    self.debug_print = True
    prefs = Settings.getInstance()
    self.settings = prefs
    retriever = HtmlRetriever.getInstance(prefs.use_proxy)
    # Route every fetched page through our validator before acceptance.
    retriever.validate_html_callback = self.validate_html_callback
    self.htmlRetriever = retriever
    self.parsegoogle = GoogleResultParser()