def get_context(url, matchtext, before, after):
    """Return the text surrounding the first occurrence of matchtext.

    The page at `url` is read through get_cached_url, converted with
    html_to_text, and searched for `matchtext`.  The window that is cut
    out starts `before` characters ahead of the match position and ends
    `after` characters past the match position (both measured from the
    START of the match), clamped to the bounds of the text.  The window
    is passed through trim_to_words before being returned.

    Returns an empty string when `matchtext` does not occur in the
    converted text.
    """
    # assumes html_to_text returns a str-like object (it is searched with
    # .find and sliced below) -- TODO confirm
    text = html_to_text(get_cached_url(url).read())
    pos = text.find(matchtext)
    if pos < 0:
        # find() reports "not found" as -1.  Feeding that into the slice
        # below would yield text[0:after - 1], i.e. the head of the page
        # presented as context for a match that does not exist.
        return ""
    window = text[max(0, pos - before):min(pos + after, len(text))]
    return trim_to_words(window)
def get_boss(query, start=0, count=10):
    """Fetch one page of BOSS results for `query`.

    The request URL is built by get_boss_url(query, start, count) and
    read through uc.get_cached_url with pause=True.  The response is
    parsed with XML and the "start" attribute of its "resultset_web"
    element is compared against the requested `start`.

    Returns whatever dom.findAll("result") yields when the reported
    start equals the requested one, otherwise None.
    """
    request_url = get_boss_url(query, start, count)
    response = uc.get_cached_url("boss", request_url, pause=True)
    tree = XML(response)
    reported_start = int(tree.find("resultset_web").attr("start"))
    if reported_start != start:
        # The service answered for a different offset than the one asked for.
        return None
    return tree.findAll("result")
def get_boss(query, start=0, count=10):
    """Return the "result" nodes of one BOSS results page, or None.

    Builds the URL with get_boss_url(query, start, count), reads it via
    uc.get_cached_url("boss", url) and parses the response with XML.
    The page is only accepted when the "start" attribute on its
    "resultset_web" element, converted to int, equals `start`; any other
    value makes the function return None.
    """
    page_url = get_boss_url(query, start, count)
    document = XML(uc.get_cached_url("boss", page_url))
    resultset = document.find("resultset_web")
    start_matches = int(resultset.attr("start")) == start
    return document.findAll("result") if start_matches else None
def run(self): global totaldownloaded global totalfiles global urls global timeouts print "thread running" while len(urls) > 0: url = urls.pop() if url.endswith("pdf"): continue try: content = uc.get_cached_url("pages",url,400000,2).read() totaldownloaded += len(content) totalfiles += 1 if totalfiles % 10 == 0: print "size:",len(content),"avg:",(totaldownloaded/totalfiles),"tot:",totaldownloaded,"cnt:",totalfiles,"tmo:",timeouts,"url:",url[:50] except: timeouts += 1
def run(self): global totaldownloaded global totalfiles global urls global timeouts print "thread running" while len(urls) > 0: url = urls.pop() if url.endswith("pdf"): continue try: content = uc.get_cached_url("pages", url, 400000, 2).read() totaldownloaded += len(content) totalfiles += 1 if totalfiles % 10 == 0: print "size:", len(content), "avg:", ( totaldownloaded / totalfiles ), "tot:", totaldownloaded, "cnt:", totalfiles, "tmo:", timeouts, "url:", url[: 50] except: timeouts += 1
def get_context(url, matchtext, before, after):
    """Return a snippet of page text around the first hit of matchtext.

    Reads `url` through get_cached_url, converts the HTML with
    html_to_text and locates `matchtext` in the result.  The snippet
    spans from `before` characters before the start of the hit to
    `after` characters after the start of the hit, clamped to the text
    bounds, and is run through trim_to_words before being returned.

    If `matchtext` is not present in the converted text, an empty string
    is returned.
    """
    html = get_cached_url(url).read()
    # assumes html_to_text returns a str-like object supporting .find and
    # slicing -- TODO confirm
    page_text = html_to_text(html)
    hit = page_text.find(matchtext)
    if hit == -1:
        # Without this guard the -1 sentinel is used as an index and the
        # slice silently becomes page_text[0:after - 1]: unrelated text
        # from the top of the page.
        return ""
    lo = max(0, hit - before)
    hi = min(hit + after, len(page_text))
    return trim_to_words(page_text[lo:hi])
def boss_counts_for_pattern(pattern):
    """Return the total hit count reported for `pattern`.

    Asks for the first 50 results (boss.get_boss_url(pattern, 0, 50)),
    reading the response through uc.get_cached_url, so that page is
    downloaded as part of the call.  The count is the "totalhits"
    attribute of the response's "resultset_web" element, converted to
    int.
    """
    search_url = boss.get_boss_url(pattern, 0, 50)
    response = uc.get_cached_url("boss", search_url)
    resultset = XML(response).find("resultset_web")
    return int(resultset.attr("totalhits"))
def boss_counts_for_pattern(pattern):
    """Return the hit count reported for `pattern` searched as a phrase.

    The pattern is wrapped in double quotes before the URL is built, and
    the first 50 results are requested (start 0, count 50) through
    uc.get_cached_url, so that page is downloaded as part of the call.
    The count is read from the "deephits" attribute of the
    "resultset_web" element and returned as an int.
    """
    quoted = '"' + pattern + '"'
    search_url = boss.get_boss_url(quoted, 0, 50)
    tree = XML(uc.get_cached_url("boss", search_url))
    deep_hits = tree.find("resultset_web").attr("deephits")
    return int(deep_hits)