def download_loop(): global outstore global backoff global errors global running global timeouts global totalfiles global started # backoff = 0 while running: lock.acquire() line = infile.next().split("\t") lock.release() try: (date, url, query) = [nlptools.trim_ends(x) for x in line] except: print "badline:", line if backoff > 0: time.sleep(backoff) if url.endswith("pdf"): continue try: if not started: started = time.time() urlin = urllib2.urlopen(url, None, 4) content = urlin.read(400000) urlin.close() lock.acquire() getclaims.process_content(outstore, date, url, query, content) lock.release() totalfiles += 1 rate = totalfiles / (time.time() - started) if totalfiles % 100 == 0: print "rate:", rate, "bak:", backoff, "err:", errors, "cnt:", totalfiles, "tmo:", timeouts if backoff > 0: backoff -= 1 except Exception as e: dsc = e.__str__() print e lock.acquire() outstore.emit(url, {'error': dsc}) lock.release() if "timed out" in dsc: timeouts += 1 if "name resolution" in dsc: print "-- name resolution overload, backing off --" backoff += 1 print "backoff = ", backoff
def download_loop(): global outstore global backoff global errors global running global timeouts global totalfiles global started # backoff = 0 while running: lock.acquire() line = infile.next().split("\t") lock.release() try: (date,url,query) = [nlptools.trim_ends(x) for x in line] except: print "badline:",line if backoff > 0: time.sleep(backoff) if url.endswith("pdf"): continue try: if not started: started = time.time() urlin = urllib2.urlopen(url,None,4) content = urlin.read(400000) urlin.close() lock.acquire() getclaims.process_content(outstore,date,url,query,content) lock.release() totalfiles += 1 rate = totalfiles/(time.time()-started) if totalfiles % 100 == 0: print "rate:",rate,"bak:",backoff,"err:",errors,"cnt:",totalfiles,"tmo:",timeouts if backoff > 0: backoff -= 1 except Exception as e: dsc = e.__str__() print e lock.acquire() outstore.emit(url,{'error':dsc}) lock.release() if "timed out" in dsc: timeouts+=1 if "name resolution" in dsc: print "-- name resolution overload, backing off --" backoff +=1 print "backoff = ",backoff
def process_urls_from_file(filename,outname): outstore = mapreduce.OutStore(outname) for line in file(filename): (date,url,query) = [nlptools.trim_ends(x) for x in line.split("\t")] print "url",url process_url(outstore,date,url,query)
def classify_good_both(row):
    """Combined classifier: the row must satisfy classify_good_wiki AND
    f.is_good applied to the trimmed third field.  Short-circuits: when
    the wiki check is falsy, its value is returned unchanged and the
    second check is never evaluated.
    """
    wiki_verdict = classify_good_wiki(row)
    return wiki_verdict and f.is_good(nt.trim_ends(row[2]))
def process_urls_from_file(filename, outname): outstore = mapreduce.OutStore(outname) for line in file(filename): (date, url, query) = [nlptools.trim_ends(x) for x in line.split("\t")] print "url", url process_url(outstore, date, url, query)