Exemplo n.º 1
0
def download_loop():
    global outstore
    global backoff
    global errors
    global running
    global timeouts
    global totalfiles
    global started

    #	backoff = 0
    while running:
        lock.acquire()
        line = infile.next().split("\t")
        lock.release()
        try:
            (date, url, query) = [nlptools.trim_ends(x) for x in line]
        except:
            print "badline:", line
        if backoff > 0:
            time.sleep(backoff)
        if url.endswith("pdf"): continue
        try:
            if not started:
                started = time.time()
            urlin = urllib2.urlopen(url, None, 4)
            content = urlin.read(400000)
            urlin.close()
            lock.acquire()
            getclaims.process_content(outstore, date, url, query, content)
            lock.release()
            totalfiles += 1
            rate = totalfiles / (time.time() - started)
            if totalfiles % 100 == 0:
                print "rate:", rate, "bak:", backoff, "err:", errors, "cnt:", totalfiles, "tmo:", timeouts
                if backoff > 0:
                    backoff -= 1
        except Exception as e:
            dsc = e.__str__()
            print e
            lock.acquire()
            outstore.emit(url, {'error': dsc})
            lock.release()
            if "timed out" in dsc: timeouts += 1
            if "name resolution" in dsc:
                print "-- name resolution overload, backing off --"
                backoff += 1
                print "backoff = ", backoff
Exemplo n.º 2
0
def download_loop():
	"""Worker-thread loop: read tab-separated (date, url, query) records
	from the shared ``infile``, download each URL (4 s timeout, 400 kB cap)
	and hand the content to ``getclaims.process_content`` via ``outstore``.
	Runs until the shared ``running`` flag is cleared.
	"""
	global outstore
	global backoff
	global errors
	global running
	global timeouts
	global totalfiles
	global started
		
#	backoff = 0
	while running:
		# Serialize access to the shared input iterator.
		lock.acquire()
		line = infile.next().split("\t")
		lock.release()
		try:
			(date,url,query) = [nlptools.trim_ends(x) for x in line]
		except:
			# NOTE(review): no 'continue' here — after a bad line the loop
			# proceeds with stale date/url/query (or NameError on the very
			# first iteration).  Likely wants to skip to the next record.
			print "badline:",line
		if backoff > 0:
			# Sleep 'backoff' seconds when name resolution has been
			# overloading (see the except-branch below).
			time.sleep(backoff)
		if url.endswith("pdf"): continue
		try:
			if not started:
				started = time.time()
			# 4-second timeout; cap each download at 400 kB.
			urlin = urllib2.urlopen(url,None,4)
			content = urlin.read(400000)
			urlin.close()
			lock.acquire()
			getclaims.process_content(outstore,date,url,query,content)
			lock.release()
			# NOTE(review): if process_content raises, lock.release() is
			# skipped and the except-branch re-acquires the held lock —
			# deadlock risk.  A try/finally around the pair would fix it.
			totalfiles += 1
			rate = totalfiles/(time.time()-started)
			# Print throughput stats every 100 files and relax the backoff.
			if totalfiles % 100 == 0:
				print "rate:",rate,"bak:",backoff,"err:",errors,"cnt:",totalfiles,"tmo:",timeouts
				if backoff > 0:
					backoff -= 1
		except Exception as e:
			dsc = e.__str__()
			print e
			# Record the failure against the URL in the output store.
			# NOTE(review): 'errors' is printed in the stats line above but
			# is never incremented anywhere in this block.
			lock.acquire()
			outstore.emit(url,{'error':dsc})
			lock.release()
			if "timed out" in dsc: timeouts+=1
			if "name resolution" in dsc: 
				print "-- name resolution overload, backing off --"
				backoff +=1
				print "backoff = ",backoff
Exemplo n.º 3
0
def process_urls_from_file(filename,outname):	
	outstore = mapreduce.OutStore(outname)
	for line in file(filename):
		(date,url,query) = [nlptools.trim_ends(x) for x in line.split("\t")]
		print "url",url
		process_url(outstore,date,url,query)			
Exemplo n.º 4
0
def classify_good_both(row):
	"""Combined classifier: *row* must pass classify_good_wiki AND the
	f.is_good check on the trimmed third column.  Short-circuits exactly
	like the plain ``and`` form (falsy wiki result is returned as-is).
	"""
	wiki_verdict = classify_good_wiki(row)
	return wiki_verdict and f.is_good(nt.trim_ends(row[2]))
Exemplo n.º 5
0
def process_urls_from_file(filename, outname):
    outstore = mapreduce.OutStore(outname)
    for line in file(filename):
        (date, url, query) = [nlptools.trim_ends(x) for x in line.split("\t")]
        print "url", url
        process_url(outstore, date, url, query)