Пример #1
0
def init_task(mq, link):
    """Seed the task queue from a CSV file, then dispatch tasks forever.

    Phase 1 reads ``link`` as CSV. For each row the last column is used as
    the URL and the third column as the name. Rows whose URL has already
    been seen are skipped, as are blank rows. Every remaining row becomes
    ``etask([url, filename, 0]).get_task()`` and is put on ``mq._cqs``.
    A name that repeats gets a ``_<n>`` suffix so the filenames differ.

    Phase 2 loops without end: it takes a task from ``mq._cqs`` (waiting up
    to 10 seconds), submits it with ``paser_page.apply_async`` on the
    ``machine1`` queue and puts the returned handle on ``mq._cqx``.

    Args:
        mq: object exposing the ``_cqs`` and ``_cqx`` queues.
        link: path of the CSV file to read.

    This function does not return.
    """
    seen_urls = set()
    name_counts = {}

    # Binary mode is what the Python 2 csv module expects; the context
    # manager guarantees the file is closed once seeding is done.
    with open(link, 'rb') as handle:
        for line in csv.reader(handle):
            if not line:
                # csv.reader yields [] for blank lines; nothing to enqueue.
                continue
            url = line[-1]
            if url in seen_urls:
                continue
            seen_urls.add(url)

            filename = 'japan' + '/' + line[2]
            if filename not in name_counts:
                name_counts[filename] = 0
            else:
                name_counts[filename] += 1
                filename = filename + '_' + str(name_counts[filename])

            task = etask([url, filename, 0]).get_task()
            mq._cqs.put(task)

    while True:
        try:
            task = mq._cqs.get(True, 10)
        except Exception:
            # Nothing arrived within the timeout; keep polling.
            logger.info('...get cqs task out of time...')
            continue

        logger.info('...run a task...')
        result = paser_page.apply_async(args=[task],
                                        queue='machine1',
                                        routing_key='machine1')
        try:
            mq._cqx.put(result)
        except Exception:
            logger.error('...put cqx result error...')
Пример #2
0
    def res_handle(self, result):
        """Route one finished result to the success or failure handler.

        ``result`` is indexed with the keys ``'task'`` (which itself carries
        ``'oldinfo'``), ``'data'`` and ``'flag'``. When the flag is truthy
        the task goes to ``product_pooltask`` and the data to
        ``process_data``; otherwise the task's ``'oldinfo'`` entry is handed
        to ``process_fail``.
        """
        task = result['task']
        request = task['oldinfo']
        data = result['data']
        logger.info('flag is ' + str(result['flag']))

        if not result['flag']:
            self.process_fail(request)
            return

        self.product_pooltask(task)
        self.process_data(data)
Пример #3
0
    def process_data(self, pooldata):
        """Append one record built from ``pooldata`` to ``self.cra_d``.

        The second ``/``-separated component of ``pooldata['filename']`` is
        split on ``__`` and consecutive repeats are collapsed; the surviving
        parts are joined with ``&&`` to form the record's path. The
        ``'filename'`` key and every key whose value equals ``[]`` are then
        deleted from ``pooldata`` itself (the caller's dict is modified).
        The line written is ``<path>###<str(pooldata)>`` followed by a
        newline, and the handle is flushed right away.
        """
        segments = pooldata['filename'].split('/')[1].split('__')
        path = []
        previous = ''
        for segment in segments:
            if segment == previous:
                continue
            previous = segment
            path.append(segment)

        # Gather the keys first: deleting while iterating the dict is not
        # allowed.
        doomed = ['filename']
        doomed.extend(key for key in pooldata if pooldata[key] == [])
        for key in doomed:
            del pooldata[key]

        record = '&&'.join(path) + '###' + str(pooldata) + '\n'
        logger.info('write line to data3')
        self.cra_d.write(record)
        self.cra_d.flush()
Пример #4
0
    def load_by_mc(self, spi, url, name):
        """Request a page through an ``MC`` client cached per name prefix.

        Args:
            spi: mapping read for the keys ``refer``, ``mode``, ``spi_url``,
                ``post_type``, ``post_data``, ``post_url``, ``isproxy`` and
                ``debug``. An empty string disables the ``refer``,
                ``spi_url`` and ``isproxy`` steps.
            url: address requested when ``mode`` is not ``'post'``.
            name: the part before the first ``__`` selects the cached
                client in ``self.objmc``; a new ``MC`` is created and stored
                on first use.

        Any exception raised by the final request is logged together with
        the proxy in use and is not re-raised.

        NOTE(review): the visible body ends without a ``return`` although
        ``paser_page`` unpacks ``page, p`` from this call -- confirm whether
        the remainder of the method was lost.
        """
        refer = spi['refer']
        mode = spi['mode']
        spi_url = spi['spi_url']
        post_type = spi['post_type']
        post_data = spi['post_data']
        post_url = spi['post_url']
        isproxy = spi['isproxy']
        debug = spi['debug']

        # Bound up front so the error log below can always format it, even
        # when no proxy was requested (previously a NameError).
        p = None

        key = name.split('__')[0]
        if key in self.objmc:
            mc = self.objmc[key]
        else:
            mc = MC()
            mc.set_debug(debug)
            self.objmc[key] = mc

        if isproxy != '':
            p = get_proxy(source='citytraffic')
            if not p:
                logger.error('get proxy error ... %s', str(p))
            else:
                logger.info('this proxy is ... %s', str(p))
                mc.set_proxy(p)
        # Use the values taken from ``spi`` -- the same ones the guards
        # test -- rather than attributes of ``self``.
        if refer != '':
            mc.add_referer(refer)
        if spi_url != '':
            mc.req(mode, spi_url)

        try:
            if mode == 'post':
                page = mc.req(mode,
                              post_url,
                              paras=post_data,
                              paras_type=post_type,
                              html_flag=True)
            else:
                page = mc.req(mode, url, html_flag=True)
        except Exception:
            # format_exc() takes an optional depth limit, not the exception.
            logger.error('load by mc ...<-!error::%s!-> <-!proxy::%s!->',
                         traceback.format_exc(), str(p))
Пример #5
0
    def thread_handle(self):
        """Poll the result queue forever and hand finished results on.

        Each iteration takes one entry from ``self._cqx`` (waiting up to the
        configured result-queue timeout) and asks it for its value with a
        one-second timeout. A value that arrives is passed to
        ``res_handle``; an entry whose ``get`` raises is put back on the
        queue to be tried again later.

        Only ``Exception`` subclasses are absorbed, so ``KeyboardInterrupt``
        and ``SystemExit`` can still stop the loop. This method does not
        return.

        NOTE(review): an entry whose ``get`` raises every time is re-queued
        indefinitely -- confirm whether permanently failing entries should
        be dropped instead.
        """
        logger.info('...thread_handle start...')
        while True:
            try:
                res = self._cqx.get(True, self.__resq_timeout)
            except Exception:
                # The result queue has run empty for now.
                logger.info('result queue cqx is empty')
                continue

            try:
                ged = res.get(timeout=1)
            except Exception:
                # Not available within a second: put it back for a later
                # pass.
                try:
                    self._cqx.put(res)
                except Exception:
                    logger.error('time out task put cqx again error ...')
                continue

            self.res_handle(ged)
Пример #6
0
    def __del__(self):
        """Close the ``fail`` and ``cra_d`` handles, then log the shutdown."""
        try:
            self.fail.close()
        finally:
            # Release the second handle even if closing the first raised.
            self.cra_d.close()
        logger.info('process running down ...')
	'''
Пример #7
0
def paser_page( kwds):
	
	realtime = []
	spacetime = []
	lname = []
	sname = []
	type = []
	url_res = []
	coordinates = []
	dl = download()
	task = kwds
	key_l = {
		'lname':[],\
		'sname':[],\
		'type':[],\
		'coordinates':[],\
		'realtime':[],\
		'spacetime':[],\
		'url_res':[]\
		}

	task_url = task['info'][0]	
	path     = task['info'][1]	
	filename = path.split('/')[0]+'/'+md5(path.split('/')[1])
	city_name = path.split('/')[0]
	step     = task['info'][2]	
	exp_act  = task['exp_act'][step]
	spi_act  = task['spi_act'][step]
	temp = dl.temp
	count = 0
	flag  = True
	p = '0.0.0.0:0'
	for key, value in exp_act.items():
	    if count == 0:
		i = 0
		while key_l[key] == []:
		    i += 1
		    if i > MAX:
	    		logger.info('a task fail ::%s',str(task['info']))
			flag = False
			break
		    try:
		    	if os.path.exists(filename) and Islocal:
		    	    with open(filename,'r') as file :	
			    	page = file.read()
			else:
		    	    if spi_act['way'].lower() == 'req':
		            	page,p = dl.load_by_request(spi_act,task_url,filename)
		    	    elif spi_act['way'].lower() == 'mc':
		            	page,p = dl.load_by_mc(spi_act,task_url,filename)
		        exec(key+'='+value)
		    except Exception, e:
	    		logger.error('a error spider :: <-!error::%s!-> <-!task::%s!-> <-!proxy::%s!->',traceback.format_exc(e),str(task['info']),str(p))
		    else:
		    	logger.info('task success :: <-!task::%s!-> <-!proxy::%s!->',str(task['info']),str(p))

		    if key == 'lname':
		    	key_l[key] = lname
		    elif key == 'sname':
			key_l[key] = sname
		    elif key == 'type':
			key_l[key] = type
		    elif key == 'spacetime':
			key_l[key] = spacetime
		    elif key == 'realtime':
			key_l[key] = realtime
		    elif key == 'coordinates':
			key_l[key] = coordinates
		    elif key == 'url_res':
			key_l[key] = url_res
	    else:
		if i >= MAX:
		    break
		try:
		    exec(key+'='+value)
		except Exception, e:
	    	    logger.error('not first validate error :: <-!error::%s!-> <-!task::%s!->',traceback.format_exc(e),str(task['info']))