def init_task(mq, link):
    """Seed ``mq._cqs`` from a CSV file, then dispatch queued tasks forever.

    Phase 1 reads the CSV at ``link``. For each row the last column is used
    as the URL and column index 2 as the base name of a ``'japan/<name>'``
    file name. Rows whose URL has already been seen are skipped. Every
    remaining row becomes ``etask([url, filename, 0]).get_task()`` and is
    put on ``mq._cqs``.

    Phase 2 loops without end: it takes a task from ``mq._cqs`` (waiting up
    to 10 seconds), submits it with ``paser_page.apply_async`` on the
    ``'machine1'`` queue and puts the returned handle on ``mq._cqx``.

    Args:
        mq: object exposing the ``_cqs`` (pending tasks) and ``_cqx``
            (submitted results) queues.
        link: path of the CSV file to read.

    This function never returns.
    """
    seen_urls = set()
    name_counts = {}

    # 'rb' is kept from the original (the Python 2 csv module expects binary
    # mode). The ``with`` block closes the handle, which was previously leaked.
    with open(link, 'rb') as handle:
        for line in csv.reader(handle):
            # Blank or short rows cannot supply both line[2] and line[-1];
            # skip them instead of dying with IndexError mid-file.
            if len(line) < 3:
                continue

            url = line[-1]
            if url in seen_urls:
                continue
            seen_urls.add(url)

            filename = 'japan' + '/' + line[2]
            # First occurrence of a name gets index 0, later ones 1, 2, ...
            name_counts[filename] = name_counts.get(filename, -1) + 1
            # NOTE(review): the indentation of this statement was lost in the
            # source this was recovered from. It is applied to every row here
            # (first file becomes '<name>_0'); confirm it was not meant to
            # apply only to repeated names.
            filename = filename + '_' + str(name_counts[filename])

            task = etask([url, filename, 0]).get_task()
            mq._cqs.put(task)

    while True:
        try:
            task = mq._cqs.get(True, 10)
        except Exception:
            # Nothing became available within the 10 second wait.
            logger.info('...get cqs task out of time...')
            continue

        logger.info('...run a task...')
        result = paser_page.apply_async(
            args=[task], queue='machine1', routing_key='machine1')
        try:
            mq._cqx.put(result)
        except Exception:
            logger.error('...put cqx result error...')
def res_handle(self, result):
    """Route one finished result to the success or the failure handler.

    ``result`` is read through the keys ``'task'``, ``'data'`` and
    ``'flag'``; ``result['task']`` is additionally read through
    ``'oldinfo'``.

    When the flag is truthy the task is passed to ``product_pooltask`` and
    the data to ``process_data``. Otherwise the task's ``'oldinfo'`` entry
    is passed to ``process_fail``.
    """
    task = result['task']
    request = task['oldinfo']
    data = result['data']
    flag = result['flag']

    logger.info('flag is '+str(flag))

    if not flag:
        self.process_fail(request)
        return

    self.product_pooltask(task)
    self.process_data(data)
def process_data(self, pooldata):
    """Write one record line for ``pooldata`` to ``self.cra_d``.

    The second '/'-separated component of ``pooldata['filename']`` is split
    on ``'__'``. Consecutive repeated pieces (and a leading empty piece) are
    collapsed, and the remainder is joined with ``'&&'`` to form the path
    part of the line.

    ``pooldata`` is modified in place: the ``'filename'`` key and every key
    whose value equals ``[]`` are deleted before the dict is stringified.

    The emitted line is ``<path>###<str(pooldata)>\\n`` and the handle is
    flushed right after the write.
    """
    pieces = pooldata['filename'].split('/')[1].split('__')

    # Keep a piece only when it differs from the one kept just before it.
    path = []
    previous = ''
    for piece in pieces:
        if piece == previous:
            continue
        previous = piece
        path.append(piece)

    # Collect the keys first so the dict is not resized while iterating it.
    doomed = ['filename'] + [key for key in pooldata if pooldata[key] == []]
    for key in doomed:
        del pooldata[key]

    joined_path = '&&'.join(path)
    tmp = joined_path+'###'+str(pooldata)+'\n'

    logger.info('write line to data3')
    self.cra_d.write(tmp)
    self.cra_d.flush()
def load_by_mc(self, spi, url, name):
    """Fetch a page through an ``MC`` client cached per name prefix.

    The client is looked up in ``self.objmc`` under the part of ``name``
    before the first ``'__'``. A new ``MC()`` is created, given the debug
    setting and cached when none exists yet.

    Args:
        spi: mapping read through the keys ``'refer'``, ``'mode'``,
            ``'spi_url'``, ``'post_type'``, ``'post_data'``, ``'post_url'``,
            ``'isproxy'`` and ``'debug'``.
        url: address requested when ``spi['mode']`` is not ``'post'``.
        name: task name; its prefix before ``'__'`` selects the client.

    Returns:
        A ``(page, p)`` tuple. ``page`` is whatever ``mc.req`` returned, or
        None when the request raised (the error is logged, not re-raised).
        ``p`` is the value obtained from ``get_proxy``, or None when
        ``spi['isproxy']`` is the empty string.
    """
    refer = spi['refer']
    mode = spi['mode']
    spi_url = spi['spi_url']
    post_type = spi['post_type']
    post_data = spi['post_data']
    post_url = spi['post_url']
    isproxy = spi['isproxy']
    debug = spi['debug']

    key = name.split('__')[0]
    if key in self.objmc:
        mc = self.objmc[key]
    else:
        mc = MC()
        mc.set_debug(debug)
        self.objmc[key] = mc

    # Bound up front so the error log and the return value below never hit
    # an unbound name when no proxy is requested.
    p = None
    if isproxy != '':
        p = get_proxy(source='citytraffic')
        if not p:
            logger.error('get proxy error ... %s', str(p))
        else:
            logger.info('this proxy is ... %s', str(p))
            mc.set_proxy(p)

    # NOTE(review): the two guards below test the values taken from ``spi``
    # but the calls use ``self.refer`` / ``self.mode`` / ``self.spi_url``.
    # Left unchanged; confirm whether the ``spi`` values were intended.
    if refer != '':
        mc.add_referer(self.refer)
    if spi_url != '':
        mc.req(self.mode, self.spi_url)

    page = None
    try:
        if mode == 'post':
            page = mc.req(mode, post_url, paras=post_data,
                          paras_type=post_type, html_flag=True)
        else:
            page = mc.req(mode, url, html_flag=True)
    except Exception:
        # format_exc() takes an optional frame limit, not the exception;
        # it formats the exception currently being handled.
        logger.error('load by mc ...<-!error::%s!-> <-!proxy::%s!->',
                     traceback.format_exc(), str(p))

    return page, p
def thread_handle(self):
    """Drain ``self._cqx`` forever, handing finished results to ``res_handle``.

    Each iteration takes one pending result handle from ``self._cqx``,
    waiting up to ``self.__resq_timeout``, and asks it for its value with a
    one second timeout. A value that arrives is passed to
    ``self.res_handle``. A handle whose ``get`` raises is put back on the
    queue to be retried later.

    This method never returns.
    """
    logger.info('...thread_handle start...')
    while True:
        try:
            res = self._cqx.get(True, self.__resq_timeout)
        except Exception:
            # The result queue has run empty for now.
            logger.info('result queue cqx is empty')
            continue

        try:
            ged = res.get(timeout=1)
        except Exception:
            # ``except Exception`` (not a bare ``except``) so SystemExit and
            # KeyboardInterrupt are no longer swallowed by this endless loop.
            # NOTE(review): any error from ``res.get`` re-queues the handle,
            # not only a timeout; if ``get`` re-raises a task's own failure
            # the handle would be retried forever. Confirm against the
            # result type's API.
            try:
                self._cqx.put(res)
            except Exception:
                logger.error('time out task put cqx again error ...')
            continue

        self.res_handle(ged)
def __del__(self): self.fail.close() self.cra_d.close() logger.info('process running down ...') '''
def paser_page( kwds): realtime = [] spacetime = [] lname = [] sname = [] type = [] url_res = [] coordinates = [] dl = download() task = kwds key_l = { 'lname':[],\ 'sname':[],\ 'type':[],\ 'coordinates':[],\ 'realtime':[],\ 'spacetime':[],\ 'url_res':[]\ } task_url = task['info'][0] path = task['info'][1] filename = path.split('/')[0]+'/'+md5(path.split('/')[1]) city_name = path.split('/')[0] step = task['info'][2] exp_act = task['exp_act'][step] spi_act = task['spi_act'][step] temp = dl.temp count = 0 flag = True p = '0.0.0.0:0' for key, value in exp_act.items(): if count == 0: i = 0 while key_l[key] == []: i += 1 if i > MAX: logger.info('a task fail ::%s',str(task['info'])) flag = False break try: if os.path.exists(filename) and Islocal: with open(filename,'r') as file : page = file.read() else: if spi_act['way'].lower() == 'req': page,p = dl.load_by_request(spi_act,task_url,filename) elif spi_act['way'].lower() == 'mc': page,p = dl.load_by_mc(spi_act,task_url,filename) exec(key+'='+value) except Exception, e: logger.error('a error spider :: <-!error::%s!-> <-!task::%s!-> <-!proxy::%s!->',traceback.format_exc(e),str(task['info']),str(p)) else: logger.info('task success :: <-!task::%s!-> <-!proxy::%s!->',str(task['info']),str(p)) if key == 'lname': key_l[key] = lname elif key == 'sname': key_l[key] = sname elif key == 'type': key_l[key] = type elif key == 'spacetime': key_l[key] = spacetime elif key == 'realtime': key_l[key] = realtime elif key == 'coordinates': key_l[key] = coordinates elif key == 'url_res': key_l[key] = url_res else: if i >= MAX: break try: exec(key+'='+value) except Exception, e: logger.error('not first validate error :: <-!error::%s!-> <-!task::%s!->',traceback.format_exc(e),str(task['info']))