def run(self, params):
    url = params['url']
    flag = params['flag']
    print(url)
    try:
        ps = crawl(url)
        if len(ps) > 2000:
            uid = store2pg(ps=ps, url=url, flag=flag)
        else:
            uid = None
        if uid:
            urls = re.compile(
                '<a target="_blank" href="(http://www.feelcars.com/.*?html)">',
                re.S).findall(ps)
            for u in urls:
                url = u
                log.info('Enqueueing to jz_qckj_pagesource')
                queue_job('main_qichetansuowang.Crawler2', {
                    'url': url,
                    'flag': flag
                }, queue='jz_qckj_pagesource')
    except Exception as e:
        print(e)
        print('Requeueing')
        log.info('Enqueueing to jz_qckj_pagesource')
        queue_job('main_qichetansuowang.Crawler1', {
            'url': url,
            'flag': flag
        }, queue='jz_qckj_pagesource')

def run(self, params): log.info("Getting context info...") return { "job_id": get_current_job().id, "worker_id": get_current_worker().id, "config": get_current_config() }
def run(self, params):
    url = params['url']
    flag = params['flag']
    print(url)
    try:
        ps = crawl(url)
        if len(str(ps)) > 500:
            uid = store2pg(ps=ps, url=url, flag=flag)
        else:
            uid = None
        if uid:
            urls = list(
                re.compile(
                    '<p class="title">.*?<a href="(http:.*?)" target="_blank">',
                    re.S).findall(ps))
            for u in urls:
                url1 = u
                log.info('Enqueueing to jz_cj_pagesource')
                queue_job('main_dongfangcaifu.Crawler2', {
                    'url': url1,
                    'flag': flag
                }, queue='jz_cj_pagesource')
    except Exception as e:
        print(e)
        print('Requeueing')
        log.info('Enqueueing to jz_cj_pagesource')
        queue_job('main_dongfangcaifu.Crawler1', {
            'url': url,
            'flag': flag
        }, queue='jz_cj_pagesource')

def run(self, params):
    url = params['url']
    flag = params['flag']
    print(url)
    try:
        ps = crawl(url)
        print(len(ps))
        if len(ps) > 100:
            uid = store2pg(ps=ps, url=url, flag=flag)
        else:
            uid = None
        if uid:
            urls = re.findall('<a class="newsLink" href="(.*?)">', ps)
            for u in urls:
                url = 'https://www.lynkco.com.cn' + u
                log.info('Enqueueing to jz_qymh_pagesource')
                queue_job('main_lingkeqiche.Crawler2', {
                    'url': url,
                    'flag': flag
                }, queue='jz_qymh_pagesource')
    except Exception as e:
        print(e)
        print('Requeueing')
        log.info('Enqueueing to jz_qymh_pagesource')
        queue_job('main_lingkeqiche.Crawler1', {
            'url': url,
            'flag': flag
        }, queue='jz_qymh_pagesource')

def run(self, params): redis_key_started = Queue.redis_key_started() stats = {"fetched": 0, "requeued": 0} # Fetch all the jobs started more than a minute ago - they should not # be in redis:started anymore job_ids = connections.redis.zrangebyscore(redis_key_started, "-inf", time.time() - params.get("timeout", 60)) # TODO this should be wrapped inside Queue or Worker # we shouldn't access these internals here queue_obj = Queue("default") unserialized_job_ids = queue_obj.unserialize_job_ids(job_ids) for i, job_id in enumerate(job_ids): queue = Job(unserialized_job_ids[i], start=False, fetch=False).fetch(full_data=True).data["queue"] queue_obj = Queue(queue) stats["fetched"] += 1 log.info("Requeueing %s on %s" % (unserialized_job_ids[i], queue)) # TODO LUA script & don't rpush if not in zset anymore. with connections.redis.pipeline(transaction=True) as pipeline: pipeline.zrem(redis_key_started, job_id) pipeline.rpush(queue_obj.redis_key, job_id) pipeline.execute() stats["requeued"] += 1 return stats
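# Cleaning tasks like the one above are usually run on a timer rather than
# queued by hand. A minimal sketch of wiring it into an MRQ config file --
# the "mrq.basetasks.cleaning.RequeueStartedJobs" path and the
# SCHEDULER_TASKS key are assumptions and should be verified against the
# MRQ version in use:
#
# SCHEDULER_TASKS = [
#     {
#         "path": "mrq.basetasks.cleaning.RequeueStartedJobs",
#         "params": {"timeout": 60},
#         "interval": 60
#     }
# ]
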
def main():

    parser = argparse.ArgumentParser(description='Runs a task')

    cfg = config.get_config(parser=parser, config_type="run")
    cfg["is_cli"] = True
    set_current_config(cfg)
    log.info(cfg)

    if len(cfg["taskargs"]) == 1:
        params = json.loads(cfg["taskargs"][0])
    else:
        params = {}

        # mrq-run taskpath a 1 b 2 => {"a": "1", "b": "2"}
        for group in utils.group_iter(cfg["taskargs"], n=2):
            if len(group) != 2:
                print("Number of arguments wasn't even")
                sys.exit(1)
            params[group[0]] = group[1]

    if cfg["async"]:
        ret = queue.send_task(
            cfg["taskpath"], params, sync=False, queue=cfg["queue"])
        print(ret)
    else:
        worker_class = load_class_by_path(cfg["worker_class"])
        job = worker_class.job_class(None)
        job.data = {
            "path": cfg["taskpath"],
            "params": params,
            "queue": cfg["queue"]
        }
        job.datestarted = datetime.datetime.utcnow()
        set_current_job(job)
        ret = job.perform()
        print(json.dumps(ret))

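# Usage sketch for the CLI entry point above (the "tasks.Add" path is
# illustrative). taskargs are either a single JSON blob or alternating
# key/value pairs; --async queues the job instead of running it inline:
#
#   mrq-run tasks.Add '{"a": 1, "b": 2}'
#   mrq-run tasks.Add a 1 b 2              # => params {"a": "1", "b": "2"}
#   mrq-run --async --queue default tasks.Add a 1 b 2
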
def run(self, params): log.info("I/O starting") ret = self._run(params) log.info("I/O finished") return ret
def run(self, params):
    self.collection = connections.mongodb_jobs.mrq_jobs

    redis_key_started = Queue.redis_key_started()

    stats = {
        "fetched": 0,
        "requeued": 0
    }

    # Fetch all the jobs started more than a minute ago - they should not
    # be in redis:started anymore
    job_ids = connections.redis.zrangebyscore(
        redis_key_started, "-inf", time.time() - params.get("timeout", 60))

    for job_id in job_ids:

        queue = Job(job_id, start=False, fetch=False).fetch(
            full_data=True).data["queue"]

        stats["fetched"] += 1

        log.info("Requeueing %s on %s" % (job_id, queue))

        # TODO LUA script & don't rpush if not in zset anymore.
        with connections.redis.pipeline(transaction=True) as pipeline:
            pipeline.zrem(redis_key_started, job_id)
            pipeline.rpush(Queue(queue).redis_key, job_id)
            pipeline.execute()

        stats["requeued"] += 1

    return stats

def run(self, params):
    url = params['url']
    flag = params['flag']
    print(url)
    try:
        ps = crawl(url)
        print(len(ps))
        if len(ps) > 10:
            uid = store2pg(ps=ps, url=url, flag=flag)
        else:
            uid = None
        if uid:
            urls = re.findall('"supdata_whereid":"(.*?)"', ps)
            for u in urls:
                url = 'http://www.changan.com.cn/news-details.shtml?whereid=%s&column_id=98' % u
                log.info('Enqueueing to jz_qymh_pagesource')
                queue_job('main_changanqiche.Crawler2', {
                    'url': url,
                    'flag': flag
                }, queue='jz_qymh_pagesource')
    except Exception as e:
        print(e)
        print('Requeueing')
        log.info('Enqueueing to jz_qymh_pagesource')
        queue_job('main_changanqiche.Crawler1', {
            'url': url,
            'flag': flag
        }, queue='jz_qymh_pagesource')

def run(self, params):
    url = params['url']
    flag = params['flag']
    try:
        info = sess1.query(Jz_dongfangcaifu_content).filter_by(
            url=url, channel_name=flag).first()
        print(1)
        info_2 = sess.query(Jz_dongfangcaifu_PageSource).filter_by(
            url=url).first()
        print(2)
        if not info and info_2:
            ps = info_2.pagesource
            ps_uid = info_2.uid

            author = re.findall('data-source="(.*?)">', ps)
            author = author[0] if author else None

            public_time = re.findall('<div class="time">(.*?)</div>', ps)
            public_time = public_time[0].replace('年', '-').replace(
                '月', '-').replace('日', '') if public_time else None

            content = re.compile('<!--文章主体-->(.*?)<!--原文标题-->',
                                 re.S).findall(ps)
            content2 = content[0] if content else None
            if content2:
                pic = re.findall('<img src="(https.*?)"', content2)
                pic = ';'.join(pic)
                content2 = content2.replace('<img src', '[img src').replace(
                    '" />', '" /]').replace('</p>', '\n')
            else:
                pic = ''

            # Note: content2 may still be None here; the resulting
            # AttributeError is filtered out in the except block below.
            content = re.sub('<.*?>', '', content2.replace(' ', '')).replace(
                '\t', '').replace(' ', '').replace('\u3000', '').replace(
                    '\n\n', '\n').replace(
                        '本文版权为电动汽车网-电动邦所有,欢迎转载但请务必注明来源。', '').strip()

            title = re.findall('<h1>(.*?)</h1>', ps)[0]
            # tag = re.findall('<a class="fn-left".*?target="_blank">(.*?)</a>', ps)
            # tag = ' '.join(tag)

            hid = store2pg_parse(url=url,
                                 author=author,
                                 public_time=public_time,
                                 page_source=ps_uid,
                                 content=content,
                                 website_name='东方财富网',
                                 channel_name=flag,
                                 title=title,
                                 topic=None,
                                 tag=None,
                                 meta_keywords=None,
                                 pic=pic,
                                 flag=None)
            if hid:
                print('Done')
            else:
                print('Parsed news already exists')
    except Exception as e:
        print(e)
        # Compare against the string form of the exception; comparing the
        # exception object itself to a string would never match.
        if str(e) != "'NoneType' object has no attribute 'replace'":
            print('Requeueing')
            log.info('Enqueueing to jz_cj_parse')

def run(self, params):

    # If there are more than this many items on the queue, we don't try to
    # check if our mongodb jobs are still queued.
    max_queue_items = params.get("max_queue_items", 1000)

    stats = {"fetched": 0, "requeued": 0}

    all_queues = Queue.all_known()

    for queue_name in all_queues:

        queue = Queue(queue_name)

        queue_size = queue.size()

        if queue.is_raw:
            continue

        log.info("Checking queue %s" % queue_name)

        if queue_size > max_queue_items:
            log.info("Stopping because queue %s has %s items" %
                     (queue_name, queue_size))
            continue

        queue_jobs_ids = set(queue.list_job_ids(limit=max_queue_items + 1))

        if len(queue_jobs_ids) >= max_queue_items:
            log.info(
                "Stopping because queue %s actually had more than %s items" %
                (queue_name, len(queue_jobs_ids)))
            continue

        for job_data in connections.mongodb_jobs.mrq_jobs.find(
                {
                    "queue": queue_name,
                    "status": "queued"
                },
                projection={"_id": 1}).sort([["_id", 1]]):

            stats["fetched"] += 1

            if str(job_data["_id"]) in queue_jobs_ids:
                log.info("Found job %s on queue %s. Stopping" %
                         (job_data["_id"], queue.id))
                break

            # At this point, this job is not on the queue and we're sure
            # the queue is less than max_queue_items
            # We can safely requeue the job.
            log.info("Requeueing %s on %s" % (job_data["_id"], queue.id))

            stats["requeued"] += 1

            job = Job(job_data["_id"])
            job.requeue(queue=queue_name)

    return stats

def run(self, params): log.info("adding", params) res = params.get("a", 0) + params.get("b", 0) if params.get("sleep", 0): log.info("sleeping", params.get("sleep", 0)) time.sleep(params.get("sleep", 0)) return res
def run(self, params): log.info("pipeline..........") # {'processor': item.processor, 'request': item.request} pipeline = params.get('pipeline', None) result = params.get('result', None) if pipeline is not None: clazz = PIPEINE_MAP.get(pipeline) clazz().process_item(result) log.info('--------------------complete')
def run(self, params): log.info("adding", params) res = params.get("a", 0) + params.get("b", 0) if params.get("sleep", 0): log.info("sleeping %d", params.get("sleep", 0)) time.sleep(params.get("sleep", 0)) return res
def run(self, params): # If there are more than this much items on the queue, we don't try to check if our mongodb # jobs are still queued. max_queue_items = params.get("max_queue_items", 1000) stats = { "fetched": 0, "requeued": 0 } all_queues = Queue.all_known() for queue_name in all_queues: queue = Queue(queue_name) queue_size = queue.size() if queue.is_raw: continue log.info("Checking queue %s" % queue_name) if queue_size > max_queue_items: log.info("Stopping because queue %s has %s items" % (queue_name, queue_size)) continue queue_jobs_ids = set(queue.list_job_ids(limit=max_queue_items + 1)) if len(queue_jobs_ids) >= max_queue_items: log.info( "Stopping because queue %s actually had more than %s items" % (queue_name, len(queue_jobs_ids))) continue for job_data in connections.mongodb_jobs.mrq_jobs.find({ "queue": queue_name, "status": "queued" }, projection={"_id": 1}).sort([["_id", 1]]): stats["fetched"] += 1 if str(job_data["_id"]) in queue_jobs_ids: log.info("Found job %s on queue %s. Stopping" % (job_data["_id"], queue.id)) break # At this point, this job is not on the queue and we're sure # the queue is less than max_queue_items # We can safely requeue the job. log.info("Requeueing %s on %s" % (job_data["_id"], queue.id)) stats["requeued"] += 1 job = Job(job_data["_id"]) job.requeue(queue=queue_name) return stats
def run(self, params): log.info("Will abort this task") connections.mongodb_jobs.tests_inserts.insert(params) try: raise InAbortException except InAbortException: abort_current_job() raise Exception("Should not be reached")
def run(self, params): log.info("Retrying in %s on %s" % (params.get("delay"), params.get("queue"))) connections.mongodb_jobs.tests_inserts.insert(params) retry_current_job(queue=params.get("queue"), delay=params.get("delay"), max_retries=params.get("max_retries")) raise Exception("Should not be reached")
def run(self, params): log.info("Retrying in %s on %s" % (params.get("countdown"), params.get("queue"))) connections.mongodb_logs.tests_inserts.insert(params) if params.get("cancel_on_retry"): self.cancel_on_retry = params.get("cancel_on_retry") retry_current_job(queue=params.get("queue"), countdown=params.get("countdown")) raise Exception("Should not be reached")
def run(self, params): log.info("crawl..........%s") # {'processor': item.processor, 'request': item.request} processor = params.get('processor', None) request = params.get('request', None) if processor is not None: clazz = load_class('processors', processor) processor_instance = clazz() if request is not None: request = request_from_dict(request, processor_instance) # print(request) processor_instance.set_start_requests([request]) SpiderCore(processor_instance, time_sleep=1).start() log.info('****************complete')
def run(self, params): key_name = params["key_name"] log.info("Opening file to extract exif for %s", key_name) #Use exifread libary to extract exif data from image file f = open(key_name) exif_data = process_file(f, details=False) log.info("Extracted exif data") #Delete the file f.close() os.remove(key_name) #Only extract data needed from libary call to store in database tags = {} for field_name in exif_data: field = exif_data[field_name] tags[field_name] = { 'printable': str(field), 'tag': field.tag, 'field_type': field.field_type, 'field_length': field.field_length, 'values': str(field.values) } #Store dictionary of tags into mongodb instance log.info("Inserting tags into db") exif_store.insert_one(tags) log.info("Successfully inserted tags into db")
def run(self, params): log.info("Retrying in %s on %s" % (params.get("delay"), params.get("queue"))) connections.mongodb_jobs.tests_inserts.insert(params) retry_current_job( queue=params.get("queue"), delay=params.get("delay"), max_retries=params.get("max_retries") ) raise Exception("Should not be reached")
def run(self, params):
    # On initialization, use a page depth of 30;
    # for the daily incremental run, one page is enough.
    url_s1 = ('http://www.feelcars.com/category/xinnengyuan/page/%s', '新能源')
    end = 4
    for i in range(1, end):
        url = url_s1[0] % str(i)
        log.info('Enqueueing to jz_qckj_pagesource')
        queue_job('main_qichetansuowang.Crawler1', {
            'url': url,
            'flag': url_s1[-1]
        }, queue='jz_qckj_pagesource')

def run(self, params):
    url = params.get('ext', '').strip()
    topic = params.get('topic', '').strip()
    message = params.get('message', '').strip()

    ext_url = urlencode({'topic': topic, 'message': message})
    if '?' not in url:
        url = url + '?' + ext_url
    else:
        url = url + ext_url if url.endswith('&') else url + '&' + ext_url

    log.info('HTTP GET %s' % (url, ))
    res = requests.get(url)
    if res.ok:
        return res.content
    else:
        retry_current_job()

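# Usage sketch for the HTTP-GET forwarding task above (the "tasks.HttpGet"
# path is hypothetical): 'ext' is the base URL, and topic/message are
# URL-encoded onto the query string.
#
#   queue_job("tasks.HttpGet", {
#       "ext": "http://example.com/hook",
#       "topic": "proxy_pool",
#       "message": "refresh"
#   }, queue="default")
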
def run(self, params):
    host = params.get('h', '').strip()
    port = params.get('p', 80)
    hkey = '%s:%d' % (host, port)

    if _check_t(params):
        abort_current_job()

    connections.redis.sadd(CONF_DATA_ALL_KEY, hkey)
    tmp = connections.redis.hget(CONF_DATA_RANK_KEY, hkey)
    now_num = int(tmp) if tmp else 0
    if not now_num:
        connections.redis.hincrby(CONF_DATA_RANK_KEY, hkey, 1)

    test = CONF_CHECK_PROXY_FUNC(host, port)
    test and log.info('CHECK OK proxy:%s, num:%d' % (hkey, now_num))

    if test:
        if now_num <= 0:
            now_num = 1 if connections.redis.sismember(
                CONF_DATA_OK_KEY, hkey) else 10
            connections.redis.hset(CONF_DATA_RANK_KEY, hkey, now_num)
        elif 0 < now_num < 20:
            connections.redis.hincrby(CONF_DATA_RANK_KEY, hkey, 1)
            now_num += 1
    else:
        if now_num >= -10:
            connections.redis.hincrby(CONF_DATA_RANK_KEY, hkey, -1)
            now_num -= 1

    if now_num > 0:
        connections.redis.sadd(CONF_DATA_OK_KEY, hkey)
    else:
        connections.redis.srem(CONF_DATA_OK_KEY, hkey)
        now_num <= -10 and connections.redis.srem(CONF_DATA_ALL_KEY, hkey)

    return {'proxy': hkey, 'num': now_num, 'test': test}

def run(self, params):
    params1 = (
        'https://www.lynkco.com.cn/Brand/News/NewsMore?pageIndex=%s', '新闻潮讯')
    # On initialization, use a page depth of 30
    for u in [params1]:
        if u[-1] == '新闻潮讯':
            end = 8
            for i in range(1, end):
                url = u[0] % str(i)
                log.info('Enqueueing to jz_qymh_pagesource')
                queue_job('main_lingkeqiche.Crawler1', {
                    'url': url,
                    'flag': u[-1]
                }, queue='jz_qymh_pagesource')

def run(self, params): key_name = params["key_name"] _, extension = os.path.splitext(key_name) if (extension == ".jpg"): conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY) bucket = conn.get_bucket(params["bucket_name"]) key = bucket.get_key(key_name) key.get_contents_to_filename(key_name) log.info("Succesfully downloaded file from s3 bucket %s", key_name) queue_job("tasks.Write", {"key_name": key_name}, queue=write_queue) else: #TODO handle compressed and other file types log.warn( "Currently unable to handle file extension type for file %s", key_name) os.remove(key_name)
def run(self, params):
    params1 = (
        'https://www.changan.com.cn/news-changan?page=%s&year=%s&keyword=&type=0&ajax_req=1&t=1584689024944',
        '长安动态')
    params2 = ('http://www.changan.com.cn/company.shtml', '合资合作')
    # On initialization, use a page depth of 30
    for u in [params1]:
        if u[-1] == '长安动态':
            year = datetime.datetime.now().strftime('%Y')
            # Once a week, one page per run (8 articles)
            for page in range(1, 2):
                url = u[0] % (str(page), str(year))
                log.info('Enqueueing to jz_qymh_pagesource')
                queue_job('main_changanqiche.Crawler1', {
                    'url': url,
                    'flag': u[-1]
                }, queue='jz_qymh_pagesource')

def crawl(url):
    '''Fetch the page source for a URL.'''
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # 'Host': 'www.caam.org.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
        # 'Cookie': '__xwaf_id=e52bf9be294d90397354ab6d12a689eaa6fbbfae12e1338e0bac7e8b2b179e78; __xwaf_browser_auth=BkPxoimjrj8Xi8GRntg8Lw==; __xwaf_filter_key=57fdc8dd987c1627'
    }
    session = requests.session()
    try:
        ipusing = get_proxy_redis()
        ipusing = str(ipusing, encoding='utf-8')
        # _proxy = {'http': 'http://%s' % ipusing, 'https': 'https://%s' % ipusing}
        log.info('now using %s' % ipusing)
        data = session.get(url, headers=header, timeout=30)
        # Print the status code for this URL
        print("%s's status_code is %s" % (url, data.status_code))
        if data.status_code == 200:
            data.encoding = data.apparent_encoding
            pageSource = data.text
            data.close()
            return pageSource
        elif data.status_code == 404:
            return '404'
    except Exception:
        pass
    finally:
        time.sleep(random.uniform(0, 2))

def run(self, params):
    # On initialization, use a page depth of 30;
    # for the daily incremental run, one page is enough.
    url_s1 = ('http://finance.eastmoney.com/news/cjjsp_%s.html', '经济时评')
    url_s2 = ('http://finance.eastmoney.com/news/cgnjj_%s.html', '国内经济')
    url_s3 = ('http://finance.eastmoney.com/news/cgjjj_%s.html', '国际经济')
    # for i in range(1, 4):
    for i in range(1, 26):
        url1 = url_s1[0] % str(i)
        log.info('Enqueueing to jz_cj_pagesource')
        queue_job('main_dongfangcaifu.Crawler1', {
            'url': url1,
            'flag': url_s1[-1]
        }, queue='jz_cj_pagesource')
        url2 = url_s2[0] % str(i)
        log.info('Enqueueing to jz_cj_pagesource')
        queue_job('main_dongfangcaifu.Crawler1', {
            'url': url2,
            'flag': url_s2[-1]
        }, queue='jz_cj_pagesource')
        url3 = url_s3[0] % str(i)
        log.info('Enqueueing to jz_cj_pagesource')
        queue_job('main_dongfangcaifu.Crawler1', {
            'url': url3,
            'flag': url_s3[-1]
        }, queue='jz_cj_pagesource')

def run(self, params):

    collection = connections.mongodb_jobs.mrq_jobs

    # If there are more than this many items on the queue, we don't try to
    # check if our mongodb jobs are still queued.
    max_queue_items = params.get("max_queue_items", 1000)

    stats = {"fetched": 0, "requeued": 0}

    for job_data in collection.find({
            "status": "queued"
    }, fields={
            "_id": 1,
            "queue": 1
    }).sort([("_id", 1)]):

        stats["fetched"] += 1

        queue = Queue(job_data["queue"])

        queue_size = queue.size()
        if queue_size > max_queue_items:
            log.info("Stopping because queue %s has %s items" %
                     (queue, queue_size))
            break

        queue_jobs_ids = set(queue.list_job_ids(limit=max_queue_items + 1))

        if len(queue_jobs_ids) >= max_queue_items:
            log.info(
                "Stopping because queue %s actually had more than %s items" %
                (queue, len(queue_jobs_ids)))
            break

        if str(job_data["_id"]) in queue_jobs_ids:
            log.info("Stopping because we found job %s in redis" %
                     job_data["_id"])
            break

        # At this point, this job is not on the queue and we're sure
        # the queue is less than max_queue_items
        # We can safely requeue the job.
        log.info("Requeueing %s on %s" % (job_data["_id"], queue.id))

        stats["requeued"] += 1

        job = Job(job_data["_id"])
        job.requeue(queue=job_data["queue"])

    return stats

def run(self, params): redis_key_started = Queue.redis_key_started() stats = { "fetched": 0, "requeued": 0 } # Fetch all the jobs started more than a minute ago - they should not # be in redis:started anymore job_ids = connections.redis.zrangebyscore( redis_key_started, "-inf", time.time() - params.get("timeout", 60)) # TODO this should be wrapped inside Queue or Worker # we shouldn't access these internals here queue_obj = Queue("default") unserialized_job_ids = queue_obj.unserialize_job_ids(job_ids) for i, job_id in enumerate(job_ids): queue = Job(unserialized_job_ids[i], start=False, fetch=False).fetch( full_data=True).data["queue"] queue_obj = Queue(queue) stats["fetched"] += 1 log.info("Requeueing %s on %s" % (unserialized_job_ids[i], queue)) # TODO LUA script & don't rpush if not in zset anymore. with connections.redis.pipeline(transaction=True) as pipeline: pipeline.zrem(redis_key_started, job_id) pipeline.rpush(queue_obj.redis_key, job_id) pipeline.execute() stats["requeued"] += 1 return stats
def crawl(url):
    '''Fetch the page source for a URL.'''
    header = get_header()
    session = requests.session()
    try:
        ipusing = get_proxy_redis()
        ipusing = str(ipusing, encoding='utf-8')
        # _proxy = {'http': 'http://%s' % ipusing, 'https': 'https://%s' % ipusing}
        log.info('now using %s' % ipusing)
        data = session.get(url, headers=header, timeout=30)
        # Print the status code for this URL
        print("%s's status_code is %s" % (url, data.status_code))
        if data.status_code == 200:
            data.encoding = data.apparent_encoding
            pageSource = data.text
            data.close()
            return pageSource
        elif data.status_code == 404:
            return 404
    except Exception:
        pass
    finally:
        time.sleep(random.uniform(0, 2))

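# get_header() is not defined alongside this crawl() variant. A minimal
# sketch of the helper, assuming it returns a headers dict like the inline
# one used by the other crawl() above (field values are assumptions):
def get_header():
    '''Hypothetical helper: default request headers for crawl().'''
    return {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.81 Safari/537.36',
    }
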
def run(self, params):
    url = params['url']
    flag = params['flag']
    try:
        info = sess.query(Jz_qichetansuowang_PageSource).filter_by(
            url=url).first()
        sess.rollback()
        if not info:
            ps = crawl(url)
            if len(str(ps)) > 2000:
                uid = store2pg(ps=ps, url=url, flag=flag)
            else:
                uid = None
            if uid:
                log.info('Enqueueing to jz_qckj_parse')
                queue_job('main_qichetansuowang.Parse', {
                    'url': url,
                    'flag': flag
                }, queue='jz_qckj_parse')
        else:
            print('News already exists; sending to parse')
            queue_job('main_qichetansuowang.Parse', {
                'url': url,
                'flag': flag
            }, queue='jz_qckj_parse')
    except Exception as e:
        print(e)
        print('Requeueing')
        log.info('Enqueueing to jz_qckj_pagesource')
        queue_job('main_qichetansuowang.Crawler2', {
            'url': url,
            'flag': flag
        }, queue='jz_qckj_pagesource')

def run(self, params):

    # Some systems may be configured like this.
    if not PY3 and params.get("utf8_sys_stdout"):
        import codecs
        import sys
        UTF8Writer = codecs.getwriter('utf8')
        sys.stdout = UTF8Writer(sys.stdout)

    if params["class_name"] == "unicode":
        log.info(u"caf\xe9")
    elif params["class_name"] == "string":
        log.info("cafe")
    elif params["class_name"] == "latin-1":
        log.info("caf\xe9")
    elif params["class_name"] == "bytes1":
        log.info("Mat\xc3\xa9riels d'entra\xc3\xaenement")

    return True

def run(self, params): # Some systems may be configured like this. if params.get("utf8_sys_stdout"): import codecs import sys UTF8Writer = codecs.getwriter('utf8') sys.stdout = UTF8Writer(sys.stdout) if params["class_name"] == "unicode": log.info(u"caf\xe9") elif params["class_name"] == "string": log.info("cafe") elif params["class_name"] == "latin-1": log.info("caf\xe9") elif params["class_name"] == "bytes1": log.info("Mat\xc3\xa9riels d'entra\xc3\xaenement") return True
def run(self, params):
    filename = params.get('f', '').strip()
    if _check_t(params):
        abort_current_job()

    timer_num = 3
    timer_seq = CONF_CHECK_INTERVAL

    gql = pyfile.load_str(filename).strip()
    if not gql:
        abort_current_job()

    proxy_list, gret = run_gdom_page(
        gql, get_proxy=lambda: connections.redis.srandmember(CONF_DATA_OK_KEY))
    proxy_list and log.info('FETCH OK filename:%s, num:%d' %
                            (filename, len(proxy_list)))
    if gret.errors:
        log.error('FETCH ERROR filename:%s, errors:%s' %
                  (filename, gret.errors))
    if not proxy_list:
        log.error('FETCH EMPTY filename:%s, gret:%r' % (filename, gret))
        abort_current_job()

    timestamp = int(time.time())
    task_map = {}
    for proxy_str in proxy_list:
        host = proxy_str.split(':', 1)[0]
        port = int(proxy_str.split(':', 1)[1])
        for t_idx in range(timer_num):
            next_tick = timestamp + pyutils.crc32_mod(
                proxy_str, timer_seq) + t_idx * timer_seq
            rawparam = '%s#%d#%d#%d' % (host, port, timer_seq,
                                        int(next_tick / timer_seq))
            task_map.setdefault(rawparam, next_tick)

    queue_raw_jobs('check_proxy_timed_set', task_map)

    return {
        'file': filename,
        'num': len(proxy_list),
        'proxy_list': proxy_list
    }

def run(self, params):
    url = params['url']
    flag = params['flag']
    try:
        info = sess.query(Jz_dongfangcaifu_PageSource).filter_by(
            url=url).first()
        sess.rollback()
        if not info:
            ps = crawl(url)
            if len(str(ps)) > 500 and '返回' not in str(ps):
                uid = store2pg(ps=ps, url=url, flag=flag)
            else:
                uid = None
            if uid:
                log.info('Enqueueing to jz_cj_parse')
                queue_job('main_dongfangcaifu.Parse', {
                    'url': url,
                    'flag': flag
                }, queue='jz_cj_parse')
        else:
            print('News already exists')
            log.info('Enqueueing to jz_cj_parse')
            queue_job('main_dongfangcaifu.Parse', {
                'url': url,
                'flag': flag
            }, queue='jz_cj_parse')
    except Exception as e:
        print(e)
        print('Requeueing')
        log.info('Enqueueing to jz_cj_pagesource')
        queue_job('main_dongfangcaifu.Crawler2', {
            'url': url,
            'flag': flag
        }, queue='jz_cj_pagesource')

def run(self, params):
    url = params['url']
    flag = params['flag']
    try:
        info = sess.query(Jz_lingkeqiche_PageSource).filter_by(
            url=url).first()
        if not info:
            ps = crawl(url)
            if len(ps) > 100:
                uid = store2pg(ps=ps, url=url, flag=flag)
            else:
                uid = None
            if uid:
                log.info('Enqueueing to jz_qymh_parse')
                queue_job('main_lingkeqiche.Parse', {
                    'url': url,
                    'flag': flag
                }, queue='jz_qymh_parse')
        else:
            print('News already exists')
            log.info('Enqueueing to jz_qymh_parse')
            queue_job('main_lingkeqiche.Parse', {
                'url': url,
                'flag': flag
            }, queue='jz_qymh_parse')
    except Exception as e:
        print(e)
        print('Requeueing')
        log.info('Enqueueing to jz_qymh_pagesource')
        queue_job('main_lingkeqiche.Crawler2', {
            'url': url,
            'flag': flag
        }, queue='jz_qymh_pagesource')

def run(self, params):
    params = params['post_data']
    # params = params['data']
    log.info(params)

    # Does the project already exist?
    same_job = has_same_job(params)
    log.info('has same job? %s', same_job)
    if same_job:
        if same_job['status'] == FINISH_STATUS and same_job[
                'url'] != '' and same_job['total_price'] != '':
            # Make a copy
            same_job['new_guid'] = copy_same_job(same_job, params)
            return {
                'data': same_job,
                'message': 'has the same job',
                'status': 0
            }

    # Add a new job
    job_guid = insert_new_job(params)

    self.connect()
    from packing.models import Project, PackDetail

    # Check whether this computation was already done
    project = Project.objects.filter(data_input=params['data']).last()
    if project:
        log.info('has the same project data')
        total_price = 0
        all_products = project.products.all()
        if project.comment != params['comment']:
            # Different description: create a new project
            project.comment = params['project_comment']
            project.pk = None
            project.save()
            for product in all_products:
                total_price += product.total_price
                project.products.add(product)
        else:
            for product in all_products:
                total_price += product.total_price

        url = '%s/product_detail/%d' % (HOST_URL, project.id)
        # Update the job status
        update_job_status(job_guid, FINISH_STATUS, url=url, price=total_price)
        insert_job_result(job_guid, all_products)
        return {
            'data': {
                'project_id': project.id,
                'url': url,
                'price': total_price
            },
            'message': 'the project had been done',
            'status': 0
        }

    res = shape_use(params)
    if res['is_error']:
        log.error(res['error_info'])
        update_job_status(job_guid, res['error_info'])
        return {'data': '', 'status': 10, 'message': res['error_info']}
    else:
        # Save the result
        # update_job_status(job_guid, u'正在保存结果')
        log.info('saving the result into project')
        try:
            project, total_price = save_project(Project, PackDetail,
                                                res['data'], params)
        except Exception as e:
            log.error(e)
            # Mark the job as failed
            update_job_status(job_guid, u'保存结果失败')
            return {
                'data': res,
                'message': 'error in save the result into project',
                'status': 100
            }

        log.info('update job status and finish')
        # Finalize the job status
        url = '%s/product_detail/%d' % (HOST_URL, project.id)
        update_job_status(job_guid, FINISH_STATUS, url=url, price=total_price)
        insert_job_result(job_guid, project.products.all())
        if project:
            res['new_project_id'] = project.id
            res['total_price'] = total_price
        return {'data': res, 'message': 'OK', 'status': 0}