def retry(self, url, retries, func, code=None):
    """Request ``url`` and return its decoded JSON body, retrying on API errors.

    The first ``code`` value found anywhere in the JSON body is used as the
    status; a negative status is a failure. After a ``-403`` the account is
    logged in again through ``Ey.RoboEasyLogin`` and ``self.cookie`` is
    refreshed; after any other negative status the method sleeps for 5-15
    seconds. The request is then issued again from inside this method.

    :param url: address to request.
    :param retries: number of attempts already made (callers pass 0).
    :param func: kept for backward compatibility and no longer invoked.
        The previous implementation retried by calling ``func`` and threw
        its result away, so the failed body was still returned to the
        caller.
    :param code: kept for backward compatibility; unused.
    :return: the parsed JSON body of the first response whose status is
        not negative.
    :raises ValueError: if the body carries no ``code`` field, or if the
        status is still negative once three retries have been used.
    """
    attempt = retries
    while True:
        reply = self.startRequest(url=url, retries=attempt)
        payload = json.loads(reply.content.decode("utf8", "ignore"))

        # jsonpath yields False (not an empty list) when nothing matches.
        found = jsonpath.jsonpath(payload, "$..code")
        if not found:
            raise ValueError("response for %s carries no status code" % url)
        webcode = found[0]
        self.Logger.info([url, "msg %d" % webcode])

        if webcode >= 0:
            print("SuccessCode %d " % webcode)
            return payload

        if attempt >= 3:
            raise ValueError("HTTPERROR OVER MAX RETRY TIME")

        if webcode == -403:
            print("%d Need login. Wait 5 sec" % webcode)
            time.sleep(5)
            Ey.RoboEasyLogin(self.key)
            print("Retry Login...%d " % attempt)
            time.sleep(5)
            self.cookie = Ey.getCookie(self.key)
        else:
            pause = random.randint(5, 15)
            self.Logger.error(
                "ErrorCode %d :Sleep %d sec ..." % (webcode, pause))
            time.sleep(pause)

        attempt += 1
def parse_detail(self, url, specify=0, **kwargs):
    """Scrape one detail page and yield an item per indicator column.

    :param url: page address to fetch.
    :param specify: 1 to extract only the indicator named by
        ``kwargs['objname']``; 0 to extract every indicator on the page.
    :param kwargs: ``objname`` is read when ``specify`` is 1,
        ``channelname`` when it is 0.
    :return: generator yielding the result of calling the populated item.
    :raises ValueError: if ``specify`` is neither 0 nor 1 (raised when the
        generator is first advanced).
    """
    from lxml import html

    item = CommonItem.commonItem()
    if specify not in (1, 0):
        raise ValueError("no such specify values! It must be 1 or 0")

    if specify == 1:
        self.headers = {
            'cookie': giveCookie(method='set', code=self.channelname[1]),
            'user-agent': userAgent.user_agent,
        }

    page = self.Request(url=url, method='GET', callback=None)
    tree = html.fromstring(page)

    periods = [
        EasyMethod.fuckMonthEnd(raw) for raw in tree.xpath(xpathRules.xtime)
    ]
    # The first node matched by the name rule is skipped.
    labels = [
        node.xpath("string()") for node in tree.xpath(xpathRules.xobj)[1:]
    ]
    # Units come wrapped in square brackets on the page; strip them.
    units = [
        re.sub(r"[\[\]]", "", raw) for raw in tree.xpath(xpathRules.xunit)
    ]

    if specify:
        positions = {label: pos for pos, label in enumerate(labels)}
        wanted = kwargs['objname'].split(":")[-1]
        columns = [positions[wanted.decode("utf8")]]
    else:
        columns = range(len(labels))

    for col in columns:
        # Data columns are addressed with an offset of 3 in the xpath rule.
        cells = tree.xpath(xpathRules.xdata.format(col + 3))
        item.data = EasyMethod.KeepNum(dict(zip(periods, cells)))
        if specify == 1:
            item.objname = kwargs['objname']
        else:
            item.objname = ("中国投资:" + kwargs['channelname'] + ":" +
                            labels[col])
        item.unit = units[col]
        item.plat = 6
        item.freq = 4
        item.mode = {
            "mode": "Z",
            "url": url,
            "code": self.channelname[1],
            "name": self.channelname[0]
        }
        yield item()
def parseMarketSeason(self, url_suffix, mod=2, **kwargs):
    """Collect market-share trend values and yield one series per name.

    For 2018 and then 2017, ``self.allow_domains[0] + url_suffix`` is
    queried once per window of ``step`` months. Every (name, value) pair of
    a response is staged in the Redis hash ``tmpyiche:<suffix>:<name>``
    under the window's ``toTime``. Afterwards each staged hash is yielded
    as a series and then deleted.

    :param url_suffix: path appended to the first allowed domain; its last
        ``/`` segment namespaces the staging keys.
    :param mod: integer 0-4 choosing the window: 0 or 1 -> 3 months
        (freq 5), 2 -> 1 month (freq 4), 3 or 4 -> 6 months (freq 5).
    :param kwargs: ``type`` selects the response layout: 1 reads
        ``series[*].data[*]`` entries (``name`` plus ``symbolSize``, falling
        back to ``value``), 0 reads ``yAxis[*].data`` and ``series[*].data``.
    :return: generator of ``(series_dict, {"param": param})`` tuples.
    :raises ValueError: if ``mod`` is not an integer in 0..4 (raised when
        the generator is first advanced).
    """
    if mod < 0 or mod > 4 or not isinstance(mod, int):
        raise ValueError("mod取值只可以是0到4的整数值")

    if mod < 2:
        step, freq = 3, 5
    elif mod == 2:
        step, freq = 1, 4
    else:
        step, freq = 6, 5

    this = self.allow_domains[0] + url_suffix
    objprefix = "易车指数##市场大盘##份额趋势(近%d个月均值)" % step + "##%s"
    suffix = url_suffix.split("/")[-1]
    tmp = "tmpyiche:" + suffix + ":%s"
    param = {"timeType": "month"}

    for year in (2018, 2017):
        # NOTE(review): the first window of a year starts at month 0, which
        # produces a fromTime such as "2018-00-01" -- confirm the endpoint
        # accepts that.
        for month in range(0, 13, step):
            if month + step > 12:
                continue
            try:
                param['fromTime'] = "%d-%02d-01" % (year, month)
                param['toTime'] = EasyMethod.fuckMonthEnd(year=year,
                                                          month=month + step)
                response = Yiche.startRequest(url=this, data=param)
                # "type" selects the response layout.
                if kwargs["type"] == 1:
                    objname = jsonpath(response, "$..series[*].data[*].name")
                    objdata = jsonpath(response,
                                       "$..series[*].data[*].symbolSize")
                    # jsonpath yields False when the path matches nothing.
                    if not objdata:
                        objdata = jsonpath(response,
                                           "$..series[*].data[*].value")
                elif kwargs["type"] == 0:
                    objname = jsonpath(response, "$..yAxis[*].data")[0]
                    objdata = jsonpath(response, "$..series[*].data")[0]
                print(objdata)
                # Explicit loop instead of a side-effect map(): pairs are
                # written eagerly, and unmatched names/values are dropped
                # rather than padded with None.
                for label, value in zip(objname, objdata):
                    if value:
                        self._Rconn.hset(tmp % label, param['toTime'],
                                         value.decode("utf8"))
            except Exception as e:
                # Best effort: a failed window is reported and skipped.
                print(e)

    for k in self._Rconn.keys("tmpyiche:%s*" % suffix):
        data = self._Rconn.hgetall(k)
        objname = objprefix % k.split(":")[-1]
        yield {
            "objname": objname,
            "data": data,
            "unit": "%",
            "freq": freq
        }, {
            "param": param
        }
        print("delete %s" % k)
        self._Rconn.delete(k)
def initAccount():
    """Log in every account listed under the ``robo:uname`` SSDB key.

    The stored value is parsed as a Python literal and iterated; each
    element is passed to ``EasyMethod.RoboEasyLogin`` together with ``2``.
    A stored value of the form ``['zzm', 'com', 'cwf']`` is what the
    parser expects.

    :raises ValueError: if the stored value is not a Python literal.
    """
    import ast

    from middles.middleAssist import ssdbAssist
    from middles.middleWare import EasyMethod

    store = ssdbAssist.SshSSDB().connect()
    # literal_eval accepts literals only, so a tampered value in the store
    # cannot execute code the way eval() would.
    usernames = ast.literal_eval(store.get("robo:uname"))
    for uname in usernames:
        EasyMethod.RoboEasyLogin(uname, 2)
def __init__(self, hkey="Robo"):
    """Store the endpoints, session cookie and logger used by the spider.

    :param hkey: account key; kept as ``self.key`` and passed to
        ``Ey.getCookie`` to obtain ``self.cookie``.

    The logger is named after the first command-line argument
    (``sys.argv[1]``), so constructing the object without one raises
    ``IndexError``.
    """
    # NOTE(review): `name` is a plain local and is never stored on the
    # instance -- confirm whether `self.name` was intended.
    name = "RoboSpider"

    self.key = hkey
    self.start_urls = 'https://gw.datayes.com/rrp_adventure/web/supervisor/macro/level/0'
    # Index 0 and index 1 are URL templates completed with `%`.
    self.urlContainer = [
        'https://gw.datayes.com/rrp_adventure/web/supervisor/macro/%s',
        'https://gw.datayes.com/rrp_adventure/web/dataCenter/indic/%s?compare=false'
    ]
    self.cookie = Ey.getCookie(hkey)

    log_factory = logAsisst.imLog(sys.argv[1])
    self.Logger = log_factory()
def Tubes(self, taskinfo):
    """Run one crawl task and return it enriched with the scraped data.

    :param taskinfo: task dict. ``plat_id`` is stored on the instance,
        ``obj_ext`` is the text of a Python dict literal containing a
        ``mode`` entry, and ``obj_name`` is forwarded to ``ModeOption``.
    :return: the same dict with ``report_time``, ``data`` and
        ``process_code`` added, or ``None`` when anything fails (the error
        is printed and logged, never raised).
    """
    import ast
    import datetime

    print(taskinfo)
    try:
        self.plat_id = taskinfo["plat_id"]
        # obj_ext arrives with the task; parse it as a literal so that its
        # content can never be executed the way eval() would allow.
        ext = ast.literal_eval(taskinfo["obj_ext"])
        dataflow = self.ModeOption(mode=ext['mode'],
                                   objname=taskinfo['obj_name'])
        taskinfo['report_time'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        taskinfo["data"] = EasyMethod.KeepNum(dataflow["data"])
        taskinfo['process_code'] = os.getpid()
        return taskinfo
    except Exception as e:
        # Worker boundary: report the failure and hand back None.
        print(e)
        self.Logger.error(["TubesError[%d]" % os.getpid(), e])
        return None
def parse(self, code, **kwargs):
    """Fetch one indicator by node code and build its item.

    :param code: node code, substituted into ``self.urlContainer[1]``.
    :param kwargs: ``retries`` -- attempts already made; defaults to 0
        when omitted.
    :return: the result of calling the populated ``RoboItem``.
    :raises ValueError: if the response contains no ``indic`` object.
    """
    response = self.retry(url=self.urlContainer[1] % code,
                          retries=kwargs.get("retries", 0),
                          code=code,
                          func=self.parse)
    items = RoboItem()

    # jsonpath yields False (not an empty list) when a path matches
    # nothing, so test the results instead of catching the TypeError that
    # zip() would raise.
    dates = jsonpath.jsonpath(response, "$..periodDate")
    values = jsonpath.jsonpath(response, "$..dataValue")
    if dates and values:
        items.data = dict(zip(dates, values))
    else:
        print("no periodDate/dataValue series in response for %s" % code)
        items.data = {}

    found = jsonpath.jsonpath(response, "$..indic")
    if not found:
        raise ValueError("no 'indic' object in response for %s" % code)
    source_msg = found[0]

    items.update_time = source_msg["updateTime"]
    items.unit = source_msg["unit"]
    # Equality (not identity) on purpose: 0 compares equal to False too.
    items.is_end = 1 if source_msg["isUpdate"] == False else 0
    items.start_time = source_msg["beginDate"]
    items.end_time = source_msg["periodDate"]
    items.source = source_msg["dataSource"]
    items.frequency = Ey.frequency2id(source_msg["frequency"])
    # The raw frequency text is kept only when it maps to id 100.
    items.value = source_msg["frequency"] if items.frequency == 100 else ""
    items.ext = {
        "region": source_msg["region"],
        "country": source_msg["country"],
        "name": source_msg["indicName"]
    }
    # 'pcode' is equivalent to the 'note' field.
    items.pcode = source_msg["statType"]
    return items()
def parseSales(self, code, name, **kwargs):
    """Yield sales rows for ``code`` for every table id from ``pid`` up to 6.

    One request is made per id. For each row of the returned ``tbody`` a
    ``(series_dict, {"param": request_params})`` tuple is yielded.

    The previous implementation tried to walk the ids by calling itself,
    but never iterated the generator that call created, so only the first
    id was ever fetched. Its ``yield 0`` guard also did not stop the
    out-of-range request. Both are handled by the loop below.

    :param code: value sent as ``value`` in the request parameters.
    :param name: prefix of every yielded ``objname``.
    :param kwargs: ``pid`` -- first table id to request (required).
    :return: generator of tuples as described above. If the starting
        ``pid`` is already greater than 6, the single value ``0`` is
        yielded and no request is made.
    :raises TypeError: if a response lacks the expected ``thead``/``tbody``
        data (jsonpath yields False, which cannot be indexed).
    """
    pid = kwargs['pid']
    if pid > 6:
        yield 0
        return

    url = self.allow_domains[0] + self.obj_urls[9]
    while pid <= 6:
        param9 = {"id": pid, "value": code}
        response = Yiche.startRequest(url, data=param9)
        objdetail = jsonpath(response, "$..thead[*].name")[0]
        objname = objdetail[2]
        # Keep only the digits of the period label before normalising it.
        objtime = EasyMethod.fuckMonthEnd(re.sub("[^0-9]", "", objdetail[0]))
        objdata = jsonpath(response, "$..tbody")[0]
        for obj in objdata:
            yield {
                "objname": "%s##%s##%s" % (name, objname, obj['name']),
                "data": {
                    objtime: obj['index']
                },
                "unit": "辆",
                "freq": 4
            }, {
                "param": param9
            }
        pid += 1
# Script prologue. Statement order matters here: each step prepares the
# import that follows it.

# Load the Slice definitions before importing GMQ (presumably this is what
# makes the `GMQ` module importable -- confirm against the Ice docs).
Ice.loadSlice("../util/gmqi.ice")
import GMQ

# Make the parent directory importable so that `util` resolves.
sys.path.append("../")
from util.gmqutil import sendMessagetoQueue
import signal
import json
from gevent.queue import Queue, Empty
import gevent.monkey

# Patch the socket module before the project modules below are imported.
gevent.monkey.patch_socket()
from middles.middleAssist import logAsisst
from tubes import RoboTubes
from middles.middleWare import EasyMethod

# Import-time side effect: log the "Robo" account in.
EasyMethod.RoboEasyLogin("Robo")

# Message-queue endpoint and queue names.
QUEUE_IP = '10.0.0.6'
QUEUE_PORT = 22345
READ_QUEUE_NAME = 'iMqDataSnatch_luobo_d'
DATA_COLLECT_QUEUE_NAME = 'iMqIMDataCollect'
# Proxy string; repeats the host and port given above.
IMMQ_PROXY = 'gmqObjectId:tcp -h 10.0.0.6 -p 22345'
LOGGER_NAME = "RoboCrawlGetTask"
lg = logAsisst.imLog(LOGGER_NAME)()


def signal_handler(signal, frame):
    """Announce the interrupt, destroy ``ic`` and exit with status 0.

    The parameters follow the ``(signum, frame)`` shape of a signal
    handler; neither is used. Note that the ``signal`` parameter shadows
    the ``signal`` module inside this function.
    """
    print('You pressed Ctrl+C!')
    # `ic` is not defined in this part of the file; it is expected to be a
    # module-level object created elsewhere -- TODO confirm.
    ic.destroy()
    sys.exit(0)