Example #1
class Job51Spider(Spider):
    def dispatch(self):
        self.bs = BinSaver("job51.bin")
        for i in range(45000000, 75000000):
            self.add_main_job(i)
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        print "job is ", jobid
        url = "http://search.51job.com/job/%d,c.html" % jobid
        res = self.request_url(url, [])
        if re.search(u'您选择的职位目前已经暂停招聘', res.text):
            print jobid, "match nothing"
        else:
            print "saving %d ..." % jobid
            self.bs.append('51job.%d' % jobid, res.text)
Example #2
class CourtCAPParser(CAPParser):
    def __init__(self, channel, dist_file, name, parser):
        CAPParser.__init__(self, channel, name)
        self.bin_writer = BinSaver(dist_file)
        self.parser = parser

    def parse(self, page):
        res = self.parser.parse(page['indexUrl'], page['content'][1])
        if res:
            return [res]
        return []

    def pre_save(self, saver):
        pass

    def on_save(self, items):
        for item in items:
            print 'saving', item['name']
            self.bin_writer.append(item['name'], item['value'])
Example #3
class ZGcpwswSpider2(Spider2):

    def __init__(self, thcnt, need_srl=True, qf_name=None):
        Spider2.__init__(self, thcnt)
        #
        self.ce_fs = FileSaver("court_queries/check_error")
        self.docbin_fs = BinSaver("ws_data/ws.%d.bin"% os.getpid())
        self.log_fs = FileSaver("log")
        #
        self.qf_name = qf_name
        self._name = "%s"% self.qf_name.split("/")[1]
        self.srl = {}
        self.need_srl = need_srl
        pass

    def init_jobs(self):
        with open(self.qf_name) as fs:
            for line in fs:
                job = eval(line.strip())
                count = job.get("count")
                if count > ZGcpwswData.total_max_record:
                    for i in ZGcpwswData.data_order:
                        for j in ZGcpwswData.order_direction:
                            for k in range(ZGcpwswData.page_max_index):
                                copy_job = copy.deepcopy(job)
                                copy_job["jobid"]["data"]["Index"] = k + 1
                                copy_job["jobid"]["data"]["Page"] = ZGcpwswData.page_max_count
                                copy_job["jobid"]["data"]["Direction"] = ZGcpwswData.order_direction[j]
                                copy_job["jobid"]["data"]["Order"] = ZGcpwswData.data_order[i]
                                self.add_job(copy_job)

                elif ZGcpwswData.total_core_record < count <= ZGcpwswData.total_max_record:
                    for j in ZGcpwswData.order_direction:
                        for k in range(ZGcpwswData.page_max_index):
                            copy_job = copy.deepcopy(job)
                            copy_job["jobid"]["data"]["Index"] = k + 1
                            copy_job["jobid"]["data"]["Page"] = ZGcpwswData.page_max_count
                            copy_job["jobid"]["data"]["Direction"] = ZGcpwswData.order_direction[j]
                            self.add_job(copy_job)

                elif 0 < count <= ZGcpwswData.total_core_record:
                    for k in range(ZGcpwswData.page_max_index):
                        copy_job = copy.deepcopy(job)
                        copy_job["jobid"]["data"]["Index"] = k + 1
                        copy_job["jobid"]["data"]["Page"] = ZGcpwswData.page_max_count
                        self.add_job(copy_job)

        print "=======finish loading job======"

    def run_job(self, jobid):
        time.sleep(0.1)
        if isinstance(jobid, dict):
            url = jobid.get("jobid").get("url")
            data = jobid.get("jobid").get("data")
            headers = jobid.get("jobid").get("headers")
            reg_count = int(jobid.get("count"))
            resp = None
            try:
                if self.need_srl:
                    nr = self.srl.get(getattr(self._tls, 'tid', 0))
                else:
                    nr = self.get_session_request()
                    self.set_cookie_passport(nr)
                # The court document site was upgraded, so two extra requests are needed before each real request to obtain the cookie passport
                resp = nr.request_url(url, data=data, headers=headers)
                if isinstance(resp, CurlReq.Response) and resp and resp.content:
                    result_list = json.loads(json.loads(resp.content))
                    if result_list:
                        # for record
                        ZGcpwswData.set_doc_count(data, len(result_list) - 1, self.log_fs)
                        # for record
                        for result in result_list:
                            if result.get("Count"):
                                new_count = int(result.get("Count"))
                                if new_count > reg_count:
                                    jobid["check_count"] = new_count
                                    self.ce_fs.append(json.dumps(jobid, ensure_ascii=False))
                            else:
                                name = '%s.%d' % (result.get(ZGcpwswData.doc_id), int(time.time()) )
                                self.docbin_fs.append(name, json.dumps(result, ensure_ascii=False))
                    else:
                        pass
                else:
                    # the request returned None (likely a network problem); the job gets re-added below
                    pass
            except Exception, e:
                # print "%s-%s"%(resp.text, data)
                pass

            time.sleep(1)
            self.re_add_job(jobid)
Example #4
class JobuiSpider(Spider):
    def __init__(self):
        self.proxies_dict = []
        self.read_proxy("../spider/proxy/proxy.txt")
        Spider.__init__(self, len(self.proxies_dict))
        self.num_count = 0
        self.__fail_ids = FileSaver("fail_ids.txt")

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        self.bs = BinSaver("jobui_job.bin")
        i = 133002626
        while i > 130000000:
            #131127307  131207901
            job = {"id": i, "retry_none": 0, "retry_500": 0}
            self.add_job(job, True)
            i -= 1
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv):
        fc = getattr(self._curltls, 'failcount', 0)
        if (addv):
            fc += addv
            setattr(self._curltls, 'failcount', fc)
        return fc

    def run_job(self, jobid):
        jobid_int = jobid.get("id")
        url = "http://www.jobui.com/job/%d/" % (jobid_int)
        tid = self.get_tid()
        proxies = self.proxies_dict[tid]
        #print "this request tid = [ %s ] proxies = [ %s ]" % (tid,proxies)
        res = self.request_url(url, proxies=self.proxies_dict[self.get_tid()])

        self.num_count += 1

        #print "id : %d ------------- response code : %s " % (jobid_int, "Response Is None" if res is None else str(res.code))

        if res is None:
            if self.get_fail_cnt(1) < 10:
                self.add_job(jobid)
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    tid, proxies)
                self.__fail_ids.append(str(jobid_int))
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (proxies, self.get_fail_cnt(0)))
            return
        else:
            setattr(self._curltls, 'failcount', 0)

        if res.code == 404:
            print "%d ------ 404" % jobid_int
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%d ------ %d " % (jobid_int, res.code)
            self.add_job(jobid)
            time.sleep(0.8)
            return
        elif res.code == 200:
            print "%d ------ saving " % jobid_int
            fn = 'jobui_job.%d.%d' % (jobid_int, int(time.time()))
            self.bs.append(fn, res.text)
            if self.bs.getsize() >= 8 * 1024 * 1024 * 1024:
                raise AccountErrors.NoAccountError('file too large')
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            Log.error("unknown error...")
            Log.errorbin("%s" % jobid_int, res.text)
            raise AccountErrors.NoAccountError('fatal error')

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0],
                                 msg)

    def read_proxy(self, fn):
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                m = re.match(r'(\d+\.\d+\.\d+\.\d+:\d+)', line, re.I)
                if m:
                    prstr = m.group(1)
                    proxies = {
                        'http': 'http://' + prstr + "/",
                        'https': 'https://' + prstr + "/"
                    }
                    self.proxies_dict.append(proxies)
                elif re.match('\s*#', line):
                    continue
        print " loaded [ %d ] proxis " % len(self.proxies_dict)
Example #5
class PageStoreBase(PageStoreDB):
    class CurDoc(object):
        def __init__(self, content, getime, jdid, real_url):
            self.cur_content = content
            self.cur_getime = getime
            self.cur_jdid = jdid
            self.cur_url = real_url

    def __init__(self, channel, dburl=None):
        super(PageStoreBase, self).__init__(channel, dburl)
        self.testmode = False
        opath = self.getopath()
        t = time.localtime()
        folder = "%s/%s/%d" % (opath, self.channel, t.tm_year)
        fnbase = "%s_%d%02d" % (self.channel, t.tm_year, t.tm_mon)
        os.system("mkdir -m 777 -p " + folder)
        self._ofn = "%s/%s.bin" % (folder, fnbase)
        self._ofnlog = "%s/%s_update.log" % (folder, fnbase)
        self.fssaver = BinSaver(self._ofn)
        self._hashcheck = spider.util.LocalHashChecker()
        self._docobjtls = threading.local()
        self.saved_count = 0

    def getopath(self):
        dirs = ['/data/crawler/_files3_', '/opt/_test_store_']
        for di in dirs:
            if os.path.isdir(di) and os.access(di, os.W_OK):
                return di
        raise RuntimeError("no dir to write files.")

    def get_cur_doc(self):
        return getattr(self._docobjtls, 'doc', None)

    def set_cur_doc(self, content, getime, jdid, real_url):
        doc = PageStoreBase.CurDoc(content, getime, jdid, real_url)
        setattr(self._docobjtls, 'doc', doc)

    @staticmethod
    def mktime(year=2015, m=1, d=1, hour=0, minute=0, second=0):
        arr = [year, m, d, hour, minute, second, 0, 0, 0]
        for i in range(0, len(arr)):
            arr[i] = int(arr[i])
        return time.mktime(arr)

    def extract_content(self):
        raise NotImplementedError('virtual function called')

    def page_time(self):
        raise NotImplementedError('virtual function called')

    def check_should_fetch(self, jdid):
        indexUrl = "%s://%s" % (self.channel, jdid)
        return not self.find_new(indexUrl)

    def save_time_log(self, indexUrl, cur_tm):
        oldtime = self.get_page_time(indexUrl)
        if oldtime == cur_tm:
            return
        logstr = "%s %ld => %ld\n" % (indexUrl, oldtime, cur_tm)
        cutil.mp_append_log(self._ofnlog, logstr)

    def save(self, getime, jdid, real_url, content, fnpath=None, offset=None):
        global MIN_TIME_MSEC
        if getime > MIN_TIME_MSEC:
            raise RuntimeError("get time muse be in seconds.")
        if self._hashcheck.query(jdid) > 0:
            return True
        self.set_cur_doc(content, getime, jdid, real_url)

        try:
            pageDesc = self.extract_content()
            if not pageDesc:
                print "jdid: %s, pageDesc empty" % self.get_cur_doc().cur_jdid
                return False
            elif self.testmode:
                print pageDesc
            pageTime = self.page_time()
            if pageTime is None or pageTime < MIN_TIME_MSEC:
                raise RuntimeError("page time must be in msec")
            if isinstance(pageTime, float):
                pageTime = int(pageTime)
            if isinstance(pageDesc, unicode):
                pageDesc = pageDesc.encode('utf-8')
            contentSign = hashlib.md5(pageDesc).hexdigest()
            indexUrl = "%s://%s" % (self.channel, jdid)

            self.save_time_log(indexUrl, pageTime)
            # If an entry with this contentSign already exists, only its timestamps are updated
            # and the page is not saved into the bin file again; otherwise upsert by indexUrl.
            if self.find_item(indexUrl, contentSign):
                Log.warning("%s exists in db, skip" % jdid)
                self.update_time(indexUrl, contentSign,
                                 int(getime) * 1000, pageTime)
                return True
            print "saving", indexUrl
            odoc = {
                'contentSign': contentSign,
                'indexUrl': indexUrl,
                'realUrl': real_url,
                'createTimeFlag': 1,
                'owner': self.channel,
                'createTimeTimeStamp': pageTime,
                'crawlerUpdateTime': int(getime) * 1000,
                'updateTime': pageTime,
                'status': 0,
                'isUpdated': 0,
                'isExpired': 0,
            }
            if self.testmode:
                pprint.pprint(odoc)
                return True
            else:
                if self.do_save(odoc, content, fnpath, offset):
                    print indexUrl, "saved"
                    self.saved_count += 1
                    self._hashcheck.add(jdid)
                    return True
                return False
        except Exception as e:
            print e
            traceback.print_exc()
            Log.error("failed to save %s %s" % (self.channel, jdid))
            time.sleep(5)
            return False

    def do_save(self, odoc, content, fnpath=None, offset=None):
        if isinstance(content, unicode):
            content = content.encode('utf-8')
        filepos = self.fssaver.append(
            "%s.%s.%d" % (self.channel, self.get_cur_doc().cur_jdid,
                          self.get_cur_doc().cur_getime), content)
        odoc.update({'pageContentPath': "binf::%s::%d" % (self._ofn, filepos)})
        return self.upsert_doc(odoc['indexUrl'], odoc)
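
Note: in do_save above, BinSaver.append returns the file offset of the record it just wrote, and that offset is stored in the document as a "binf::<path>::<offset>" reference in pageContentPath. The helper below is a minimal sketch (not part of the original code) of how such a reference could be split back into its parts, assuming the path itself contains no "::"; the example path is purely illustrative.

def parse_binf_ref(ref):
    # "binf::<path>::<offset>" -> (path, offset)
    prefix, path, offset = ref.split("::")
    if prefix != "binf":
        raise ValueError("not a binf reference: %s" % ref)
    return path, int(offset)

# e.g. parse_binf_ref("binf::/data/crawler/_files3_/demo/2015/demo_201501.bin::4096")
# -> ("/data/crawler/_files3_/demo/2015/demo_201501.bin", 4096)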
Example #6
class QycxbSpider(Spider):
    def __init__(self, threadcnt):
        Spider.__init__(self, threadcnt)
        self.sqs = {}
        self.binsaver = BinSaver("Qycxb" + str(time.time()).split(".")[0] + ".bin")

    def init_req(self):
        with self.locker:
            threadident = str(threading.currentThread().ident)
            sq = QycxbReq()
            # sq.load_proxy("../../_zhilian/curproxy0")
            # sq.load_proxy("../_zhilian/curproxy")
            # sq.select_user_agent("firefox")
            sq.default_headers = {"Connection": "keep-alive",
                                  "Accept": r"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                                  "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                                  "Accept-Encoding": "gzip, deflate",
                                  "Referer":"http://qiye.qianzhan.com/",
                                  "X-Requested-With":"XMLHttpRequest",
                                  "User-Agent": r"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0",
                                  "Pragma":"no-cache",
                                  "Cache-Control":"no-cache",
                                  "Content-Type":"application/x-www-form-urlencoded; charset=UTF-8"}
            # con = sq.request_url("http://qiye.qianzhan.com/")
            con1 = sq.request_url(r"http://qiye.qianzhan.com/orgcompany/searchList",
                                  data=r"oc_name=%E5%B9%BF%E5%B7%9E%E5%B8%82%E5%8D%97%E6%B2%99%E5%8D%8E%E5%B7%A5%E7%A0%94%E7%A9%B6%E9%99%A2&oc_area=&sh_searchType=1&od_orderby=0&page=1&pageSize=10")
            self.sqs[threadident] = sq
            setattr(self._curltls, "sq", sq)
            return sq

    def dispatch(self):
        f = open("/home/peiyuan/r1.txt", "rb")
        currline = 0
        skip = 0
        endline = 1000
        while currline < skip:
            line = f.readline()
            currline += 1

        while currline < endline:
            line = f.readline()
            key = line.strip().split(" ")[-1].strip()
            job = {"key": key, "type": "u1", "lineno": currline}
            self.add_main_job(job)
            currline += 1
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, job):
        time.sleep(5)
        threadident = str(threading.currentThread().ident)
        sq = getattr(self._curltls, "sq",None)
        if sq is None:
            sq = self.init_req()
        Log.info("Running job:" + util.utf8str(job.__str__()))
        if job["type"] == "u1":
            Log.info("Searching line %d" % job["lineno"])
            con = sq.request_url(r"http://qiye.qianzhan.com/orgcompany/searchList",
                                 data={"oc_name": job["key"], "od_orderby": 0, "page": 1,
                                       "pageSize": 10, "oc_area": "",
                                       "sh_searchType": 1})
            if con is None or con.text.strip() == "" or con.code != 200:
                Log.error("[u1]Bad connect or empty content return.. JOB=>" + util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            else:
                jsonobj = ""
                try:
                    jsonobj = json.loads(con.text.strip())
                except ValueError as e:
                    Log.error("Json decode error. String is %s" % con.text)
                    return
                if not jsonobj["succ"]:
                    Log.warning(jsonobj.__str__())
                    time.sleep(1)
                    Log.error("[u1]Request fail, succ flag is False. JOB=>" + util.utf8str(job.__str__()))
                    if 'status' in jsonobj and jsonobj['status'] == '4':
                        Log.error("Remove current proxy...Used %d times....." % sq._proxy_use_times[sq.curproxy])
                        sq.remove_curproxy()
                    self.re_add_job(job)
                else:
                    corplist = jsonobj["lst"]
                    if len(corplist) == 0:
                        Log.error("Search return nothing. %d:%s, no data." % (job["lineno"], job["key"]))
                        return
                    else:
                        for corp in corplist:
                            jobb = {"type": "u2", "orgCode": corp["oc_orgCode"], "name": corp["oc_name"]}
                            self.add_job(jobb)

        if job["type"] == "u2":
            Log.info("Getting detail info about %s" % job["name"])
            timestr = "%f" % time.time()
            con0 = sq.request_url(r"http://qiye.qianzhan.com/orgcompany/GetJsVerfyCode?t=0.%s&_=%s" % (
                timestr.split(".")[1], timestr.split(".")[0]))
            if con0 is None or con0.text.strip() == "" or con0.code != 200:
                Log.error("[u2]Bad connect or empty content return.. JOB=>" + util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            if not os.path.exists(threadident):
                os.mkdir(threadident)
            f = open(threadident + "/qycxb.js", "w+b")
            f.write(r'var window = {document : {cookie :"qznewsite.uid=' + sq.get_cookie(
                    "qznewsite.uid").strip() +'"}};  ' + con0.text + "console.log(window.__qzmcf())")
            f.flush()
            os.system("nodejs " + threadident + "/qycxb.js > " + threadident + "/mcfcode.txt")
            mcfcode = open(threadident + "/mcfcode.txt", "rb").read().strip()
            con1 = sq.request_url("http://qiye.qianzhan.com/orgcompany/SearchItemDtl",
                                  data={"mcfCode": mcfcode, "orgCode": job["orgCode"]})
            if con1 is None or con1.text.strip() == "" or con1.code != 200:
                Log.error("[u2]Bad connect or empty content return.. JOB=>" + util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            else:
                jsonobj = json.loads(con1.text.strip())
                if not jsonobj["succ"]:
                    Log.warning(jsonobj.__str__())
                    time.sleep(1)
                    Log.error(
                            "[u2]Request fail, succ flag is False.Check the orgcode and mcfcode. JOB=>" + util.utf8str(
                                    job.__str__()))
                    if 'status' in jsonobj and jsonobj['status'] == '4':
                        Log.error("Remove current proxy...Used %d times....." % sq._proxy_use_times[sq.curproxy])
                        sq.remove_curproxy()
                    self.re_add_job(job)
                else:
                    self.binsaver.append(job["name"] + job["orgCode"], con1.text.strip())
                    Log.info("%s,%s,saved." % (job["name"], job["orgCode"]))
                    return
Example #7
class EWTSpider(Spider):
    def __init__(self, thcnt):
        Spider.__init__(self, thcnt)
        self._user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:42.0) Gecko/20100101 Firefox/42.0'
        self.baseurl = 'http://www.ewt360.com/LNLQXX/SearchResult?act=mark'
        self.headers = {'Referer': self.baseurl}
        #scores = range(450,750+1) + range(449, 0, -1) + [0]
        scores = range(750, 0, -1) + [0]
        self.possmap = {
            'Years': range(2009, 2014 + 1),
            'WL': ['l', 'w'],
            'BZ': ['b', 'z'],
            'PiCi': 0,
            'Score': scores,
            'ProvinceCode': 0,
            'page': 1
        }
        self.bs = BinSaver("fo.bin")
        self.racer = RaceValueByKey()
        ##stacktracer.trace_start("trace.html")

    def doLogin(self):
        self.cookies = {}
        self.request_url(
            'http://passport.ewt360.com/login/prelogin?callback=cb&sid=2&username=BOBOYI&password=1122333&fromurl=%2F&code=&isremember=1'
        )
        print self.cookies
        return len(self.cookies.keys())

    def dispatch(self):
        self.racer.getValue('login', lambda v: self.doLogin())
        ## load saved list.
        savedlist = {}
        try:
            with open("ks") as f:
                for lines in f:
                    savedlist[lines.strip()] = 1
        except Exception as e:
            pass
        a = AllPossibilities(self.possmap, ['WL', 'BZ', 'Years', 'Score'])
        for i in a.all():
            keys = ['WL', 'BZ', 'Years', 'Score', 'page']
            ss = "%s.%s.%s.%s.%s" % (i[keys[0]], i[keys[1]], i[keys[2]],
                                     i[keys[3]], i[keys[4]])
            if ss not in savedlist:
                self.add_job({'tp': 'mj', 'v': i}, True)
        self.wait_q()
        self.add_job(None, True)

    def dispatch2(self):
        with open('errlog.txt') as f:
            for lines in f:
                jm = json.loads(lines.strip())
                self.add_job({'tp': 'mi', 'v': jm}, True)
        self.wait_q()
        self.add_job(None, True)

    def run_job(self, jobid):
        if isinstance(jobid, dict):
            self.dump_jobid(jobid)
            xxvalue = self.racer.oldValue('login')
            con = self.request_url(self.baseurl,
                                   params=jobid['v'],
                                   headers=self.headers)
            if con is None:
                return self.run_job(jobid)
            if re.search(u'<title>登录系统</title>', con.text) or re.search(
                    u'您的账号未登陆或超时,请重新登', con.text):
                self.racer.delValueChecked('login', xxvalue)
                self.racer.sleepAlign(10)
                print "=======================relogin==================="
                self.racer.getValue('login', lambda v: self.doLogin())
                return self.run_job(jobid)
                #raise RuntimeError("need login")
            if jobid['tp'] == 'mj':
                m = re.search(ur'page=(\d+)[^<>]*>尾页', con.text)
                if m:
                    lp = int(m.group(1))
                    for page in range(2, lp + 1):
                        v2 = copy.deepcopy(jobid['v'])
                        v2['page'] = page
                        self.add_job({'tp': 'mi', 'v': v2})
            if jobid['tp'] == 'mj' or jobid['tp'] == 'mi':
                key = json.dumps(jobid['v'], ensure_ascii=0).encode('utf-8')
                self.bs.append(key, con.text)
Example #8
class ParseBin:
    def __init__(self):
        self.nfs = BinSaver("parsed.bin")
        self.nks = {}
        self.errlog = FileSaver("errlog.txt")

    def get_nkey(self, jn):
        keys = ['WL', 'BZ', 'Years', 'Score', 'page']
        ss = "%s.%s.%s.%s.%s" % (jn[keys[0]], jn[keys[1]], jn[keys[2]],
                                 jn[keys[3]], jn[keys[4]])
        return ss

    def save(self, k, v):
        if k in self.nks:
            return True
        self.nks[k] = 1
        self.nfs.append(k, v)

    def isRed(self, col):
        hcode = html.tostring(col)
        m = re.search(ur'color\s*:\s*Red', hcode, re.I)
        if m:
            return True
        return False

    def go_(self, fr):
        while True:
            n, v = fr.readone()
            if n is None:
                return
            jn = json.loads(n)
            nkey = self.get_nkey(jn)
            print nkey
            if '系统检索不到您所查询的相关信息' in v:
                self.save(nkey, 'None')
                continue
            try:
                doc = html.fromstring(v)
                tbl = doc.xpath("//table[@id='tablecloth']")[0]

                otbl = []
                rowno = 0
                for rows in list(tbl):
                    rowno += 1
                    if rowno == 1:
                        continue
                    currow = []
                    colid = 0
                    for cols in rows:
                        colid += 1
                        t = re.sub(ur'\s+', u' ', cols.text_content().strip())
                        if colid == 4 and self.isRed(cols):
                            t += ".red"
                        currow.append(t)
                    otbl.append(currow)
                #print nkey, json.dumps(otbl, ensure_ascii=0).encode('utf8')
                self.save(nkey,
                          json.dumps(otbl, ensure_ascii=0).encode('utf8'))
            except Exception as e:
                print v
                raise

    def go(self):
        fns = ['fo.bin']
        for fn in fns:
            fr = BinReader(fn)
            self.go_(fr)
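
Examples #7 and #8 together show the write/read pair used throughout these examples: BinSaver.append(name, value) writes a named record, and BinReader.readone() returns (name, value) pairs until the name comes back as None at end of file. The snippet below is a minimal round-trip sketch under exactly those assumptions; the module imports are omitted because the excerpts above do not show them, and the sample record name and payload are made up.

def write_records(path, records):
    # records: iterable of (name, value) pairs
    bs = BinSaver(path)
    for name, value in records:
        bs.append(name, value)

def read_records(path):
    fr = BinReader(path)
    while True:
        name, value = fr.readone()
        if name is None:  # end of file, as in ParseBin.go_ above
            return
        yield name, value

# write_records("demo.bin", [("job.1", "<html>...</html>")])
# for name, value in read_records("demo.bin"):
#     print name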
Example #9
File: gs1.py  Project: wolfwhoami/xxxxx
class GSSPider(Spider):
    def __init__(self, tc):
        Spider.__init__(self, tc)
        self._logport = 5556
        # self.channel = 'gsid'
        # self.job_queue = 'gsid'
        self.savebin = BinSaver("gongshang.bin")
        self.faillog = open("fail_list.txt", "w+b")

    def loadhans(self, fn):
        c = ''
        with open(fn) as f:
            c = f.read().decode('utf-8')
        all = {}
        for i in c:
            if ord(i) > 0x400:
                all[i] = 1
        cc = all.keys()
        if len(cc) == 0:
            raise RuntimeError("no hans loaded")
        # print json.dumps(cc, ensure_ascii=False).encode('utf-8')
        return cc

    def query_summary(self, areacode, qword, page=1):
        # hashStr = hashlib.md5(str(time.time())).hexdigest()
        # print hashStr
        # CookieStr = hashStr[0:8]+"-"+hashStr[9:13]+"-"+hashStr[14:18]+"-"+hashStr[19:23]+"-"+hashStr[24:32]
        # print "COOKIE:%s" % CookieStr
        CookieStr = "E1F3418D-BDC7-468D-9F43-6EA13A642356"

        headers = {
            'User-Agent':
            'Mozilla/5.0 (iPhone;8.0.2;iPhone;iPhone);Version/1.1;ISN_GSXT',
            "Cookie": CookieStr
        }
        data = {'AreaCode': areacode, 'Limit': 50, 'Page': page, 'Q': qword}
        # print data
        con = self.request_url('https://120.52.121.75:8443/QuerySummary',
                               headers=headers,
                               data=data,
                               verify=False)
        # try:
        if con is None:
            Log.error("query %s, connect failed! " % qword)
            raise ConnectFailError("query %s, connect failed! " % qword)
        if "502 Bad Gateway" in con.text:
            Log.error("query %s, 502! " % qword)
            raise BadGatewayError("query %s, 502! " % qword)
        j = json.loads(con.text)
        if j.get('ERRCODE') == '0':
            rs = j["RESULT"]
            if len(rs) == 0:
                Log.error("query %s, no data!" % qword)
            return rs
        else:
            Log.error("query %s, request error! Response: %s" % (qword, j))
            raise RequestError("query %s, request error! " % qword)
            # except Exception as e:
            #     print e
            #     return None

    def query_info(self, areacode, regNo, page=1):
        CookieStr = "E1F3418D-BDC7-468D-9F43-6EA13A642356"

        headers = {
            'User-Agent':
            'Mozilla/5.0 (iPhone;8.0.2;iPhone;iPhone);Version/1.1;ISN_GSXT',
            "Cookie": CookieStr
        }
        data = {
            'AreaCode': areacode,
            'Limit': 50,
            'Page': page,
            'Q': regNo,
            'EndNo': regNo
        }
        # print data
        con = self.request_url('https://120.52.121.75:8443/QuerySummary',
                               headers=headers,
                               data=data,
                               verify=False)
        # try:
        if con is None:
            Log.error("query %s, connect failed! " % regNo)
            raise ConnectFailError("query %s, connect failed! " % regNo)
        if "502 Bad Gateway" in con.text:
            Log.error("query %s, 502! " % regNo)
            raise BadGatewayError("query %s, 502! " % regNo)
        j = json.loads(con.text)
        if j.get('ERRCODE') == '0':
            rs = j["RESULT"]
            if len(rs) == 0:
                Log.error("query %s, no data!" % regNo)
            return rs
        else:
            Log.error("query %s, request error! Response: %s" % (regNo, j))
            raise RequestError("query %s, request error! " % regNo)

    def save_qs(self, qs, qid, areacode, qw):
        for t in qs:
            print json.dumps(t, ensure_ascii=0, indent=4).encode('utf-8')
            t['qinfo'] = {'qid': qid, 'areacode': areacode, 'qw': qw}
            # self.col.update({'REGNO':t['REGNO'], 'ENTNAME':t['ENTNAME']}, t, True)

    def dispatch_hans1(self):
        self.hans = self.loadhans('hans')
        for i in self.hans:
            for k in area_data:
                # self.job_queue.put({'qw':i, 'areacode':k[0], 'tp':'w'})
                pass

    def dispatch(self):
        with open("r1k.txt", 'r') as f:
            for lines in f:
                m = re.match(u'\d+\s+(\d+)\s+(.*)', lines)
                if m:
                    code = int(m.group(1))
                    l = m.group(2)
                    if code == 0:
                        for k in area_data:
                            self.add_main_job({
                                'qw': l,
                                'areacode': k[0],
                                'tp': 'w'
                            })
                    else:
                        self.add_main_job({
                            'qw': l,
                            'areacode': code,
                            'tp': 'w'
                        })

        self.wait_q()
        self.job_queue.put(None)

    def run_job(self, job):
        self.faillog.write("%s failed.\n" % job)
        if isinstance(job, dict):
            tp = job.get('tp')
            areacode = job.get('areacode')
            qid = -1
            if tp == 'w':
                qw = job['qw']
            if tp == 'w2':
                qid = job.get('qid')
                xxx = qid / len(self.hans)
                yyy = qid % len(self.hans)
                if xxx >= yyy:
                    return True
                qw = self.hans[xxx] + self.hans[yyy]

            if tp == 'w' or tp == 'w2':
                page = job.get('page', 1)
                print qw, areacode, page
                # self.savebin.append(qw)
                try:
                    qs = self.query_summary(areacode, qw, page)
                except HttpError:
                    readd_count = job.get("readd_count", "0").strip()
                    if int(readd_count) < 10:
                        job["readd_count"] = str(int(readd_count) + 1)
                        Log.warning("readd job %s" % qw)
                        self.re_add_job(job)
                    else:
                        Log.error("job %s has run 30 times. " % qw)
                        self.faillog.write("%s failed.\n" % job)
                    time.sleep(1)
                    return
                self.savebin.append(qw, qs.__str__())
                Log.warning("%s saved" % qw)
            time.sleep(1)
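
All of the spiders above follow the same skeleton: pass a thread count to Spider.__init__, enqueue jobs from dispatch() and finish it with wait_q() plus a None sentinel, handle one job per run_job() call, and put retryable failures back on the queue with re_add_job(). The class below is a stripped-down sketch of that pattern using only calls that appear in the examples; the job ids, URL and output filename are illustrative, not from any of the original projects.

class MinimalBinSpider(Spider):
    def __init__(self, threadcnt):
        Spider.__init__(self, threadcnt)
        self.bs = BinSaver("minimal.bin")      # named-record output, as in the examples above

    def dispatch(self):
        for i in range(100):                   # illustrative job ids
            self.add_main_job({"id": i})
        self.wait_q()                          # wait for the queue to drain
        self.add_main_job(None)                # sentinel: no more jobs

    def run_job(self, job):
        url = "http://example.com/item/%d" % job["id"]   # hypothetical URL
        res = self.request_url(url)
        if res is None or res.code != 200:
            self.re_add_job(job)               # retry on failure
            return
        self.bs.append("item.%d" % job["id"], res.text)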