Exemplo n.º 1
0
def getProxy(renew=False):
    global pIPs, globalProxyCount, pipObj
    # while 1:
    try:
        # count = 100
        if len(pIPs) < minPIPCount:
            # 代理ip太少,重新获取
            pIPs = getAvailableIPs()
        globalProxyCount = globalProxyCount + 1
        if globalProxyCount % 100 == 0 or renew:
            pipObj = random.choice(pIPs)
            print 'globalProxyCount:', str(
                globalProxyCount), ' change proxyIp to ', str(pipObj)
            pIPs.remove(pipObj)
            globalProxyCount = 0

        # randomPIpIndex = random.randint(0, len(pIPs) - 1)
        # pipObj = pIPs[randomPIpIndex]
        pIp = pipObj[0]
        pPort = pipObj[1]

        # del pIPs[randomPIpIndex]
        # pIPs.remove(pipObj)

        # 删除ip
        # deletByIP(pIp)
        proxy = {
            'http': 'http://%s:%s' % (pIp, pPort),
            'https': 'http://%s:%s' % (pIp, pPort)
        }
        return proxy
    except Exception as e:
        print 'get proxy exception: ', e
Exemplo n.º 2
0
def getNewBrowserArgs():
    """Pick a random proxy and user agent for a new PhantomJS session.

    A proxy entry is drawn at random from ``getAvailableIPs()`` and a user
    agent at random from ``USER_AGENTS``.

    Returns:
        A ``(service_args, caps)`` tuple: ``service_args`` is the list of
        PhantomJS command-line proxy flags, and ``caps`` is a capabilities
        dict carrying the chosen user agent.

    Raises:
        IndexError: If the proxy list or ``USER_AGENTS`` is empty
            (raised by ``random.choice``).
    """
    pIPs = getAvailableIPs()
    pipObj = random.choice(pIPs)
    # Entry layout as used here: index 0 is the address, index 1 the port.
    pIp = pipObj[0]
    pPort = pipObj[1]

    # Work on a copy: DesiredCapabilities.PHANTOMJS is a shared class-level
    # dict, and writing the user agent into it directly would leak the
    # setting into every other session built from it.
    caps = webdriver.DesiredCapabilities.PHANTOMJS.copy()
    caps["phantomjs.page.settings.userAgent"] = random.choice(USER_AGENTS)

    service_args = [
        '--proxy=' + pIp + ':' + str(pPort),
        '--proxy-type=http',
    ]

    return service_args, caps
Exemplo n.º 3
0
def reflashProxy(caps, driver, pIPs):
    """Restart the driver session with a randomly chosen proxy and user agent.

    Args:
        caps: Capabilities dict; updated in place with the new user agent
            and the proxy settings.
        driver: WebDriver instance whose ``start_session`` is called with
            the updated capabilities.
        pIPs: Pool of proxy entries (index 0 is the address, index 1 the
            port). Replaced by ``getAvailableIPs()`` when it holds fewer
            than ``minPIPCount`` entries.

    Returns:
        A ``(pIPs, pIp, randomPIpIndex)`` tuple: the (possibly refreshed)
        pool, the chosen proxy address, and its index within the pool.

    Raises:
        ValueError: If the pool is empty (raised by ``random.randint``).
    """
    if len(pIPs) < minPIPCount:
        # Too few proxy IPs left; fetch a fresh list.
        pIPs = getAvailableIPs()

    # random.randint is inclusive at both ends, so the upper bound must be
    # len(pIPs) - 1; using len(pIPs) could index one past the end.
    randomPIpIndex = random.randint(0, len(pIPs) - 1)
    pipObj = pIPs[randomPIpIndex]
    pIp = pipObj[0]
    pPort = pipObj[1]

    ua = random.choice(USER_AGENTS)
    caps["phantomjs.page.settings.userAgent"] = ua

    proxy = webdriver.Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    proxy.http_proxy = pIp + ':' + str(pPort)
    # Add the proxy settings to the capabilities used for the new session.
    proxy.add_to_capabilities(caps)
    driver.start_session(caps)
    return pIPs, pIp, randomPIpIndex
Exemplo n.º 4
0
    #     t = int(sys.argv[2])
    # qichachaFromIndustry(f,t)

    #从投资接口开始
    # fromInvestInt()

    #搜索页面
    # while 1:
    #     for length in range(10,11):
    #         try:
    #             startFromSearch(length)
    #         except Exception as e:
    #             print 'job fail, e:',traceback.format_exc()

    # 页面推荐入口
    pIPs = getAvailableIPs()

    while 1:
        try:
            count = 100
            if len(pIPs) < minPIPCount:
                # 代理ip太少,重新获取
                pIPs = getAvailableIPs()
            pipObj = random.choice(pIPs)
            # randomPIpIndex = random.randint(0, len(pIPs) - 1)
            # pipObj = pIPs[randomPIpIndex]
            pIp = pipObj[0]
            pPort = pipObj[1]

            # del pIPs[randomPIpIndex]
            pIPs.remove(pipObj)
Exemplo n.º 5
0
def tradMarkTestById(f, t):
    # from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

    # driver = webdriver.PhantomJS(executable_path=phantomPath,service_args=service_args)
    # caps = webdriver.DesiredCapabilities.PHANTOMJS

    ua = random.choice(USER_AGENTS)
    # caps["phantomjs.page.settings.userAgent"] = ua

    pIPs = getAvailableIPs()
    print 'start with ' + str(len(pIPs)) + ' proxy ips'

    # startWithDriver(driver, f, t)
    baseUrl = 'http://202.108.90.73/txnS03.do'
    count = 0
    startTime = time.time()
    lastCountTime = time.time()

    # pIPs, pIp, randomPIpIndex = reflashProxy(caps, driver, pIPs)

    pipOk = False

    maxCountPerProxy = 5
    nowCount = 0

    driver = None

    noNeedReStart = False

    for category in range(f, t + 1):

        if not pipOk or nowCount > maxCountPerProxy:
            # pIPs, pIp, randomPIpIndex = reflashProxy(caps, driver, pIPs)
            if len(pIPs) < minPIPCount:
                # 代理ip太少,重新获取
                pIPs = getAvailableIPs()
            pipObj = random.choice(pIPs)
            # randomPIpIndex = random.randint(0, len(pIPs))
            # pipObj = pIPs[randomPIpIndex]
            pIp = pipObj[0]
            pPort = pipObj[1]
            # ua = random.choice(USER_AGENTS)
            # caps["phantomjs.page.settings.userAgent"] = ua
            # proxy = webdriver.Proxy()
            # proxy.proxy_type = ProxyType.MANUAL
            # proxy.http_proxy = pIp + ':' + str(pPort)
            # # 将代理设置添加到webdriver.DesiredCapabilities.PHANTOMJS中
            # proxy.add_to_capabilities(caps)
            # driver.start_session(caps)

            if driver:
                try:
                    # driver.close()
                    driver.quit()
                except Exception as er:
                    print er
            caps = webdriver.DesiredCapabilities.PHANTOMJS

            ua = random.choice(USER_AGENTS)
            caps["phantomjs.page.settings.userAgent"] = ua

            service_args = [
                '--proxy=' + pIp + ':' + str(pPort),
                '--proxy-type=http',
            ]

            noNeedReStart = False

            nowCount = 0
        else:
            nowCount = nowCount + 1

        try:

            if not noNeedReStart:
                driver = webdriver.PhantomJS(executable_path=phantomPath,
                                             service_args=service_args,
                                             desired_capabilities=caps)

                driver.set_page_load_timeout(30)
                # driver.set_window_size(1366,768)
                driver.get(baseUrl)
            else:
                driver.refresh()

            pipOk = True

            catInputTag = driver.find_element_by_css_selector(
                '.inputbox input')
            catInputTag.send_keys(category)

            # nameInputTag = driver.find_element_by_css_selector('#mn')
            # nameInputTag.send_keys(word)

            submitTag = driver.find_element_by_css_selector('#_searchButton')
            submitTag.click()

            # print 'before submit url', driver.current_url
            time.sleep(0.1)
            windowsHandler = driver.window_handles
            if len(windowsHandler) < 2:
                print 'not open result page, skip'
                noNeedReStart = True
                continue
            driver.switch_to.window(windowsHandler[1])
            # print 'after submit switch to new tab,  url', driver.current_url
            list_box = driver.find_element_by_css_selector('.list_box')
            if not list_box:
                print 'no result list, content: ', driver.page_source
                noNeedReStart = True
                continue

            resList = list_box.find_elements_by_css_selector('tr')
            resultLength = len(resList)
            print 'result count:', resultLength - 1
            # if resultLength == 2:
            #     print driver.page_source

            for resultIndx in range(1, resultLength):
                resTr = resList[resultIndx]
                for link in resTr.find_elements_by_css_selector('a'):
                    link.click()

                    windowsHandler = driver.window_handles

                    if len(windowsHandler) < 2:
                        print 'not open result page, skip'
                        noNeedReStart = True

                        continue
                    driver.switch_to.window(windowsHandler[2])

                    # linkUr = urlparse.urljoin(baseUrl, link['href'])
                    print 'walking result urls , now :', driver.current_url
                    # driver.get(linkUr)

                    # print 'deatail page length: ', len(driver.page_source)

                    # datas = []
                    # for info in driver.find_elements_by_css_selector('.info'):
                    #     if(len(info.text) < 20):
                    #         datas.append(info.text)

                    # print 'datas:',repr(datas).decode("unicode-escape")
                    count = count + 1
                    if count % 10 == 0:
                        spentTime = time.time() - startTime
                        print 'count: ', count, ' took ', spentTime, ' seconds, avg_per_sec: ', count / spentTime, \
                            ' last 10 spent ', (time.time() - lastCountTime), ' secs'
                        lastCountTime = time.time()

                    # print 'close current detail windows'
                    # driver.close()

                    try:
                        driver.close()
                        # driver.quit()
                    except Exception as er:
                        print er

                    print 'switch to results page'
                    driver.switch_to.window(windowsHandler[1])

                    try:
                        driver.close()
                        # driver.quit()
                    except Exception as er:
                        print er
                    break
            # driver.quit()
            # driver.close()
            # windowsHandler = driver.window_handles
            # randomTime = random.randint(500, )
            # time.sleep(randomTime / 100)

            driver.switch_to.window(windowsHandler[0])
            # driver.close()
            # driver.quit()

            # try:
            #     driver.close()
            #     # driver.quit()
            # except Exception as er:
            #     print er
            # try:
            #     # driver.close()
            #     driver.quit()
            # except Exception as er:
            #     print er
            noNeedReStart = True

            print 'finish categoty:', category, ' reflash search page'
        except Exception as e:
            print 'id: ', category, ' error:', e
            print 'proxy ip:', pIp, ' port:', pPort

            # randomTime = random.randint(100, 500)
            # time.sleep(randomTime / 100)
            # print 'source: ', driver.page_source

            # driver.quit()
            try:
                driver.close()
                # driver.quit()
            except Exception as er:
                print er
            try:
                # driver.close()
                driver.quit()
            except Exception as er:
                print er
            noNeedReStart = False

            # del pIPs[randomPIpIndex]
            pIPs.remove(pipObj)
            # deletByIP(pIp)

            pipOk = False