Example #1
File: worker.py Project: miku/brozzler
    def _warcprox_write_record(
            self, warcprox_address, url, warc_type, content_type,
            payload, extra_headers=None):
        headers = {"Content-Type":content_type,"WARC-Type":warc_type,"Host":"N/A"}
        if extra_headers:
            headers.update(extra_headers)
        request = urllib.request.Request(url, method="WARCPROX_WRITE_RECORD",
                headers=headers, data=payload)

        # XXX setting request.type="http" is a hack to stop urllib from trying
        # to tunnel if url is https
        request.type = "http"
        request.set_proxy(warcprox_address, "http")

        try:
            with urllib.request.urlopen(request, timeout=600) as response:
                if response.getcode() != 204:
                    self.logger.warning(
                            'got "%s %s" response on warcprox '
                            'WARCPROX_WRITE_RECORD request (expected 204)',
                            response.getcode(), response.reason)
                return request, response
        except urllib.error.HTTPError as e:
            self.logger.warning(
                    'got "%s %s" response on warcprox '
                    'WARCPROX_WRITE_RECORD request (expected 204)',
                    e.getcode(), e.info())
            return request, None
        except urllib.error.URLError as e:
            raise brozzler.ProxyError(
                    'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
        except ConnectionError as e:
            raise brozzler.ProxyError(
                    'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
Example #2
def login(url, username, password, timeout, proxy):
    global userAgent

    url = urllib.parse.urljoin(url, "/wp-login.php/")
    form = "log={}&pwd={}".format(username, password)
    form = bytes(form, "utf-8")
    headers = {"User-Agent": random.choice(userAgent)}

    try:
        request = urllib.request.Request(url, data=form, headers=headers)

        if proxy is not None:
            request.set_proxy(proxy, ["http", "https"])

        # NB: `context` is assumed to be a module-level ssl.SSLContext defined elsewhere in the script
        with urllib.request.urlopen(request, timeout=timeout,
                                    context=context) as response:
            if re.search("wp-admin", response.url):
                return password
            else:
                return False
    except urllib.error.URLError as error:
        log.critical(error)
        os._exit(0)
    except Exception as error:
        log.critical(error)
        os._exit(0)
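When one proxy must cover both http and https, the documented route is a ProxyHandler-based opener rather than set_proxy, which takes a single scheme. A minimal sketch with a placeholder proxy URL:

import urllib.request

# Map each scheme to a proxy URL (hypothetical address).
handler = urllib.request.ProxyHandler({
    "http": "http://127.0.0.1:8000",
    "https": "http://127.0.0.1:8000",
})
opener = urllib.request.build_opener(handler)
with opener.open("https://example.com/", timeout=10) as response:
    body = response.read()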
Example #3
def multi_urlopen(str_Url, Post_Data=None, timeout=30, retryTime=3, proxy="", debug_flag=False):
    """Try up to retryTime times to fetch the given URL via GET or POST; Post_Data does not need to be urlencoded."""
    for i_multi in range(retryTime):
        try:
            request = urllib.request.Request(str_Url)
            request.add_header("User-Agent",
                               "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
            request.add_header("Referer", str_Url)
            if str(proxy) != "":
                request.set_proxy(str(proxy), "http")
            if Post_Data is None:
                response_content = urllib.request.urlopen(request, timeout=timeout)
            else:
                # urlopen() expects bytes for POST data, so encode the urlencoded form
                data = urllib.parse.urlencode(postdata2dict(Post_Data)).encode("utf-8")
                response_content = urllib.request.urlopen(request, data, timeout=timeout)
        except Exception as e_m:
            if debug_flag:
                print("Open", str_Url, "error at", i_multi + 1, "time(s). Error info:\r\n",
                      str(e_m), "\r\ntry again...\r\n")
        else:
            return response_content
    return None  # All retries exhausted without a valid response; return None.
Example #4
def get_last_by_date(action, date):
    url = "https://bt.group-ib.com/?module=get&action=get_last&date={0}&type={1}".format(
        date, action)

    log(">>>>Taking 'last' value from server by date {0}".format(date))

    headers = {
        "Accept": "application/json",
        "X-Auth-Login": API_USER,
        "X-Auth-Key": API_KEY
    }

    request = urllib.request.Request(url)

    if WITH_PROXY:
        proxy_host = PROXY_ADDRESS + ':' + PROXY_PORT
        request.set_proxy(proxy_host, PROXY_PROTOCOL)

    for key, value in headers.items():
        request.add_header(key, value)

    # NB: _create_unverified_context() is a private ssl helper that disables TLS certificate verification
    gcontext = ssl._create_unverified_context()
    handle = urllib.request.urlopen(request, context=gcontext)
    response = handle.read().decode('utf-8')

    result = json.loads(response)

    log("<<<<Got 'last' value: {0}".format(result["data"]))

    return result["data"]
Example #5
def _warcprox_write_record(
        warcprox_address, url, warc_type, content_type,
        payload, location=None, extra_headers=None):
    headers = {"Content-Type": content_type, "WARC-Type": warc_type, "Host": "N/A"}
    if location:
        headers['Location'] = location
    if extra_headers:
        headers.update(extra_headers)
    request = urllib.request.Request(url, method="WARCPROX_WRITE_RECORD",
                                     headers=headers, data=payload)

    # XXX setting request.type="http" is a hack to stop urllib from trying
    # to tunnel if url is https
    request.type = "http"
    if warcprox_address:
        request.set_proxy(warcprox_address, "http")
        logger.debug("Connecting via "+warcprox_address)
    else:
        logger.error("Cannot write WARC records without warcprox!")
        return

    try:
        with urllib.request.urlopen(request) as response:
            if response.status != 204:
                logger.warning(
                    'got "%s %s" response on warcprox '
                    'WARCPROX_WRITE_RECORD request (expected 204)',
                    response.status, response.reason)
    except urllib.error.HTTPError as e:
        logger.warning(
            'got "%s %s" response on warcprox '
            'WARCPROX_WRITE_RECORD request (expected 204)',
            e.getcode(), e.info())
Example #6
def download(url,
             timeout=10,
             user_agent='dyspider',
             proxy=None,
             num_retries=5):
    """
    通用网页源码下载函数
    :param url: 要下载的url
    :param timeout: 请求超时时间,单位/秒。可能某些网站的反应速度很慢,所以需要一个连接超时变量来处理。
    :param user_agent: 用户代理信息,可以自定义是爬虫还是模拟用户
    :param proxy: ip代理(http代理),访问某些国外网站的时候需要用到。必须是双元素元组或列表(‘ip:端口’,‘http/https’)
    :param num_retries: 失败重试次数
    :return: HTML网页源码
    """
    headers = {'User-Agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    if proxy:  # If a proxy was given, attach it via set_proxy
        proxy_host, proxy_type = proxy  # unpack the proxy tuple
        request.set_proxy(proxy_host, proxy_type)
    print('Downloading:', url)
    try:
        # Open the page and read its content into html
        html = urllib.request.urlopen(request,
                                      timeout=timeout).read().decode('utf-8')
    except urllib.error.URLError as err:
        print('Download error:', err.reason)
        html = None  # On error nothing was fetched, so fall back to None
        if num_retries > 0:
            if hasattr(err, 'code') and 500 <= err.code <= 600:
                return download(url, timeout, user_agent, proxy,
                                num_retries - 1)
    return html
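A quick usage sketch for the function above; the URL and proxy tuple are placeholders:

html = download('http://example.com/', proxy=('127.0.0.1:8000', 'http'))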
Example #7
def do_check(url):
    check_proxy = True
    try:
        if settings.VERBOSITY_LEVEL >= 1:
            info_msg = "Setting the HTTP proxy for all HTTP requests... "
            print((settings.print_info_msg(info_msg)))
        # Check if defined POST data
        if menu.options.data:
            request = urllib.request.Request(url, menu.options.data)
        else:
            request = urllib.request.Request(url)
        # Check if defined extra headers.
        headers.do_check(request)
        request.set_proxy(menu.options.proxy, settings.PROXY_SCHEME)
        try:
            check = urllib.request.urlopen(request)
        except urllib.error.URLError as error:
            check = error
    except:
        check_proxy = False
    if not check_proxy:
        err_msg = "Unable to connect to the target URL or proxy ("
        err_msg += menu.options.proxy
        err_msg += ")."
        print((settings.print_critical_msg(err_msg)))
        raise SystemExit()
Example #8
def get_html(url, proxy=None):
    ua = UserAgent()
    header = ua.random

    try:
        request = urllib.request.Request(url)
        request.add_header("User-Agent", header)
        if proxy:
            print("Using proxy:", proxy)
            request.set_proxy(proxy, 'http')
        html = urllib.request.urlopen(request).read()
        return html
    except urllib.error.HTTPError as e:
        print("Error accessing:", url)
        print(e)
        if e.code == 503 and b'CaptchaRedirect' in e.read():
            print("Google is requiring a Captcha. "
                  "For more information check: 'https://support.google.com/websearch/answer/86640'")
        if e.code == 503:
            sys.exit("503 Error: service is currently unavailable. Program will exit.")
        return None
    except Exception as e:
        print("Error accessing:", url)
        print(e)
        return None
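One caveat worth noting here: HTTPError doubles as a response object, and e.read() consumes the body, so a second read returns b"". Caching the body once is safer; a small sketch:

import urllib.request
import urllib.error

try:
    urllib.request.urlopen("http://example.com/missing")
except urllib.error.HTTPError as e:
    body = e.read()  # read once; a second e.read() would return b""
    print(e.code, len(body))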
Example #9
def process_url(url):
    global total_requests_made
    global proxy_index
    global proxy
    global enable_proxy
    total_requests_made += 1

    if enable_proxy:
        # Every 10 requests, generate a new proxy
        if total_requests_made % 10 == 0:
            proxy_index = random_proxy()
            proxy = proxies[proxy_index]

        for _ in range(0,4):
            try:
                user_agent = random.choice(user_agent_list)
                request = urllib.request.Request(url,headers={'User-Agent': user_agent})
                request.set_proxy(proxy['ip'] + ':' + proxy['port'], 'http')
                print(proxy)
                response = urllib.request.urlopen(request)
                break
            except:
                del proxies[proxy_index]
                proxy_index = random_proxy()
                proxy = proxies[proxy_index]
    else:
        user_agent = random.choice(user_agent_list)
        request = urllib.request.Request(url,headers={'User-Agent': user_agent})
        response = urllib.request.urlopen(request)

    html = response.read()
    soup = bs.BeautifulSoup(html, 'lxml')
    campaign_title = soup.find('h1', {'class': ['a-campaign-title']}).text.strip()
    raised_vs_goal = soup.find('h2', {'class': ['m-progress-meter-heading']}).text.strip()
    campaign_story = soup.find('div', {'class': ['o-campaign-story']}).text.strip().replace("\n", " ")

    cover_image = soup.find('div', {'class': ['a-image']}).attrs['style']
    cover_image_url = cover_image[cover_image.find("(") + 1:cover_image.find(")")]

    story_images = soup.find_all('img', {'class': ['not-resizable']})
    story_image_urls = [story_image.attrs['src'].strip() for story_image in story_images]
    num_story_images = len(story_image_urls)

    creation_date = soup.find('span', {'class': ['m-campaign-byline-created']}).text.strip()
    byline = soup.find('div', {'class': ['m-campaign-byline-description']}).text.strip()
    campaign_type = soup.find('a', {'class': ['m-campaign-byline-type']}).text.strip()

    return (url.strip(), campaign_title, campaign_story, cover_image_url.strip(),
            story_image_urls, num_story_images, creation_date, byline,
            campaign_type, raised_vs_goal)
Example #10
 def run(self):
     request = urllib.request.Request(self.link, headers=self.headers)
     if self.host:
         request.set_proxy(self.host, "https")
     try:
         with urllib.request.urlopen(request, timeout=self.timeout) as uin:
             if self.verbose:
                 print(uin.read())
     except:
         pass
Example #11
def extract_video(video_id: str, quality: str = None):
    url = "https://www.cda.pl/video/" + video_id + ("?wersja=" +
                                                    quality if quality else "")

    # Trying to avoid as much trouble as possible by "mocking" a real browser request
    request = urllib.request.Request(
        url,
        headers={
            "Referer": "http://www.cda.pl",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0"
        })

    # Set proxy to avoid crappy CDNs
    if HTTP_PROXY:
        request.set_proxy(HTTP_PROXY, "http")

    try:
        response = urllib.request.urlopen(request).read()
    except HTTPError as e:
        response = e.read()
    except URLError as e:
        raise e

    # Parse HTML using BeautifulSoup4
    bs4 = BeautifulSoup(response, "html.parser")
    body = bs4.find("body")

    for tag in body.find_all(text=True):
        if tag.string == "Ten film jest dostępny dla użytkowników premium":
            raise PremiumOnlyError()
        elif tag.string == "Materiał został usunięty!":
            raise FileDeletedError()

    # Parse list of available video quality
    quality_list = [
        tag.string for tag in body.find_all("a", {"class": "quality-btn"})
    ]

    if quality and quality not in quality_list:
        raise QualityError()

    title = body.find("span", {"class": "title-name"}).get_text()
    player_data = json.loads(
        body.find("div", {"player_data": True})["player_data"])

    return {
        "title": title,
        "src": "https://" + decrypt_file(unquote(player_data["video"]["file"])) + ".mp4"
    }
Example #12
 def run(self):
     request = urllib.request.Request(self.link, headers=self.headers)
     if self.host:
         request.set_proxy(self.host, "http")
     try:
         with urllib.request.urlopen(request, timeout=self.timeout) as uin:
             uin.read()
         if self.verbose:
             print("{} {} OK".format(self.link, self.host))
     except urllib.error.URLError as message:
         if self.verbose:
             print("{} {} {}".format(self.link, self.host, message))
Example #13
def __post_text(incomming_webhook_url, text):
    data = json.dumps({"text": text}).encode("utf-8")
    headers = {"Content-Type" : "application/json"}
    request = urllib.request.Request(incomming_webhook_url, data, method="POST", headers=headers)

    # For debug purpose
    if os.getenv("HTTPS_PROXY"):
        request.set_proxy(os.getenv("HTTPS_PROXY"), "https")
    
    with urllib.request.urlopen(request) as response:
        response_body = response.read().decode("utf-8")
            
    return __response_200(response_body)
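Instead of reading HTTPS_PROXY by hand, the stdlib helper urllib.request.getproxies() collects proxy settings from the environment. A sketch (set_proxy() expects "host:port", so any scheme prefix is stripped):

import urllib.request

proxies = urllib.request.getproxies()  # reads HTTP_PROXY, HTTPS_PROXY, etc.
request = urllib.request.Request("https://example.com/")
if "https" in proxies:
    # Drop a leading "http://" if present before handing it to set_proxy().
    request.set_proxy(proxies["https"].split("://")[-1], "https")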
Example #14
 def get(self):
     """
     Get a url and return a redeable object with the raw html retrived
     """
     request = urllib.request.Request(self.url)
     if self.proxy:
         request.set_proxy(self.proxy, 'http')
     logger.info("Attempt to do GET request: url=%s, proxy=%s", self.url,
                 self.proxy)
     response = urllib.request.urlopen(request)
     logger.info("GET request was successful: url=%s, proxy=%s", self.url,
                 self.proxy)
     return response
Example #15
def get(api_url,
        headers={'Content-Type': 'application/json'},  # NB: mutable default argument; safe here because it is never mutated
        proxy: str = '',
        max_retries=5):
    response = None
    retries = 0
    request = urllib.request.Request(api_url, headers=headers)
    if proxy != '':
        request.set_proxy(proxy, 'http')
    while retries < max_retries:
        time.sleep(1)  # give the api a rest
        try:
            with urllib.request.urlopen(request) as response:
                response_content = response.read()
                if response.status == 200:
                    retries = 0
                    break
        except:
            log_msg = {
                'timestamp': str(datetime.datetime.now().isoformat()),
                'event': 'connection_error',
                'url': api_url,
                'retries': retries,
            }
            print(json.dumps(log_msg), file=sys.stderr)
            retries += 1

    if response and response.status == 200:
        return response_content
    elif response and response.status == 401:
        log_msg = {
            'timestamp': str(datetime.datetime.now().isoformat()),
            'event': 'connection_error',
            'url': api_url,
            'message': 'Unauthorized',
        }
        print(json.dumps(log_msg), file=sys.stderr)
        return None
    else:
        if retries == max_retries:
            log_msg = {
                'timestamp': str(datetime.datetime.now().isoformat()),
                'event': 'max_retries',
                'url': api_url,
            }
            print(json.dumps(log_msg), file=sys.stderr)
        return None
Example #16
def httpRequest(url, proxy=None):
    try:
        ret = None
        SockFile = None
        request = Request(url)
        request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322)')
        request.add_header('Pragma', 'no-cache')
        if proxy:
            request.set_proxy(proxy, 'http')
        opener = urllib.request.build_opener()
        SockFile = opener.open(request)
        ret = SockFile.read().decode('utf-8')
    finally:
        if SockFile:
            SockFile.close()

    return ret
Example #17
    def _warcprox_write_record(self, warcprox_address, url, warc_type, content_type, payload, extra_headers=None):
        headers = {"Content-Type":content_type,"WARC-Type":warc_type,"Host":"N/A"}
        if extra_headers:
            headers.update(extra_headers)
        request = urllib.request.Request(url, method="WARCPROX_WRITE_RECORD",
                headers=headers, data=payload)

        # XXX setting request.type="http" is a hack to stop urllib from trying
        # to tunnel if url is https
        request.type = "http"
        request.set_proxy(warcprox_address, "http")

        try:
            with urllib.request.urlopen(request) as response:
                if response.status != 204:
                    self.logger.warning("""got "{} {}" response on warcprox WARCPROX_WRITE_RECORD request (expected 204)""".format(response.status, response.reason))
        except urllib.error.HTTPError as e:
            self.logger.warning("""got "{} {}" response on warcprox WARCPROX_WRITE_RECORD request (expected 204)""".format(e.getcode(), e.info()))
Example #18
def start(url, usr, pwd, timeout, ua, proxy):
    form = {"log": usr, "pwd": pwd}
    url = urllib.parse.urljoin(url, "/wp-login.php/")
    form = urllib.parse.urlencode(form).encode()
    try:
        request = urllib.request.Request(url,
                                         data=form,
                                         headers={"User-Agent": ua})
        if proxy != "":
            request.set_proxy(proxy, "http")

        with urllib.request.urlopen(request, timeout=timeout) as respond:
            if re.search("wp-admin", respond.url):
                return pwd
    except urllib.error.URLError as err:
        raise Exception(
            "no address associated with hostname, please check your target URL or proxy host"
        )
    except Exception as err:
        raise Exception(err)
Example #19
File: web.py Project: sudokode/Limnoria
def getUrlFd(url, headers=None, data=None, timeout=None):
    """getUrlFd(url, headers=None, data=None, timeout=None)

    Opens the given url and returns a file object.  Headers and data are
    a dict and string, respectively, as per urllib.request.Request's
    arguments."""
    if headers is None:
        headers = defaultHeaders
    if minisix.PY3 and isinstance(data, str):
        data = data.encode()
    try:
        if not isinstance(url, Request):
            (scheme, loc, path, query, frag) = urlsplit(url)
            (user, host) = splituser(loc)
            url = urlunsplit((scheme, host, path, query, ''))
            request = Request(url, headers=headers, data=data)
            if user:
                request.add_header('Authorization',
                                   'Basic %s' % base64.b64encode(user))
        else:
            request = url
            request.add_data(data)  # NB: Request.add_data() exists only on Python 2; it was removed in Python 3
        httpProxy = force(proxy)
        if httpProxy:
            request.set_proxy(httpProxy, 'http')
        fd = urlopen(request, timeout=timeout)
        return fd
    except socket.timeout as e:
        raise Error(TIMED_OUT)
    except sockerrors as e:
        raise Error(strError(e))
    except InvalidURL as e:
        raise Error('Invalid URL: %s' % e)
    except HTTPError as e:
        raise Error(strError(e))
    except URLError as e:
        raise Error(strError(e.reason))
    # Raised when urllib doesn't recognize the url type
    except ValueError as e:
        raise Error(strError(e))
Example #20
def use_proxy(request):
    headers.do_check(request)
    request.set_proxy(menu.options.proxy, settings.PROXY_SCHEME)
    try:
        response = urlopen(request)
        return response

    except httplib.BadStatusLine as e:
        err_msg = "Unable to connect to the target URL or proxy ("
        err_msg += menu.options.proxy
        err_msg += ")."
        print((settings.print_critical_msg(err_msg)))
        raise SystemExit()

    except Exception as err_msg:
        try:
            error_msg = str(err_msg.args[0]).split("] ")[1] + "."
        except IndexError:
            error_msg = str(err_msg).replace(": ", " (") + ")."
        print((settings.print_critical_msg(error_msg)))
        raise SystemExit()


# eof
Example #21
def readList(url):
    do = True
    sleepSecond = 1
    
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    
    ips = []
    ports = []

    while do:
        time.sleep(sleepSecond)
        try:
            request = urllib.request.Request(url, headers=headers)
            response = urllib.request.urlopen(request)
            html = response.read()
            html = html.decode('utf-8')
            soup = BeautifulSoup(html, 'html.parser')
            ul = soup.find(attrs={"id":"ip_list"})
            trs = ul.findAll('tr', attrs={"class":"odd"})
            trs2 = ul.findAll('tr', attrs={"class":""})
            trs.extend(trs2)
            for tr in trs:
                tds = tr.findAll('td')
                now = 0
                for td in tds:
                    if (now == 1):
                        ip = td.getText()
                    if (now == 2):
                        port = td.getText()
                    now = now + 1
                
                print ("ip = %s, port = %s" % (ip, port))
                proxy = '%s:%s' % (ip, port)
                
                request = urllib.request.Request("http://m.weibo.cn/api/statuses/repostTimeline?id=4086433809517314&page=20", headers = headers)
                request.set_proxy(proxy, 'http')
                ok = 0
                cnt = 5
                for i in range(1, cnt):
                    ok = ok + 1
                    if (ok < i):
                        break
                    try:
                        # Read the reply of the proxied request itself, not the earlier list-page response
                        check_response = urllib.request.urlopen(request, timeout=1)
                        jsonBytes = check_response.read()
                        jsonString = jsonBytes.decode('utf8')
                        json.loads(jsonString)
                    except:
                        ok = ok - 1
                if (ok >= cnt - 1):
                    print("check success %d ==========================================================" % (ok))
                    ips.append(ip)
                    ports.append(port)
                else:
                    print("check fail %d" % (ok))
        except URLError as e:
            print(e)
        else:
            do = False
    return (ips, ports)
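The loop above interleaves scraping and proxy testing; a compact liveness check distilled from it, with a placeholder test URL:

import urllib.request

def proxy_alive(proxy, test_url="http://example.com/", timeout=3):
    # Route one request through the candidate proxy and see whether it answers.
    request = urllib.request.Request(test_url)
    request.set_proxy(proxy, "http")
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.status == 200
    except Exception:
        return False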
Example #22
homework_scores = {}

too_fast_submissions = {}

all_one_shot_submissions = []

for handle in handles:
    url = BASE_URL + '?lang=ru&handle={}&from=1&count=1000'.format(handle)
    print(url)
    connected = False
    while not connected:
        try:
            request = urllib.request.Request(url)
            if os.environ.get('PROXY'):
                print('setting proxy to {}'.format(os.environ['PROXY']))
                request.set_proxy(os.environ['PROXY'], 'https')
            response = urllib.request.urlopen(request)
            content = response.read()
            connected = True
        except OSError as err:
            print("OS Error: {}".format(err))
            connected = False

    with codecs.open('reports/{}.html'.format(handle), 'w', "utf-8") as file:
        file.write(HEADER)
        file.write(
            '<div class="container"><div class="row"><div class="col-md-12">')
        file.write(
            '<h2 class="page-header">Отчёт о выполнении домашнего задания <a href="https://codeforces.com/profile/{}">{}</a></h2><hr />'
            .format(handle, handle))
        response = json.loads(content)
Example #23
def send(action, last, lang):

    url = API_URL
    user = API_USER
    api_key = API_KEY

    if action == 'sample' or action == 'leaks':
        limit = BIG_DATA_LIMIT
    else:
        limit = LIMIT

    headers = {
        'Accept': 'application/json',
        'X-Auth-Login': user,
        'X-Auth-Key': api_key,
        'Connection': 'Keep-Alive',
        'Keep-Alive': 30
    }

    request_params = {
        "module": "get",
        "action": action,
        "limit": limit,
        "last": last
    }

    if lang is not None:
        request_params["lang"] = lang

    url = urllib.parse.urljoin(url,
                               '?' + urllib.parse.urlencode(request_params))

    log('>>Request: ' + url)

    #if WITH_PROXY:
    #    proxy_handler = '{"https": "https://127.0.0.1:3005"}'
    #    proxy_handler = json.loads(proxy_handler)
    #    proxy = urllib.request.ProxyHandler(proxy_handler)
    #    opener = urllib.request.build_opener(proxy)
    #    urllib.request.install_opener(opener)

    request = urllib.request.Request(url)

    if WITH_PROXY:
        proxy_host = PROXY_ADDRESS + ':' + PROXY_PORT
        request.set_proxy(proxy_host, PROXY_PROTOCOL)

    for key, value in headers.items():
        request.add_header(key, value)

    # NB: disables TLS certificate verification via a private ssl helper
    gcontext = ssl._create_unverified_context()
    handle = urllib.request.urlopen(request, context=gcontext)
    response = handle.read().decode('utf-8')

    result = json.loads(response)

    for_change_array = {
        "date_begin": "1971-01-01 00:00:00",
        "date_end": "1971-01-01 00:00:00",
        "date_detected": "1971-01-01 00:00:00",
        "advert_domain_registered": "1971-01-01 00:00:00",
        "date_registered": "1971-01-01",
        "date_expired": "1971-01-01",
        "date_published": "1971-01-01",
        "date_updated": "1971-01-01",
        "date_blocked": "1971-01-01 00:00:00",
        "date_not_before": "1971-01-01",
        "date_not_after": "1971-01-01",
        "date_compromised": "1971-01-01 00:00:00",
        "date_add": "1971-01-01 00:00:00",
        "date_incident": "1971-01-01 00:00:00",
        "operation_date": "1971-01-01 00:00:00",
        "date_publish": "1971-01-01 00:00:00",
        "date": "1971-01-01 00:00:00",
        "date_first_seen": "1971-01-01 00:00:00",
        "target_port": "0",
        "size": "0",
        "vt_all": "-",
        "vt_detected": "-",
        "phishing_kit_file_size": "0",
        "rows_count": "0"
    }

    for elem in result["data"]["new"]:
        for key, val in for_change_array.items():
            if key in elem["attrs"] and elem["attrs"][key] == None:
                elem["attrs"][key] = val

    try:
        error = result['error']
        raise Exception(error)
    except KeyError:
        pass

    try:
        count = result["data"]['count']
        last = result["data"]['last']
        limit = result["data"]['limit']
        count_new = len(result["data"]["new"])
        count_del = len(result["data"]["del"])
        log('<<Response param: count = {0}, last = {1}, limit = {2}, count new = {3}, count del = {4}'
            .format(count, last, limit, count_new, count_del))
    except KeyError:
        print('Bad response:' + response)

    return result