'ym': '',
    'neeaID': '',
    'cities': '',
    'citiesNames': '',
    'whichFirst': 'AS',
    'isFilter': 0,
    'isSearch': 1
}
QUERY_LIST = []
IS_LOGIN = False
WATCH_FLAG = False

POST_DATA = {'neeaID': '', 'pwd': ''}

CJ = CookieJar()
opener = build_opener(HTTPCookieProcessor(CJ), HTTPHandler)
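# Note: the module-level CJ/opener pair above is shared by every request in
# this script, so cookies set by a login POST are replayed automatically on
# later opener.open() calls. A hedged sketch of that usage (URLs hypothetical):
#
#     login_data = urlencode(POST_DATA).encode('utf-8')  # fill neeaID/pwd first
#     opener.open('https://example.com/login', login_data)
#     opener.open('https://example.com/query')  # session cookie sent automatically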


class Common(object):
    """global config object"""
    def __init__(self):
        """load config from __config__"""
        self.CONFIG = ConfigParser()
        self.CONFIG.read(os.path.join(os.getcwd(), __config__),
                         encoding='utf-8')

        self.MAIL_HOST = self.CONFIG.get('email', 'host')
        self.MAIL_USER = self.CONFIG.get('email', 'user')
        self.MAIL_PASS = self.CONFIG.get('email', 'pass')
        self.SENDER = self.CONFIG.get('email', 'user')
        self.RECEIVERS = self.CONFIG.get('email', 'receivers')
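For reference, the [email] section that Common reads from __config__ might look like this (all values illustrative):

[email]
host = smtp.example.com
user = sender@example.com
pass = app-password
receivers = you@example.com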
Example #2
    def get_new_cookie(self):
        # Start by prompting user to input their credentials

        # Another Python2/3 workaround
        try:
            new_username = raw_input("Username: ")
        except NameError:
            new_username = input("Username: ")
        new_password = getpass.getpass(prompt="Password (will not be displayed): ")

        # Build URS4 Cookie request
        auth_cookie_url = self.asf_urs4['url'] + '?client_id=' + self.asf_urs4['client'] + \
            '&redirect_uri=' + \
            self.asf_urs4['redir'] + '&response_type=code&state='

        try:
            # python2
            user_pass = base64.b64encode(bytes(new_username+":"+new_password))
        except TypeError:
            # python3
            user_pass = base64.b64encode(
                bytes(new_username+":"+new_password, "utf-8"))
            user_pass = user_pass.decode("utf-8")

        # Authenticate against URS, grab all the cookies
        self.cookie_jar = MozillaCookieJar()
        opener = build_opener(HTTPCookieProcessor(
            self.cookie_jar), HTTPHandler(), HTTPSHandler(**self.context))
        request = Request(auth_cookie_url, headers={
                          "Authorization": "Basic {0}".format(user_pass)})

        # Watch out cookie rejection!
        try:
            response = opener.open(request)
        except HTTPError as e:
            if "WWW-Authenticate" in e.headers and "Please enter your Earthdata Login credentials" in e.headers["WWW-Authenticate"]:
                print(
                    " > Username and Password combo was not successful. Please try again.")
                return False
            else:
                # If an error happens here, the user most likely has not confirmed EULA.
                print("\nIMPORTANT: There was an error obtaining a download cookie!")
                print(
                    "Your user appears to lack permission to download data from the ASF Datapool.")
                print("\n\nNew users: you must first log into Vertex and accept the EULA. In addition, your Study Area must be set at Earthdata https://urs.earthdata.nasa.gov")
                exit(-1)
        except URLError as e:
            print(
                "\nIMPORTANT: There was a problem communicating with URS, unable to obtain cookie. ")
            print("Try cookie generation later.")
            exit(-1)

        # Did we get a cookie?
        if self.check_cookie_is_logged_in(self.cookie_jar):
            # COOKIE SUCCESS!
            self.cookie_jar.save(self.cookie_jar_path)
            return True

        # if we aren't successful generating the cookie, nothing will work. Stop here!
        print("WARNING: Could not generate new cookie! Cannot proceed. Please try Username and Password again.")
        print("Response was {0}.".format(response.getcode()))
        print("\n\nNew users: you must first log into Vertex and accept the EULA. In addition, your Study Area must be set at Earthdata https://urs.earthdata.nasa.gov")
        exit(-1)
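On a later run, the jar saved above can be reloaded instead of re-authenticating. A minimal sketch, assuming Python 3 (cookielib.MozillaCookieJar on Python 2; the cookie-file path is illustrative):

from http.cookiejar import MozillaCookieJar
from urllib.request import build_opener, HTTPCookieProcessor

cookie_jar = MozillaCookieJar()
cookie_jar.load('.urs_cookies')  # hypothetical path; raises if missing or malformed
opener = build_opener(HTTPCookieProcessor(cookie_jar))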
Example #3
    def new_opener(self):
        from cookielib import CookieJar
        from urllib2 import build_opener, HTTPCookieProcessor
        return build_opener(HTTPCookieProcessor(CookieJar()))
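Each call returns a fresh opener wrapped around an empty jar, so cookie state never leaks between sessions. A hedged usage sketch (URLs hypothetical):

opener = self.new_opener()
opener.open('http://example.com/login')        # the server's Set-Cookie lands in this jar
page = opener.open('http://example.com/data')  # same jar, cookie replayed automatically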
Example #4
File: client.py  Project: 17Q/modules4all
def request(url,
            close=True,
            redirect=True,
            error=False,
            proxy=None,
            post=None,
            headers=None,
            mobile=False,
            XHR=False,
            limit=None,
            referer=None,
            cookie=None,
            compression=True,
            output='',
            timeout='30',
            ignoreSsl=False,
            flare=True,
            ignoreErrors=None):
    try:
        if url is None:
            return None

        handlers = []

        if proxy is not None:
            handlers += [ProxyHandler({'http': '%s' % (proxy)}), HTTPHandler]
            opener = build_opener(*handlers)
            opener = install_opener(opener)

        if output == 'cookie' or output == 'extended' or not close is True:
            cookies = cookielib.LWPCookieJar()
            handlers += [
                HTTPHandler(),
                HTTPSHandler(),
                HTTPCookieProcessor(cookies)
            ]
            opener = build_opener(*handlers)
            opener = install_opener(opener)

        if ignoreSsl or ((2, 7, 8) < sys.version_info < (2, 7, 12)):
            try:
                import ssl
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                handlers += [HTTPSHandler(context=ssl_context)]
                opener = build_opener(*handlers)
                opener = install_opener(opener)
            except:
                pass

        if url.startswith('//'):
            url = 'http:' + url

        try:
            headers.update(headers)  # no-op for a dict; raises so a None headers falls through to {}
        except:
            headers = {}

        if 'User-Agent' in headers:
            pass
        elif mobile is not True:
            # headers['User-Agent'] = agent()
            headers['User-Agent'] = cache.get(randomagent, 1)
        else:
            headers['User-Agent'] = 'Apple-iPhone/701.341'

        if 'Referer' in headers:
            pass
        elif referer is not None:
            headers['Referer'] = referer

        if 'Accept-Language' not in headers:
            headers['Accept-Language'] = 'en-US'

        if 'X-Requested-With' in headers:
            pass
        elif XHR is True:
            headers['X-Requested-With'] = 'XMLHttpRequest'

        if 'Cookie' in headers:
            pass
        elif cookie is not None:
            headers['Cookie'] = cookie

        if 'Accept-Encoding' in headers:
            pass
        elif compression and limit is None:
            headers['Accept-Encoding'] = 'gzip'

        if redirect is False:

            class NoRedirection(HTTPErrorProcessor):
                def http_response(self, request, response):
                    return response

            opener = build_opener(NoRedirection)
            opener = install_opener(opener)

            try:
                del headers['Referer']
            except:
                pass

        if isinstance(post, dict):
            # Gets rid of the error: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
            try:
                iter_items = post.iteritems()
            except:
                iter_items = post.items()
            for key, value in iter_items:
                try:
                    post[key] = value.encode('utf-8')
                except:
                    pass

            post = urlencode(post)

        request = Request(url, data=post)
        _add_request_header(request, headers)

        try:
            response = urlopen(request, timeout=int(timeout))
        except HTTPError as response:
            try:
                ignore = ignoreErrors and (int(response.code) == ignoreErrors
                                           or int(
                                               response.code) in ignoreErrors)
            except:
                ignore = False

            if not ignore:
                if response.code in [301, 307, 308, 503]:
                    cf_result = response.read(5242880)
                    try:
                        encoding = response.info().getheader(
                            'Content-Encoding')
                    except:
                        encoding = None

                    if encoding == 'gzip':
                        cf_result = gzip.GzipFile(
                            fileobj=StringIO(cf_result)).read()

                    if flare and 'cloudflare' in str(response.info()).lower():
                        log_utils.log(
                            'client module calling cfscrape: url=%s' % url,
                            log_utils.LOGDEBUG)
                        try:
                            from openscrapers.modules import cfscrape
                            if isinstance(post, dict):
                                data = post
                            else:
                                try:
                                    data = parse_qs(post)
                                except:
                                    data = None

                            scraper = cfscrape.CloudScraper()
                            response = scraper.request(
                                method='GET' if post is None else 'POST',
                                url=url,
                                headers=headers,
                                data=data,
                                timeout=int(timeout))
                            result = response.content
                            flare = 'cloudflare'  # Used below
                            try:
                                cookies = response.request._cookies
                            except:
                                log_utils.error()

                        except:
                            log_utils.error()

                    elif 'cf-browser-verification' in cf_result:
                        netloc = '%s://%s' % (urlparse(url).scheme,
                                              urlparse(url).netloc)
                        ua = headers['User-Agent']
                        cf = cache.get(cfcookie().get, 168, netloc, ua,
                                       timeout)
                        headers['Cookie'] = cf
                        request = Request(url, data=post)
                        _add_request_header(request, headers)
                        response = urlopen(request, timeout=int(timeout))
                    else:
                        log_utils.log(
                            'Request-Error (%s): %s' %
                            (str(response.code), url), log_utils.LOGDEBUG)
                        if error is False:
                            return
                else:
                    log_utils.log(
                        'Request-Error (%s): %s' % (str(response.code), url),
                        log_utils.LOGDEBUG)
                    if error is False:
                        return

        if output == 'cookie':
            try:
                result = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                result = cf
            except:
                pass
            if close is True:
                response.close()
            return result

        elif output == 'geturl':
            result = response.geturl()
            if close is True:
                response.close()
            return result

        elif output == 'headers':
            result = response.headers
            if close is True:
                response.close()
            return result

        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)
            if content < (2048 * 1024):
                return
            result = response.read(16 * 1024)
            if close is True:
                response.close()
            return result

        elif output == 'file_size':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = '0'
            response.close()
            return content

        if flare != 'cloudflare':
            if limit == '0':
                result = response.read(224 * 1024)
            elif limit is not None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

        try:
            encoding = response.info().getheader('Content-Encoding')
        except:
            encoding = None

        if encoding == 'gzip':
            result = gzip.GzipFile(fileobj=StringIO(result)).read()

        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)

            headers['Cookie'] = su

            request = Request(url, data=post)
            _add_request_header(request, headers)

            response = urlopen(request, timeout=int(timeout))

            if limit == '0':
                result = response.read(224 * 1024)
            elif limit is not None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)

            try:
                encoding = response.info().getheader('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO(result)).read()

        if 'Blazingfast.io' in result and 'xhr.open' in result:
            netloc = '%s://%s' % (urlparse(url).scheme, urlparse(url).netloc)
            ua = headers['User-Agent']
            headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua,
                                          timeout)
            result = _basic_request(url,
                                    headers=headers,
                                    post=post,
                                    timeout=timeout,
                                    limit=limit)

        if output == 'extended':
            try:
                response_headers = dict([(item[0].title(), item[1])
                                         for item in response.info().items()])
            except:
                response_headers = response.headers

            try:
                response_code = str(response.code)
            except:
                response_code = str(response.status_code
                                    )  # object from CFScrape Requests object.

            try:
                cookie = '; '.join(
                    ['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass

            try:
                cookie = cf
            except:
                pass

            if close is True:
                response.close()
            return (result, response_code, response_headers, headers, cookie)
        else:
            if close is True:
                response.close()
            return result

    except Exception as e:
        log_utils.log('Request-Error: (%s) => %s' % (str(e), url),
                      log_utils.LOGDEBUG)
        return
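A hedged usage sketch of the request() helper above (URL and form fields illustrative). With output='extended' the caller receives the five-tuple assembled at the end of the function:

# Plain fetch: returns up to the first 5 MB of the body.
html = request('http://example.com/page')

# POST with cookie capture.
result, code, resp_headers, req_headers, cookie = request(
    'http://example.com/login',
    post={'user': 'name', 'pwd': 'secret'},
    output='extended')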
Example #5
    def __init__(self, url, **kwargs):
        """
        Request init
        """
        self.request = None
        self.response = None
        self.code = -1
        self.header = {}
        self.cookieJar = None
        self.reason = ''
        self.content = ''
        self.content_dict = {}

        # Whether to decode the server's JSON response into a dict
        self.is_decode_response = kwargs.get('is_decode_response', False)

        data = kwargs.get('data', None)
        # For a GET request that also passes a data dict, post_type defaults to 'form': the dict is urlencoded and appended to the request URL
        post_type = kwargs.get('post_type', 'form')
        if data is not None:
            if isinstance(data, dict):
                if post_type == 'json':
                    data_str = json.dumps(data)
                else:
                    # data = {"name":"meetbill", "age":"21"}  ==> urlencode(data) = 'age=21&name=meetbill'
                    data_str = urlencode(data)
            else:
                # data was passed as a ready-made string
                data_str = data

            if not isinstance(data_str, basestring):
                raise ValueError('data must be string or dict')
        else:
            data_str = None

        request_type = kwargs.get('type', 'POST')
        if data_str and isinstance(
                request_type, basestring) and request_type.upper() != 'POST':
            # For a GET request, move the data into the URL itself
            url = '{}?{}'.format(url, data_str)
            data_str = None  # GET data must be None

        self.request = urlRequest(url, data_str)
        # Content-type defaults to 'application/x-www-form-urlencoded'
        if request_type.upper() == 'POST' and post_type == "json":
            self.request.add_header('Content-type', 'application/json')

        # referer
        referer = kwargs.get('referer', None)
        if referer:
            self.request.add_header('referer', referer)

        # user-agent
        user_agent = kwargs.get('user_agent', None)
        if user_agent:
            self.request.add_header('User-Agent', user_agent)

        # auth
        auth = kwargs.get('auth', None)
        if auth and isinstance(auth, dict) and 'usr' in auth:
            auth_string = base64.b64encode('{}:{}'.format(
                auth.get('usr', ''), auth.get('pwd', '')))
            self.request.add_header('Authorization',
                                    'Basic {}'.format(auth_string))

        # cookie
        cookie = kwargs.get('cookie', None)
        cj = None
        if cookie:
            if isinstance(cookie, CookieJar):
                cj = cookie
            elif isinstance(cookie, dict):
                result = []
                for k, v in cookie.items():
                    result.append('{}={}'.format(k, v))
                cookie = '; '.join(result)
            elif isinstance(cookie, Cookie.BaseCookie):
                cookie = cookie.output(header='')
            if isinstance(cookie, basestring):
                self.request.add_header('Cookie', cookie)

        if cj is None:
            cj = CookieJar()

        #! TODO: proxy

        # build opener
        debuglevel = 1 if kwargs.get('debug', False) else 0
        opener = build_opener(HTTPHandler(debuglevel=debuglevel),
                              HTTPSHandler(debuglevel=debuglevel),
                              HTTPCookieProcessor(cj))

        # timeout
        timeout = kwargs.get('timeout')
        if not isinstance(timeout, int):
            timeout = _DEFAULT_TIMEOUT

        t_beginning = time.time()
        try:
            # opener.open accepts a URL or a Request object:
            # a plain string is treated as a URL, anything else as an already-built Request
            self.response = opener.open(self.request, timeout=timeout)
            self.code = self.response.getcode()
            self.header = self.response.info().dict
            self.cookieJar = cj
            self.content = self.response.read()
            # Convert the response body into a dict
            if self.is_decode_response:
                self.content_dict = json.loads(self.content)

                # Check whether the response content matches what was expected
                check_key = kwargs.get('check_key', None)
                check_value = kwargs.get('check_value', None)
                if check_key is not None and check_value is not None:
                    # Check the type of check_value
                    if isinstance(check_value, list):
                        if self.content_dict[check_key] not in check_value:
                            self.code = -1
                            self.reason = "[response not match: {response_value} not in {check_value}]".format(
                                response_value=self.content_dict[check_key],
                                check_value=check_value)
                    elif self.content_dict[check_key] != check_value:
                        self.code = -1
                        self.reason = "[response not match: {response_value} != {check_value}]".format(
                            response_value=self.content_dict[check_key],
                            check_value=check_value)
        except HTTPError as e:
            self.code = e.code
            self.reason = '{}'.format(e)
        except URLError as e:
            self.code = -1
            self.reason = e.reason
        except Exception as e:
            self.code = -1
            self.reason = '{}'.format(e)

        seconds_passed = time.time() - t_beginning
        cost_str = "%.6f" % seconds_passed

        # Log the request
        f = inspect.currentframe().f_back
        file_name, lineno, func_name = self._get_backframe_info(f)

        log_msg = ("[file={file_name}:{func_name}:{lineno} "
                   "type=http_{method} "
                   "req_path={req_path} "
                   "req_data={req_data} "
                   "cost={cost} "
                   "is_success={is_success} "
                   "err_no={err_no} "
                   "err_msg={err_msg} "
                   "res_len={res_len} "
                   "res_data={res_data} "
                   "res_attr={res_attr}]".format(file_name=file_name,
                                                 func_name=func_name,
                                                 lineno=lineno,
                                                 method=request_type,
                                                 req_path=url,
                                                 req_data=data,
                                                 cost=cost_str,
                                                 is_success=self.success(),
                                                 err_no=self.code,
                                                 err_msg=self.reason,
                                                 res_len=len(self.content),
                                                 res_data=self.content,
                                                 res_attr=json.dumps(
                                                     self.header)))

        if self.success():
            log.info(log_msg)
        else:
            log.error(log_msg)
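A sketch of driving this constructor. The wrapper's class name is not shown above, so HttpRequest is assumed, and check_key/check_value refer to a hypothetical errno field; the kwargs match those read in __init__:

r = HttpRequest('http://example.com/api',
                data={'name': 'meetbill'},
                type='GET',               # data is urlencoded onto the URL
                is_decode_response=True,  # parse the JSON body into r.content_dict
                check_key='errno',        # hypothetical response field
                check_value=[0])          # r.code is forced to -1 on mismatch
if r.code == 200:
    print(r.content_dict)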
Example #6
File: regex.py  Project: 17Q/modules4all
def getRegexParsed(
        regexs,
        url,
        cookieJar=None,
        forCookieJarOnly=False,
        recursiveCall=False,
        cachedPages={},
        rawPost=False,
        cookie_jar_file=None):  # 0,1,2 = URL, regexOnly, CookieJarOnly
    # cachedPages = {}
    # print 'url',url
    doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url)
    #        print 'doRegexs',doRegexs,regexs
    setresolved = True
    for k in doRegexs:
        if k in regexs:
            # print 'processing ' ,k
            m = regexs[k]
            # print m
            cookieJarParam = False
            if 'cookiejar' in m:  # so either create or reuse existing jar
                # print 'cookiejar exists',m['cookiejar']
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    cookieJar = getRegexParsed(regexs, m['cookiejar'],
                                               cookieJar, True, True,
                                               cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            # print 'm[cookiejar]',m['cookiejar'],cookieJar
            if cookieJarParam:
                if cookieJar is None:
                    # print 'create cookie jar'
                    cookie_jar_file = None
                    if 'open[' in m['cookiejar']:
                        cookie_jar_file = m['cookiejar'].split(
                            'open[')[1].split(']')[0]
                    #                            print 'cookieJar from file name',cookie_jar_file

                    cookieJar = getCookieJar(cookie_jar_file)
                    #                        print 'cookieJar from file',cookieJar
                    if cookie_jar_file:
                        saveCookieJar(cookieJar, cookie_jar_file)
                    # import cookielib
                    # cookieJar = cookielib.LWPCookieJar()
                    # print 'cookieJar new',cookieJar
                elif 'save[' in m['cookiejar']:
                    cookie_jar_file = m['cookiejar'].split('save[')[1].split(
                        ']')[0]
                    complete_path = os.path.join(profile, cookie_jar_file)
                    #                        print 'complete_path',complete_path
                    saveCookieJar(cookieJar, cookie_jar_file)

            if m['page'] and '$doregex' in m['page']:
                pg = getRegexParsed(regexs,
                                    m['page'],
                                    cookieJar,
                                    recursiveCall=True,
                                    cachedPages=cachedPages)
                if len(pg) == 0:
                    pg = 'http://regexfailed'
                m['page'] = pg

            if 'setcookie' in m and m['setcookie'] and '$doregex' in m[
                    'setcookie']:
                m['setcookie'] = getRegexParsed(regexs,
                                                m['setcookie'],
                                                cookieJar,
                                                recursiveCall=True,
                                                cachedPages=cachedPages)
            if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m[
                    'appendcookie']:
                m['appendcookie'] = getRegexParsed(regexs,
                                                   m['appendcookie'],
                                                   cookieJar,
                                                   recursiveCall=True,
                                                   cachedPages=cachedPages)

            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs,
                                           m['post'],
                                           cookieJar,
                                           recursiveCall=True,
                                           cachedPages=cachedPages)
            #                    print 'post is now',m['post']

            if 'rawpost' in m and '$doregex' in m['rawpost']:
                m['rawpost'] = getRegexParsed(regexs,
                                              m['rawpost'],
                                              cookieJar,
                                              recursiveCall=True,
                                              cachedPages=cachedPages,
                                              rawPost=True)
                # print 'rawpost is now',m['rawpost']

            if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime$',
                                                    getEpocTime())

            if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime2$',
                                                    getEpocTime2())

            link = ''
            if m['page'] and m[
                    'page'] in cachedPages and not 'ignorecache' in m and forCookieJarOnly is False:
                # print 'using cache page',m['page']
                link = cachedPages[m['page']]
            else:
                if m['page'] and not m['page'] == '' and m['page'].startswith(
                        'http'):
                    if '$epoctime$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime$',
                                                      getEpocTime())
                    if '$epoctime2$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime2$',
                                                      getEpocTime2())

                    # print 'Ingoring Cache',m['page']
                    page_split = m['page'].split('|')
                    pageUrl = page_split[0]
                    header_in_page = None
                    if len(page_split) > 1:
                        header_in_page = page_split[1]

                    #                            if
                    #                            proxy = ProxyHandler({ ('https' ? proxytouse[:5]=="https":"http") : proxytouse})
                    #                            opener = build_opener(proxy)
                    #                            install_opener(opener)

                    #                        print 'getproxies',getproxies()
                    current_proxies = ProxyHandler(getproxies())

                    # print 'getting pageUrl',pageUrl
                    req = Request(pageUrl)
                    if 'proxy' in m:
                        proxytouse = m['proxy']
                        #                            print 'proxytouse',proxytouse
                        #                            getproxies= lambda: {}
                        if pageUrl[:5] == "https":
                            proxy = ProxyHandler({'https': proxytouse})
                            # req.set_proxy(proxytouse, 'https')
                        else:
                            proxy = ProxyHandler({'http': proxytouse})
                            # req.set_proxy(proxytouse, 'http')
                        opener = build_opener(proxy)
                        install_opener(opener)

                    req.add_header(
                        'User-Agent',
                        'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'
                    )
                    proxytouse = None

                    if 'referer' in m:
                        req.add_header('Referer', m['referer'])
                    if 'accept' in m:
                        req.add_header('Accept', m['accept'])
                    if 'agent' in m:
                        req.add_header('User-agent', m['agent'])
                    if 'x-req' in m:
                        req.add_header('X-Requested-With', m['x-req'])
                    if 'x-addr' in m:
                        req.add_header('x-addr', m['x-addr'])
                    if 'x-forward' in m:
                        req.add_header('X-Forwarded-For', m['x-forward'])
                    if 'setcookie' in m:
                        #                            print 'adding cookie',m['setcookie']
                        req.add_header('Cookie', m['setcookie'])
                    if 'appendcookie' in m:
                        #                            print 'appending cookie to cookiejar',m['appendcookie']
                        cookiestoApend = m['appendcookie']
                        cookiestoApend = cookiestoApend.split(';')
                        for h in cookiestoApend:
                            n, v = h.split('=')
                            w, n = n.split(':')
                            ck = cookielib.Cookie(version=0,
                                                  name=n,
                                                  value=v,
                                                  port=None,
                                                  port_specified=False,
                                                  domain=w,
                                                  domain_specified=False,
                                                  domain_initial_dot=False,
                                                  path='/',
                                                  path_specified=True,
                                                  secure=False,
                                                  expires=None,
                                                  discard=True,
                                                  comment=None,
                                                  comment_url=None,
                                                  rest={'HttpOnly': None},
                                                  rfc2109=False)
                            cookieJar.set_cookie(ck)
                    if 'origin' in m:
                        req.add_header('Origin', m['origin'])
                    if header_in_page:
                        header_in_page = header_in_page.split('&')
                        for h in header_in_page:
                            n, v = h.split('=')
                            req.add_header(n, v)

                    if cookieJar is not None:
                        #                            print 'cookieJarVal',cookieJar
                        cookie_handler = HTTPCookieProcessor(cookieJar)
                        opener = build_opener(cookie_handler,
                                              HTTPBasicAuthHandler(),
                                              HTTPHandler())
                        opener = install_opener(opener)
                        #                            print 'noredirect','noredirect' in m

                        if 'noredirect' in m:
                            opener = build_opener(cookie_handler,
                                                  NoRedirection,
                                                  HTTPBasicAuthHandler(),
                                                  HTTPHandler())
                            opener = install_opener(opener)
                    elif 'noredirect' in m:
                        opener = build_opener(NoRedirection,
                                              HTTPBasicAuthHandler(),
                                              HTTPHandler())
                        opener = install_opener(opener)

                    if 'connection' in m:
                        #                            print '..........................connection//////.',m['connection']
                        from keepalive import HTTPHandler
                        keepalive_handler = HTTPHandler()
                        opener = build_opener(keepalive_handler)
                        install_opener(opener)

                    # print 'after cookie jar'
                    post = None

                    if 'post' in m:
                        postData = m['post']
                        # if '$LiveStreamRecaptcha' in postData:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #        postData=postData.replace('$LiveStreamRecaptcha','manual_recaptcha_challenge_field:'+captcha_challenge+',recaptcha_response_field:'+catpcha_word+',id:'+idfield)
                        splitpost = postData.split(',')
                        post = {}
                        for p in splitpost:
                            n = p.split(':')[0]
                            v = p.split(':')[1]
                            post[n] = v
                        post = urlencode(post)

                    if 'rawpost' in m:
                        post = m['rawpost']
                        # if '$LiveStreamRecaptcha' in post:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #       post=post.replace('$LiveStreamRecaptcha','&manual_recaptcha_challenge_field='+captcha_challenge+'&recaptcha_response_field='+catpcha_word+'&id='+idfield)
                    link = ''
                    try:

                        if post:
                            response = urlopen(req, post)
                        else:
                            response = urlopen(req)
                        if response.info().get('Content-Encoding') == 'gzip':
                            from StringIO import StringIO
                            import gzip
                            buf = StringIO(response.read())
                            f = gzip.GzipFile(fileobj=buf)
                            link = f.read()
                        else:
                            link = response.read()

                        if 'proxy' in m and not current_proxies is None:
                            install_opener(build_opener(current_proxies))

                        link = javascriptUnEscape(link)
                        # print repr(link)
                        # print link This just print whole webpage in LOG
                        if 'includeheaders' in m:
                            # link+=str(response.headers.get('Set-Cookie'))
                            link += '$$HEADERS_START$$:'
                            for b in response.headers:
                                link += b + ':' + response.headers.get(
                                    b) + '\n'
                            link += '$$HEADERS_END$$:'
                        #                        print link

                        response.close()
                    except:
                        pass
                    cachedPages[m['page']] = link
                    # print link
                    # print 'store link for',m['page'],forCookieJarOnly

                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                elif m['page'] and not m['page'].startswith('http'):
                    if m['page'].startswith('$pyFunction:'):
                        val = doEval(m['page'].split('$pyFunction:')[1], '',
                                     cookieJar, m)
                        if forCookieJarOnly:
                            return cookieJar  # do nothing
                        link = val
                        link = javascriptUnEscape(link)
                    else:
                        link = m['page']

            if '$doregex' in m['expres']:
                m['expres'] = getRegexParsed(regexs,
                                             m['expres'],
                                             cookieJar,
                                             recursiveCall=True,
                                             cachedPages=cachedPages)

            if not m['expres'] == '':
                # print 'doing it ',m['expres']
                if '$LiveStreamCaptcha' in m['expres']:
                    val = askCaptcha(m, link, cookieJar)
                    # print 'url and val',url,val
                    url = url.replace("$doregex[" + k + "]", val)

                elif m['expres'].startswith(
                        '$pyFunction:') or '#$pyFunction' in m['expres']:
                    # print 'expeeeeeeeeeeeeeeeeeee',m['expres']
                    val = ''
                    if m['expres'].startswith('$pyFunction:'):
                        val = doEval(m['expres'].split('$pyFunction:')[1],
                                     link, cookieJar, m)
                    else:
                        val = doEvalFunction(m['expres'], link, cookieJar, m)
                    if 'ActivateWindow' in m['expres']: return
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        return listrepeat, eval(val), m, regexs, cookieJar

                    try:
                        url = url.replace(u"$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          val.decode("utf-8"))
                else:
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        ret = re.findall(m['expres'], link)
                        return listrepeat, ret, m, regexs

                    val = ''
                    if not link == '':
                        # print 'link',link
                        reg = re.compile(m['expres']).search(link)
                        try:
                            val = reg.group(1).strip()
                        except:
                            traceback.print_exc()
                    elif m['page'] == '' or m['page'] is None:
                        val = m['expres']

                    if rawPost:
                        #                            print 'rawpost'
                        val = quote_plus(val)
                    if 'htmlunescape' in m:
                        # val=unquote_plus(val)
                        try:
                            from HTMLParser import HTMLParser
                        except ImportError:
                            from html.parser import HTMLParser
                        val = HTMLParser().unescape(val)
                    try:
                        url = url.replace("$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          val.decode("utf-8"))
                    # print 'ur',url
                    # return val
            else:
                url = url.replace("$doregex[" + k + "]", '')
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if '$epoctime2$' in url:
        url = url.replace('$epoctime2$', getEpocTime2())

    if '$GUID$' in url:
        import uuid
        url = url.replace('$GUID$', str(uuid.uuid1()).upper())
    if '$get_cookies$' in url:
        url = url.replace('$get_cookies$', getCookiesString(cookieJar))

    if recursiveCall: return url
    # print 'final url',repr(url)
    if url == "":
        return
    else:
        return url, setresolved
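The m dicts processed above come from the regexs mapping passed in. A hedged sketch of one entry, using only keys the function actually checks (values illustrative):

regexs = {
    'token': {
        'page': 'http://example.com/player',  # fetched, honouring cookiejar/post/referer keys
        'expres': "var token='(.*?)'",        # group(1) replaces $doregex[token] in url
        'referer': 'http://example.com/',
        'cookiejar': '',                      # presence alone triggers cookie-jar handling
    }
}
url, setresolved = getRegexParsed(regexs, 'http://example.com/stream?t=$doregex[token]')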
Example #7
    def __init__(self):
        """ Initialisation de AdeConnectionUtil
		Cette méthode permet de construire le connecteur de site et le
		gestionnaire de cookie."""
        self.CJ = CookieJar()
        self.connection = build_opener(HTTPCookieProcessor(self.CJ))
Example #8
    def _get_cookie(self, netloc, ua, timeout):
        class NoRedirection(HTTPErrorProcessor):
            def http_response(self, request, response):
                return response

        def parseJSString(s):
            try:
                offset = 1 if s[0] == '+' else 0
                val = int(
                    eval(s.replace('!+[]', '1').replace('!![]', '1').replace('[]', '0').replace('(', 'str(')[offset:]))
                return val
            except:
                pass

        cookies = cookielib.LWPCookieJar()
        opener = build_opener(NoRedirection, HTTPCookieProcessor(cookies))
        opener.addheaders = [('User-Agent', ua)]
        try:
            response = opener.open(netloc, timeout=int(timeout))
            result = response.read()
        except HTTPError as response:
            result = response.read()
            try:
                encoding = response.info().getheader('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
        jschl = re.compile('name="jschl_vc" value="(.+?)"/>').findall(result)[0]
        init = re.compile('setTimeout\(function\(\){\s*.*?.*:(.*?)};').findall(result)[0]
        builder = re.compile(r"challenge-form\'\);\s*(.*)a.v").findall(result)[0]
        if '/' in init:
            init = init.split('/')
            decryptVal = parseJSString(init[0]) / float(parseJSString(init[1]))
        else:
            decryptVal = parseJSString(init)
        lines = builder.split(';')
        for line in lines:
            if len(line) > 0 and '=' in line:
                sections = line.split('=')
                if '/' in sections[1]:
                    subsecs = sections[1].split('/')
                    line_val = parseJSString(subsecs[0]) / float(parseJSString(subsecs[1]))
                else:
                    line_val = parseJSString(sections[1])
                decryptVal = float(eval('%.16f' % decryptVal + sections[0][-1] + '%.16f' % line_val))
        answer = float('%.10f' % decryptVal) + len(urlparse.urlparse(netloc).netloc)
        query = '%scdn-cgi/l/chk_jschl?jschl_vc=%s&jschl_answer=%s' % (netloc, jschl, answer)
        if 'type="hidden" name="pass"' in result:
            passval = re.findall('name="pass" value="(.*?)"', result)[0]
            query = '%scdn-cgi/l/chk_jschl?pass=%s&jschl_vc=%s&jschl_answer=%s' % (
                netloc, quote_plus(passval), jschl, answer)
            time.sleep(6)
        opener.addheaders = [('User-Agent', ua),
                             ('Referer', netloc),
                             ('Accept', 'text/html, application/xhtml+xml, application/xml, */*'),
                             ('Accept-Encoding', 'gzip, deflate')]
        response = opener.open(query)
        response.close()
        cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
        if 'cf_clearance' in cookie: self.cookie = cookie
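To make parseJSString concrete, here is a toy trace (input illustrative, not a real challenge). The '(' -> 'str(' rewrite makes digit groups concatenate the way the JavaScript challenge concatenates strings:

# s = '+((!+[]+!![]+!![])+(!+[]+!![]))'
# token replacement:  '+((1+1+1)+(1+1))'
# '(' -> 'str(':      'str(str(1+1+1)+str(1+1))'   (leading '+' dropped by offset)
# eval: str(1+1+1) + str(1+1) == '3' + '2' == '32', so the value parsed is 32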
Example #9
def request(url, close=True, redirect=True, error=False, verify=True, proxy=None, post=None, headers=None, mobile=False,
            XHR=False, limit=None, referer=None, cookie=None, compression=True, output='', timeout='30'):
    try:
        if not url:
            return
        handlers = []
        if proxy is not None:
            handlers += [ProxyHandler({'http': '%s' % (proxy)}), HTTPHandler]
            opener = build_opener(*handlers)
            opener = install_opener(opener)
        if output == 'cookie' or output == 'extended' or not close is True:
            cookies = cookielib.LWPCookieJar()
            handlers += [HTTPHandler(), HTTPSHandler(), HTTPCookieProcessor(cookies)]
            opener = build_opener(*handlers)
            opener = install_opener(opener)
        try:
            import platform
            node = platform.node().lower()
            is_XBOX = platform.uname()[1] == 'XboxOne'
        except Exception:
            node = ''
            is_XBOX = False
        if verify is False and sys.version_info >= (2, 7, 12):
            try:
                import ssl
                ssl_context = ssl._create_unverified_context()
                handlers += [HTTPSHandler(context=ssl_context)]
                opener = build_opener(*handlers)
                opener = install_opener(opener)
            except:
                pass
        if verify is True and ((2, 7, 8) < sys.version_info < (2, 7, 12) or is_XBOX):
            try:
                import ssl
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                handlers += [HTTPSHandler(context=ssl_context)]
                opener = build_opener(*handlers)
                opener = install_opener(opener)
            except:
                pass
        if url.startswith('//'): url = 'http:' + url
        _headers = {}
        try:
            _headers.update(headers)
        except:
            pass
        if 'User-Agent' in _headers:
            pass
        elif mobile is True:
            _headers['User-Agent'] = Database.get(randommobileagent, 1)
        else:
            _headers['User-Agent'] = Database.get(randomagent, 1)
        if 'Referer' in _headers:
            pass
        elif referer is not None:
            _headers['Referer'] = referer
        if not 'Accept-Language' in _headers:
            _headers['Accept-Language'] = 'en-US'
        if 'X-Requested-With' in _headers:
            pass
        elif XHR is True:
            _headers['X-Requested-With'] = 'XMLHttpRequest'
        if 'Cookie' in _headers:
            pass
        elif cookie is not None:
            _headers['Cookie'] = cookie
        if 'Accept-Encoding' in _headers:
            pass
        elif compression and limit is None:
            _headers['Accept-Encoding'] = 'gzip'
        if redirect is False:
            class NoRedirectHandler(urllib2.HTTPRedirectHandler):
                def http_error_302(self, req, fp, code, msg, headers):
                    infourl = urllib.addinfourl(fp, headers, req.get_full_url())
                    infourl.status = code
                    infourl.code = code
                    return infourl

                http_error_300 = http_error_302
                http_error_301 = http_error_302
                http_error_303 = http_error_302
                http_error_307 = http_error_302

            opener = urllib2.build_opener(NoRedirectHandler())
            opener = install_opener(opener)
            try:
                del _headers['Referer']
            except:
                pass
        if isinstance(post, dict):
            post = utils.byteify(post)
            post = urlencode(post)
        url = utils.byteify(url)
        request = Request(url, data=post)
        _add_request_header(request, _headers)
        try:
            response = urlopen(request, timeout=int(timeout))
        except HTTPError as response:
            if response.code == 503:
                cf_result = response.read(5242880)
                try:
                    encoding = response.info().getheader('Content-Encoding')
                except:
                    encoding = None
                if encoding == 'gzip':
                    cf_result = gzip.GzipFile(fileobj=StringIO(cf_result)).read()
                if 'cf-browser-verification' in cf_result:
                    while 'cf-browser-verification' in cf_result:
                        netloc = '%s://%s/' % (urlparse(url).scheme, urlparse(url).netloc)
                        ua = _headers['User-Agent']
                        cf = Database.get(cfcookie().get, 1, netloc, ua, timeout)
                        _headers['Cookie'] = cf
                        request = Request(url, data=post)
                        _add_request_header(request, _headers)
                        try:
                            response = urlopen(request, timeout=int(timeout))
                            cf_result = 'Success'
                        except HTTPError as response:
                            Database.remove(cfcookie().get, netloc, ua, timeout)
                            cf_result = response.read()
                else:
                    controlo.log('Request-Error (%s): %s' % (str(response.code), url))
                    if error is False:
                        return
            else:
                controlo.log('Request-Error (%s): %s' % (str(response.code), url))
                if error is False:
                    return
        if output == 'cookie':
            try:
                result = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                result = cf
            except:
                pass
            if close is True:
                response.close()
            return result
        elif output == 'geturl':
            result = response.geturl()
            if close is True: response.close()
            return result
        elif output == 'headers':
            result = response.headers
            if close is True: response.close()
            return result
        elif output == 'chunk':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = (2049 * 1024)
            if content < (2048 * 1024): return
            result = response.read(16 * 1024)
            if close is True: response.close()
            return result
        elif output == 'file_size':
            try:
                content = int(response.headers['Content-Length'])
            except:
                content = '0'
            response.close()
            return content
        if limit == '0':
            result = response.read(224 * 1024)
        elif limit is not None:
            result = response.read(int(limit) * 1024)
        else:
            result = response.read(5242880)
        try:
            encoding = response.info().getheader('Content-Encoding')
        except:
            encoding = None
        if encoding == 'gzip':
            result = gzip.GzipFile(fileobj=StringIO(result)).read()
        if 'sucuri_cloudproxy_js' in result:
            su = sucuri().get(result)
            _headers['Cookie'] = su
            request = Request(url, data=post)
            _add_request_header(request, _headers)
            response = urlopen(request, timeout=int(timeout))
            if limit == '0':
                result = response.read(224 * 1024)
            elif limit is not None:
                result = response.read(int(limit) * 1024)
            else:
                result = response.read(5242880)
            try:
                encoding = response.info().getheader('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO(result)).read()
        if 'Blazingfast.io' in result and 'xhr.open' in result:
            netloc = '%s://%s' % (urlparse(url).scheme, urlparse(url).netloc)
            ua = _headers['User-Agent']
            _headers['Cookie'] = Database.get(bfcookie().get, 168, netloc, ua, timeout)
            result = _basic_request(url, headers=_headers, post=post, timeout=timeout, limit=limit)
        if output == 'extended':
            try:
                response_headers = dict([(item[0].title(), item[1]) for item in response.info().items()])
            except:
                response_headers = response.headers
            response_code = str(response.code)
            try:
                cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
            except:
                pass
            try:
                cookie = cf
            except:
                pass
            if close is True: response.close()
            return (result, response_code, response_headers, _headers, cookie)
        else:
            if close is True: response.close()
            return result
    except Exception as e:
        controlo.log('Request-Error: (%s) => %s' % (str(e), url))
        return
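Compared with Example #4's ignoreSsl flag, this variant exposes certificate checking as an explicit verify parameter. A hedged call (URL hypothetical):

# Skip certificate verification for a host with a broken chain; per the checks
# above this builds an unverified ssl context on Python >= 2.7.12.
html = request('https://self-signed.example.com/', verify=False)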
Example #10
def crawl_author():
    """
  Crawls Google Scholar in order to retrieve information about an author.
  """

    # The ID of the author in Google Scholar.
    scholar_id = request.form['scholar_id']

    print 'Crawl author ' + scholar_id + '.'

    # Retrieve the author with that ID (if any).
    author = Author.query.filter_by(scholar_id=scholar_id).first()
    if author is None:
        author = Author()

    cookie_jar = CookieJar()
    opener = build_opener(HTTPCookieProcessor(cookie_jar))
    install_opener(opener)

    url = 'https://scholar.google.com/citations'
    params = urlencode({
        'hl': 'en',
        'view_op': 'list_works',
        'sortby': 'pubdate',
        'user': scholar_id,
        'cstart': 0,
        'pagesize': 20
    })
    req = Request(url + '?' + params)
    opener.open(req)
    res = opener.open(req)
    doc = html.parse(res)

    no_content = doc.xpath(
        './/div[contains(text(), "Sorry, no content found for this URL")]')
    if len(no_content):
        print 'Author ' + scholar_id + ' not found.'
        return 'Done.'

    author.scholar_id = scholar_id

    nname = doc.find('.//div[@id="gsc_prf_in"]')
    if nname is not None:

        # The name of the author.
        author.name = nname.text_content()

    nemaildomain = doc.find('.//div[@id="gsc_prf_ivh"]')
    if nemaildomain is not None:

        # The domain where the author has an email.
        author.email_domain = nemaildomain.text_content().split(
            " - ")[0].split()[-1]

    ncitations = doc.find('.//table[@id="gsc_rsb_st"]')
    if ncitations is not None:

        # The total citations for the author.
        author.total_citations = ncitations.xpath('.//tr[2]/td')[1].text

        # The h-index for the author.
        author.h_index = ncitations.xpath('.//tr[3]/td')[1].text

        # The i10-index for the author.
        author.i10_index = ncitations.xpath('.//tr[4]/td')[1].text

    params = urlencode({
        'hl': 'en',
        'view_op': 'citations_histogram',
        'user': scholar_id
    })
    req = Request(url + '?' + params)
    opener.open(req)
    res = opener.open(req)
    doc = html.parse(res)

    # The citations per year for the author.
    author_citations_per_year = []
    nhistogram = doc.find('.//div[@id="gsc_md_hist_b"]')
    if nhistogram is not None:
        years = [x.text for x in nhistogram.xpath('.//span[@class="gsc_g_t"]')]
        for a in nhistogram.xpath('.//a[@class="gsc_g_a"]'):
            i = int(a.get('style').split('z-index:')[1])
            year = int(years[-i])
            citations_per_year = AuthorCitationsPerYear.query.filter_by(
                author_id=author.id, year=year).first()
            if citations_per_year is None:
                citations_per_year = AuthorCitationsPerYear()
            citations_per_year.year = year
            citations_per_year.citations = int(
                a.xpath('./span[@class="gsc_g_al"]')[0].text)
            author_citations_per_year.append(citations_per_year)
    author.citations_per_year = author_citations_per_year

    params = urlencode({
        'hl': 'en',
        'view_op': 'list_colleagues',
        'user': scholar_id
    })
    req = Request(url + '?' + params)
    opener.open(req)
    res = opener.open(req)
    doc = html.parse(res)

    # The co-authors of the author.
    author_coauthors = []
    for a in doc.xpath('.//h3[@class="gsc_1usr_name"]//a'):
        co_scholar_id = a.get('href').split('user=')[1].split('&hl')[0]
        coauthor = Author.query.filter_by(scholar_id=co_scholar_id).first()
        if coauthor is None:
            coauthor = Author()
        coauthor.scholar_id = co_scholar_id
        author_coauthors.append(coauthor)
    author.coauthors = author_coauthors

    # The publications.
    author_publications = []
    cstart = 0
    pagesize = 100
    while True:
        params = urlencode({
            'hl': 'en',
            'view_op': 'list_works',
            'sortby': 'pubdate',
            'user': scholar_id,
            'cstart': cstart,
            'pagesize': pagesize
        })
        req = Request(url + '?' + params)
        opener.open(req)
        res = opener.open(req)
        doc = html.parse(res)

        for tr in doc.xpath('.//tr[@class="gsc_a_tr"]'):
            a = tr.find('.//td[@class="gsc_a_t"]//a')
            # NOTE: When there are no publications, there is a single tr.
            # <tr class="gsc_a_tr"><td class="gsc_a_e" colspan="3">There are no articles in this profile.</td></tr>
            if a is None:
                continue
            purl = a.get('href')

            # The ID of the publication in Google Scholar.
            pub_scholar_id = purl.split('citation_for_view=')[1]

            # Retrieve the publication with that ID (if any).
            publication = Publication.query.filter_by(
                scholar_id=pub_scholar_id).first()
            if publication is None:
                publication = Publication()
            publication.scholar_id = pub_scholar_id

            # The title of the publication.
            publication.title = a.text_content()

            pub_nyear = tr.find('.//td[@class="gsc_a_y"]//span')
            if pub_nyear is not None:
                year_of_publication = pub_nyear.text_content().strip()
                if year_of_publication:

                    # The year of the publication.
                    publication.year_of_publication = int(year_of_publication)

            pub_ncitations = tr.find('.//a[@class="gsc_a_ac"]')

            if pub_ncitations is not None:
                total_citations = pub_ncitations.text_content().strip()
                if total_citations:

                    # The total citations for the publication.
                    publication.total_citations = int(total_citations)

            author_publications.append(publication)

        if doc.xpath('.//button[@id="gsc_bpf_next"]')[0].get("disabled"):
            break

        cstart += pagesize
    author.publications = author_publications

    # When information about the author was retrieved from Google Scholar.
    author.retrieved_at = datetime.datetime.now()

    db.session.add(author)
    db.session.commit()

    print 'Crawled author ' + scholar_id + '.'
    return 'Done.'
Example #11
def crawl_publication():
    """
  Crawls Google Scholar in order to retrieve information about a publication.
  """

    # The ID of the publication in Google Scholar.
    scholar_id = request.form['scholar_id']

    print 'Crawl publication ' + scholar_id + '.'

    url = 'https://scholar.google.com/citations'

    publication = Publication.query.filter_by(scholar_id=scholar_id).first()
    if publication is None:
        publication = Publication()

    cookie_jar = CookieJar()
    opener = build_opener(HTTPCookieProcessor(cookie_jar))
    install_opener(opener)

    params = urlencode({
        'hl': 'en',
        'view_op': 'view_citation',
        'citation_for_view': scholar_id
    })
    req = Request(url + '?' + params)
    # the first open() only primes the cookie jar; the second fetches the page
    opener.open(req)
    res = opener.open(req)
    doc = html.parse(res)

    publication.scholar_id = scholar_id

    ntitle = doc.find('.//a[@class="gsc_title_link"]')
    if ntitle is not None:

        # The title of the publication.
        publication.title = ntitle.text_content()

    ntype = doc.find('.//div[@class="gs_scl"][3]//div[@class="gsc_field"]')
    if ntype is not None:

        # The type of the publication.
        publication.type = ntype.text_content()
        if publication.type == 'Description':
            publication.type = 'Other'

    nyear = doc.xpath(
        './/div[text()="Publication date"]/ancestor::div[@class="gs_scl"]//div[@class="gsc_value"]'
    )
    if nyear is not None and len(nyear):

        # The year of the publication.
        publication.year_of_publication = int(nyear[0].text.split('/')[0])

    ncitations = doc.xpath(
        './/div[text()="Total citations"]/ancestor::div[@class="gs_scl"]//div[@class="gsc_value"]//a'
    )
    if ncitations is not None and len(ncitations):

        # The total citations for the publication.
        publication.total_citations = int(ncitations[0].text.split(' ')[-1])

    nauthors = doc.xpath(
        './/div[text()="Authors"]/ancestor::div[@class="gs_scl"]//div[@class="gsc_value"]'
    )
    if nauthors is not None and len(nauthors):

        # The authors of the publication.
        publication.author_names = nauthors[0].text

    # The citations per year for the publication.
    publication_citations_per_year = []
    nhistogram = doc.find('.//div[@id="gsc_graph_bars"]')
    if nhistogram is not None:
        years = [x.text for x in nhistogram.xpath('.//span[@class="gsc_g_t"]')]
        for a in nhistogram.xpath('.//a[@class="gsc_g_a"]'):
            i = int(a.get('style').split('z-index:')[1])
            year = int(years[-i])
            citations_per_year = PublicationCitationsPerYear.query.filter_by(
                publication_id=publication.id, year=year).first()
            if citations_per_year is None:
                citations_per_year = PublicationCitationsPerYear()
            citations_per_year.year = year
            citations_per_year.citations = int(
                a.xpath('./span[@class="gsc_g_al"]')[0].text)
            publication_citations_per_year.append(citations_per_year)
    publication.citations_per_year = publication_citations_per_year

    # When information about the publication was retrieved from Google Scholar.
    publication.retrieved_at = datetime.datetime.now()

    db.session.add(publication)
    db.session.commit()

    print 'Crawled publication ' + scholar_id + '.'
    return 'Done.'
Example #12
info = rootLogger.info
warn = rootLogger.warning
debug = rootLogger.debug
error = rootLogger.error
log_exception = rootLogger.exception

# filepath constants
GAME_STORAGE_DIR = r'.'
COOKIES_FILENAME = r'gog-cookies.dat'
MANIFEST_FILENAME = r'gog-manifest.dat'
SERIAL_FILENAME = r'!serial.txt'
INFO_FILENAME = r'!info.txt'

# global web utilities
global_cookies = cookiejar.LWPCookieJar(COOKIES_FILENAME)
cookieproc = HTTPCookieProcessor(global_cookies)
opener = build_opener(cookieproc)
treebuilder = html5lib.treebuilders.getTreeBuilder('etree')
parser = html5lib.HTMLParser(tree=treebuilder, namespaceHTMLElements=False)

# GOG URLs
GOG_HOME_URL = r'https://www.gog.com'
GOG_ACCOUNT_URL = r'https://www.gog.com/account'
GOG_LOGIN_URL = r'https://login.gog.com/login_check'

# GOG Constants
GOG_MEDIA_TYPE_GAME = '1'
GOG_MEDIA_TYPE_MOVIE = '2'

# HTTP request settings
HTTP_FETCH_DELAY = 1  # in seconds
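# A minimal usage sketch (not part of the original module): how the shared
# opener, parser, and fetch delay above presumably fit together. The
# fetch_html() helper name is hypothetical.
import time

def fetch_html(url):
    """Politely fetch url with the cookie-aware opener and parse the HTML."""
    time.sleep(HTTP_FETCH_DELAY)  # throttle consecutive requests
    page = opener.open(url)
    try:
        return parser.parse(page)  # etree document built by html5lib
    finally:
        page.close()

# e.g. doc = fetch_html(GOG_ACCOUNT_URL)  # needs a valid cookie in gog-cookies.dat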
Example #13
 def __init__(self, user_agent=DEFAULT_USERAGENT, timeout=DEFAULT_TIMEOUT):
     self.cj = CookieJar()
     self.opener = build_opener(HTTPCookieProcessor(self.cj))
     self.urlopen = self.opener.open
     self.user_agent = user_agent
     self.timeout = timeout
Example #14
def _request(url, headers, post, cookies):
    log(url)
    url = quote_plus(url, safe='%/:?=&')
    if post:
        if sys.version_info[0] >= 3:  # for Python 3
            post = post.encode('utf-8')
        req = Request(url, post)
        log('########POST!')
    else:
        req = Request(url)
    if headers:
        for key in headers:
            req.add_header(key, headers[key])
        #req.add_header('Content-Type','application/json')
        # presumably a workaround: pretend Content-Length is already set so
        # the library does not add its own header for the POST body
        req.has_header = lambda header_name: (
            True if header_name == 'Content-Length' else Request.has_header(
                req, header_name))
    else:
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0'
        )
        req.add_header('Accept-Encoding', 'gzip, deflate')

    if cookies:
        cj = CookieJar()
        log(pathUserdata(''))
        log(pathUserdata('cookies.txt'))
        if not f_exists(pathUserdata('')):
            f_mkdir(pathUserdata(''))
        if f_exists(pathUserdata('cookies.txt')):
            cookies_txt = f_open(pathUserdata('cookies.txt'))
            if cookies_txt:
                if sys.version_info[0] >= 3:  # for Python 3
                    if isinstance(cookies_txt, str):
                        cookies_txt = cookies_txt.encode('utf-8')
                c = pickle.loads(cookies_txt)
                for cookie in c:
                    cj.set_cookie(cookie)
        opener = build_opener(HTTPCookieProcessor(cj))
        response = opener.open(req)

        c = []
        for cookie in cj:
            log(str(cookie))
            c.append(cookie)
        log(str(cj))

        f_write(pathUserdata('cookies.txt'), pickle.dumps(c))
        #cj.save(cookiefile)
    else:
        response = urlopen(req)

    compressed = response.info().get('Content-Encoding') == 'gzip'
    link = response.read()
    response.close()
    if compressed:
        if sys.version_info[0] < 3:
            buf = StringIO(link)
        else:
            buf = BytesIO(link)
        f = gzip.GzipFile(fileobj=buf)
        link = f.read()
    if sys.version_info[0] >= 3:  # for Python 3
        link = link.decode('utf-8')
    return link
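# Hypothetical usage of the helper above: with cookies=True the jar is loaded
# from, and pickled back to, cookies.txt in the add-on's userdata folder.
# page_source = _request('https://example.com/feed', headers=None, post=None, cookies=True)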
Example #15
File: conexion.py Project: sunzu/vot.ar
 def __get_http_opener(self):
     """ Devuelve una instancia del opener adecuado para interactuar vía
     https con client key y soporte de cookies """
     return build_opener(HTTPHandler(debuglevel=self.DEBUG_LEVEL),
                         HTTPCookieProcessor(self._cookiejar))
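 # The docstring mentions a client key that this fragment never configures;
 # one plausible way to wire it in (an assumption, not the project's code) is
 # an SSL context carrying the client certificate/key pair. Assumes
 # HTTPSHandler is imported alongside HTTPHandler.
 def __get_https_opener_with_client_key(self, certfile, keyfile):
     import ssl
     context = ssl.create_default_context()
     context.load_cert_chain(certfile=certfile, keyfile=keyfile)
     return build_opener(HTTPSHandler(context=context,
                                      debuglevel=self.DEBUG_LEVEL),
                         HTTPCookieProcessor(self._cookiejar))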
Example #16
def start(args):
    """Login and session handler
    """
    # create cookiejar
    args._cj = LWPCookieJar()

    # lets urllib handle cookies
    opener = build_opener(HTTPCookieProcessor(args._cj))
    opener.addheaders = [(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
    ), ("Accept-Encoding", "identity"), ("Accept", "*/*"),
                         ("Content-Type", "application/x-www-form-urlencoded"),
                         ("DNT", "1")]
    install_opener(opener)

    # load cookies
    try:
        args._cj.load(getCookiePath(args), ignore_discard=True)
    except IOError:
        # cookie file does not exist
        pass

    # get login information
    username = args._addon.getSetting("crunchyroll_username")
    password = args._addon.getSetting("crunchyroll_password")

    # session management
    if not (args._session_id and args._auth_token):
        # create new session
        payload = {
            "device_id": args._device_id,
            "device_type": API.DEVICE,
            "access_token": API.TOKEN
        }
        req = request(args, "start_session", payload, True)

        # check for error
        if req["error"]:
            return False
        args._session_id = req["data"]["session_id"]

        # make login
        payload = {"password": password, "account": username}
        req = request(args, "login", payload, True)

        # check for error
        if req["error"]:
            return False
        args._auth_token = req["data"]["auth"]
    if getattr(args, "_session_restart", False):
        # restart session
        payload = {
            "device_id": args._device_id,
            "device_type": API.DEVICE,
            "access_token": API.TOKEN,
            "auth": args._auth_token
        }
        req = request(args, "start_session", payload, True)

        # check for error
        if req["error"]:
            destroy(args)
            return False
        args._session_id = req["data"]["session_id"]
        args._auth_token = req["data"]["auth"]
        args._session_restart = False

    return True
Example #17
    def _request(self, chunk=None, info_request=False):
        """Do the request.

        Used for fetching information and for fetching data.

        chunk -- specifies which range (part) should be loaded.
        info_request -- specifies if only information should be fetched.
        """
        if self._response is not None:
            return self._response

        if self.url_parts.scheme == 'http':
            max_redirects = 0
            if info_request:
                # allow redirects only for info-requests
                max_redirects = self.source.max_redirects
            req = Request(self.url)

            cookie_processor = HTTPCookieProcessor()

            if self.source.cookie_objects is not None:
                # Use the cookies which were received by previous
                # (info-)requests.
                for cookie in self.source.cookie_objects:
                    cookie_processor.cookiejar.set_cookie(cookie)
            elif len(self.source.cookies) > 0 and info_request:
                # This is the first (info-)request where cookies are
                # used. Use user-defined cookies.
                fcres = FakeCookieResponse(self.source.cookies, self.url)
                cookie_processor.cookiejar.extract_cookies(fcres, req)

            if self.source.referrer != '':
                req.add_header('Referer', self.source.referrer)
            if self.source.user_agent != '':
                req.add_header('User-Agent', self.source.user_agent)

            if chunk is not None:
                start_offset = chunk.offset + chunk.loaded
                req.add_header('Range', 'bytes=' + str(start_offset) + '-')

            opener = build_opener(_LimitedHTTPRedirectHandler(max_redirects),
                                  cookie_processor)
            self._response = opener.open(req, timeout=self.source.timeout)

            if self.source.cookie_objects is None:
                # save cookie objects for later use (e.g. DataSlots)
                cookie_objects = []
                for cookie in cookie_processor.cookiejar:
                    cookie_objects.append(cookie)
                self.source.cookie_objects = cookie_objects

            return self._response

        elif self.url_parts.scheme == 'ftp':
            req = Request(self.url)
            if chunk is not None:
                start_offset = chunk.offset + chunk.loaded
                req.add_header('Offset', str(start_offset))
            opener = build_opener(FTPChunkHandler())
            self._response = opener.open(req, timeout=self.source.timeout)
            return self._response
        else:
            raise URLError('The protocol is not supported.')
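    # Only .offset and .loaded are read from the chunk argument above, so a
    # resumed request can be sketched with any object carrying those two
    # attributes (Chunk and the call below are illustrative, not project API):
    #
    #   from collections import namedtuple
    #   Chunk = namedtuple('Chunk', ['offset', 'loaded'])
    #   response = self._request(chunk=Chunk(offset=0, loaded=1048576))
    #   # sends "Range: bytes=1048576-" and resumes one MiB into the file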
Example #18
# content = "<p><b>Cookie</b></p><br />" + cookie + "<br />"


# # Write the response to a new HTML page
# with open('case06.html','w') as fileWriter:
#     fileWriter.write(content)

# url = "case06.html"

# # Open the response page in a new tab
# webbrowser.get('firefox').open_new_tab(url)

# Imports
from urllib2 import Request, build_opener, HTTPCookieProcessor, HTTPHandler
import cookielib

# CookieJar object to hold the cookies
cJar = cookielib.CookieJar()

# Open page
opener = build_opener(HTTPCookieProcessor(cJar), HTTPHandler())

# Request
req = Request("https://www.wsb.com/Assignment2/case06.php")
res = opener.open(req)

#Check out the cookies
print "Cookie\n"

for cookie in cJar:
    print cookie
Example #19
from urllib2 import HTTPCookieProcessor,build_opener
from cookielib import CookieJar,MozillaCookieJar

from redis_test import Redis


# 1. build a cookie jar backed by a file
# 2. create a cookie handler
# 3. build an opener
fileName = 'cookie.txt'
cookie = MozillaCookieJar(fileName)
handler = HTTPCookieProcessor(cookie)
opener = build_opener(handler)


response = opener.open("http://www.baidu.com")
for item in cookie:
    print 'Name = ' + item.name
    print 'Value = ' + item.value

cookie.save(ignore_discard=True, ignore_expires=True)
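# A later run can presumably restore the persisted cookies instead of
# requesting them anew; the counterpart load step would be:
restored = MozillaCookieJar(fileName)
restored.load(ignore_discard=True, ignore_expires=True)
opener = build_opener(HTTPCookieProcessor(restored))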
Example #20
def main():
    """
    Initializes and executes the program.
    """

    login_successful = []
    login_failed = []
    login_skipped = []

    version = check_revision(VERSION)

    print("%s\n\n%s %s (%s)\n" %
          (BANNER % tuple([color(_)
                           for _ in BANNER_PASSWORDS]), NAME, version, URL))

    args = parse_args()

    if args.update:
        update()
        exit()

    sites = list_sites()

    if args.list:
        for _ in sites:
            print("- %s" % _)
        exit()

    if not args.password and not args.load_file:
        args.password = getpass("%s Please enter password: " % INFO)

    if args.proxy:
        match = re.search(r"(?P<type>[^:]+)://(?P<address>[^:]+)"
                          r":(?P<port>\d+)", args.proxy, re.I)
        if match:
            if match.group("type").upper() in ("HTTP", "HTTPS"):
                proxy_host = "%s:%s" % (match.group("address"),
                                        match.group("port"))
                proxy_handler = ProxyHandler({
                    "http": proxy_host,
                    "https": proxy_host
                })
            else:
                from thirdparty.socks import socks
                if match.group("type").upper() == "SOCKS4":
                    socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS4,
                                          match.group("address"),
                                          int(match.group("port")), True)
                elif match.group("type").upper() == "SOCKS5":
                    socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5,
                                          match.group("address"),
                                          int(match.group("port")), True)
                proxy_handler = None
        else:
            proxy_handler = ProxyHandler()
    else:
        proxy_handler = None

    opener = build_opener(HTTPHandler(), HTTPSHandler(),
                          HTTPCookieProcessor(cookie_handler))
    if proxy_handler:
        opener.add_handler(proxy_handler)

    install_opener(opener)

    with open(USER_AGENTS_FILE, 'r') as ua_file:
        args.user_agent = sample(ua_file.readlines(), 1)[0].strip()

    if args.only:
        sites = [site for site in sites if site in args.only]
    elif args.exclude:
        sites = [site for site in sites if site not in args.exclude]

    print("%s Loaded %d %s to test." %
          (INFO, len(sites), "site" if len(sites) == 1 else "sites"))

    if args.load_file:
        if not isfile(args.load_file):
            print("%s could not find the file \"%s\"" %
                  (WARN, color(args.load_file)))
            exit()

        with open(args.load_file, "r") as load_file:
            _ = sum(1 for line in load_file)
        if _ < 1:
            print("%s the file \"%s\" doesn't contain any valid credentials." %
                  (WARN, color(args.load_file)))
            exit()

        print("%s Loaded %d credential%s from \"%s\".\n" %
              (INFO, _, "s" if _ != 1 else "", color(args.load_file)))

    print("%s Starting tests at: \"%s\"\n" % (INFO, color(strftime("%X"), BW)))

    if not exists(OUTPUT_DIR):
        makedirs(OUTPUT_DIR)

    log = Logger("%s/credmap" % OUTPUT_DIR)
    log.open()

    def get_targets():
        """
        Retrieve and yield list of sites (targets) for testing.
        """
        for site in sites:
            _ = populate_site(site, args)
            if not _:
                continue
            target = Website(_, {"verbose": args.verbose})

            if not target.user_agent:
                target.user_agent = args.user_agent

            yield target

    def login():
        """
        Verify credentials for login and check if login was successful.
        """
        if (target.username_or_email == "email" and not credentials["email"]
                or target.username_or_email == "username"
                and not credentials["username"]):
            if args.verbose:
                print(
                    "%s Skipping %s\"%s\" since "
                    "no \"%s\" was specified.\n" %
                    (INFO, "[%s:%s] on " %
                     (credentials["username"] or credentials["email"],
                      credentials["password"]) if args.load_file else "",
                     color(target.name), color(target.username_or_email, BW)))
            login_skipped.append(target.name)
            return

        print("%s Testing %s\"%s\"..." %
              (TEST, "[%s:%s] on " %
               (credentials["username"] or credentials["email"],
                credentials["password"]) if args.load_file else "",
               color(target.name, BW)))

        cookie_handler.clear()

        if target.perform_login(credentials, cookie_handler):
            log.write(">>> %s - %s:%s\n" %
                      (target.name, credentials["username"]
                       or credentials["email"], credentials["password"]))
            login_successful.append(
                "%s%s" % (target.name, " [%s:%s]" %
                          (credentials["username"] or credentials["email"],
                           credentials["password"]) if args.load_file else ""))
        else:
            login_failed.append(target.name)

    if args.load_file:
        if args.cred_format:
            separators = [
                re.escape(args.cred_format[1]),
                re.escape(args.cred_format[3])
                if len(args.cred_format) > 3 else "\n"
            ]
            cred_format = re.match(r"(u|e|p)[^upe](u|e|p)(?:[^upe](u|e|p))?",
                                   args.cred_format)
            if not cred_format:
                print("%s Could not parse --format: \"%s\"" %
                      (ERROR, color(args.cred_format, BW)))
                exit()

            cred_format = [
                v.replace("e", "email").replace("u", "username").replace(
                    "p", "password") for v in cred_format.groups()
                if v is not None
            ]

        with open(args.load_file, "r") as load_list:
            for user in load_list:
                if args.cred_format:
                    match = re.match(
                        r"([^{0}]+){0}([^{1}]+)(?:{1}([^\n]+))?".format(
                            separators[0], separators[1]), user)
                    credentials = dict(zip(cred_format, match.groups()))
                    credentials["password"] = quote(credentials["password"])
                    if ("email" in credentials and not re.match(
                            r"^[A-Za-z0-9._%+-]+@(?:[A-Z"
                            r"a-z0-9-]+\.)+[A-Za-z]{2,12}$",
                            credentials["email"])):
                        print("%s Specified e-mail \"%s\" does not appear "
                              "to be correct. Skipping...\n" %
                              (WARN, color(credentials["email"], BW)))
                        continue

                    if "email" not in credentials:
                        credentials["email"] = None
                    elif "username" not in credentials:
                        credentials["username"] = None
                else:
                    user = user.rstrip().split(":", 1)
                    if not user[0]:
                        if args.verbose:
                            print("%s Could not parse credentials: \"%s\"\n" %
                                  (WARN, color(user, BW)))
                        continue

                    match = re.match(
                        r"^[A-Za-z0-9._%+-]+@(?:[A-Z"
                        r"a-z0-9-]+\.)+[A-Za-z]{2,12}$", user[0])
                    credentials = {
                        "email": user[0] if match else None,
                        "username": None if match else user[0],
                        "password": quote(user[1])
                    }

                for target in get_targets():
                    login()
    else:
        credentials = {
            "username": args.username,
            "email": args.email,
            "password": quote(args.password)
        }
        for target in get_targets():
            login()

    log.close()

    if not args.verbose:
        print()

    if len(login_successful) > 0 or len(login_failed) > 0:
        _ = "%s/%s" % (color(len(login_successful), BW),
                       color(len(login_successful) + len(login_failed), BW))
        sign = PLUS if len(login_successful) > (len(login_failed) +
                                                len(login_skipped)) else INFO
        print("%s Successfully logged in%s." %
              (sign, " with %s credentials on the list" %
               _ if args.load_file else "to %s websites" % _))
        print("%s An overall success rate of %s.\n" %
              (sign,
               color("%s%%" % (100 * len(login_successful) //
                               (len(login_successful) + len(login_failed))),
                     BW)))

    if len(login_successful) > 0:
        print("%s The provided credentials worked on the following website%s: "
              "%s\n" % (PLUS, "s" if len(login_successful) != 1 else "",
                        ", ".join(login_successful)))

    print("%s Finished tests at: \"%s\"\n" % (INFO, color(strftime("%X"), BW)))
Example #21
File: client.py Project: 17Q/modules4all
    def get_cookie(self, netloc, ua, timeout):
        try:
            headers = {'User-Agent': ua}
            request = Request(netloc)
            _add_request_header(request, headers)

            try:
                response = urlopen(request, timeout=int(timeout))
            except HTTPError as response:  # Cloudflare serves the challenge page with an error status
                result = response.read(5242880)
                try:
                    encoding = response.info().getheader('Content-Encoding')
                except:
                    encoding = None
                if encoding == 'gzip':
                    result = gzip.GzipFile(fileobj=StringIO(result)).read()

            jschl = re.findall('name="jschl_vc" value="(.+?)"/>', result)[0]
            init = re.findall(r'setTimeout\(function\(\){\s*.*?.*:(.*?)};',
                              result)[-1]
            builder = re.findall(r"challenge-form\'\);\s*(.*)a.v", result)[0]
            decryptVal = self.parseJSString(init)
            lines = builder.split(';')

            for line in lines:
                if len(line) > 0 and '=' in line:
                    sections = line.split('=')
                    line_val = self.parseJSString(sections[1])
                    decryptVal = int(
                        eval(
                            str(decryptVal) + sections[0][-1] + str(line_val)))

            answer = decryptVal + len(urlparse(netloc).netloc)

            query = '%s/cdn-cgi/l/chk_jschl?jschl_vc=%s&jschl_answer=%s' % (
                netloc, jschl, answer)

            if 'type="hidden" name="pass"' in result:
                passval = re.findall('name="pass" value="(.*?)"', result)[0]
                query = '%s/cdn-cgi/l/chk_jschl?pass=%s&jschl_vc=%s&jschl_answer=%s' % (
                    netloc, quote_plus(passval), jschl, answer)
                time.sleep(6)

            cookies = cookielib.LWPCookieJar()
            handlers = [
                HTTPHandler(),
                HTTPSHandler(),
                HTTPCookieProcessor(cookies)
            ]
            opener = build_opener(*handlers)
            install_opener(opener)  # install_opener() returns None; no assignment needed

            try:
                request = Request(query)
                _add_request_header(request, headers)
                response = urlopen(request, timeout=int(timeout))
            except:
                pass

            cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])

            if 'cf_clearance' in cookie:
                self.cookie = cookie
        except:
            pass
Example #22
try:  # Python 2.X
    import urllib2
    from urllib2 import HTTPRedirectHandler, HTTPCookieProcessor
except ImportError:  # in case of Python 3.X
    import urllib.request as urllib2
    from urllib.request import HTTPRedirectHandler
    from urllib.request import HTTPCookieProcessor


class MyHTTPRedirectHandler(HTTPRedirectHandler):
    """Redirect handler that treats 301/303/307 exactly like 302."""

    def http_error_302(self, req, fp, code, msg, headers):
        return HTTPRedirectHandler.http_error_302(self, req, fp, code, msg,
                                                  headers)

    http_error_301 = http_error_303 = http_error_307 = http_error_302


cookieprocessor = HTTPCookieProcessor()

opener = urllib2.build_opener(MyHTTPRedirectHandler, cookieprocessor)
urllib2.install_opener(opener)


class rss(Scraper):
    '''
    Reades a generic RSS feed and proceeds if items not already in collection.
    Retrieves full HTML content from link provided in RSS feed
    Yields docs with keys from RSS entry plus full HTML source of linked content.

    Subclasses should probably overwrite the following functions:
        By overwriting the parsehtml function, more keys can be extracted
        By overwriting the getlink function, modifications to the link can be made, e.g. to bypass cookie walls
    '''
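# A hypothetical subclass along the lines the docstring suggests; the
# getlink/parsehtml names come from the docstring, while the signatures and
# bodies below are illustrative assumptions, not the project's API.
from lxml import html as lxml_html

class rss_print_view(rss):
    def getlink(self, link):
        # e.g. rewrite the article link so a cookie wall is skipped
        return link.replace('/article/', '/article/print/')

    def parsehtml(self, htmlsource):
        # e.g. extract one extra key from the fetched HTML source
        tree = lxml_html.fromstring(htmlsource)
        return {'byline': tree.findtext('.//*[@class="byline"]')}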
Example #23
    def add_file(self, release, filename, folder):
        """
		release: the release name
		filename: path where to find the file on HD
		folder: the folder to the SRR e.g. "Sample" without the /
		"""
        # http://stackoverflow.com/questions/680305/
        # using-multipartposthandler-to-post-form-data-with-python
        # Register the streaming http handlers with urllib2
        opener = register_openers()
        # https://bitbucket.org/chrisatlee/poster/issue/7/
        # multipart-form-post-doesnt-work-with
        if _PROXY:
            opener.add_handler(ProxyHandler({_PROXY_TYPE: _PROXY_URL}))

        # Start the multipart/form-data encoding of the file "DSC0001.jpg"
        # "image1" is the name of the parameter, which is normally set
        # via the "name" parameter of the HTML <input> tag.

        # Ensure file is Unicode:
        filename = filename.decode(sys.getfilesystemencoding())

        # new_headers contains the necessary Content-Type and Content-Length
        # datagen is a generator object that yields the encoded parameters
        datagen, new_headers = multipart_encode({
            "folder": folder,
            "MAX_FILE_SIZE": _MAX_FILE_SIZE,
            "file": open(filename, "rb"),
            "add": "Add",
        })
        headers = dict(self.headers)  # makes copy original dict
        headers.update(new_headers)
        url = self.baseurl + "release/add/" + release.replace(' ', '%20')
        request = Request(url, datagen, headers)
        opener.add_handler(HTTPCookieProcessor(self.cj))

        if folder != "":
            fn = folder + "/"
        else:
            fn = ""
        fn += os.path.basename(filename)

        # Actually do the request, and get the response
        try:
            handle = urllib2.urlopen(request)
            html_source = handle.read()

            # sre_constants.error: unbalanced parenthesis
            if len(re.findall(".*%s.*" % re.escape(fn), html_source)):
                print("'%s' successfully uploaded." % fn)
                # also gives this result if it was already there in the first place
                success = True

    #		elif len(re.findall(".*an error occurred while adding the file.*",
    #							html_source)):
    #			print("!!! '%s': file already added." % fn)
    #			success = False
            elif len(
                    re.findall("You were redirected to this page",
                               html_source)):
                # grab release name from top of details page
                match = re.search(".*RELEASE .*value=\"(.*)\".*", html_source)
                if match:
                    release = match.group(1)
                    print("??? Redirecting to '%s'." % release)
                    success = self.add_file(release, filename, folder)
                else:
                    print("!!! Error uploading file to '%s'." % release)
                    success = False
            else:
                print(html_source)
                print("The site has been changed.")
                success = False
                if "<html" not in html_source:
                    # keep retrying again in this case
                    raise httplib.HTTPException("No HTML received")
        except urllib2.HTTPError as e:
            if e.code == 404:
                print("!!! '%s': no such release." % release)
                success = False
            else:
                raise

        return success
Example #24
 def __init__(self, agent=_msie, cookies=True, handlers=None):
     # a mutable default ([]) would be shared across all instances and grow
     # an extra cookie processor on every construction; use None instead
     handlers = list(handlers) if handlers else []
     self.agent = agent
     if cookies:
         handlers.append(HTTPCookieProcessor(CookieJar()))
     self.opener = build_opener(*handlers)
Example #25
def error(content):
	log(content, xbmc.LOGERROR)

def debug(content):
	log(content, xbmc.LOGDEBUG)

def log(msg, level=xbmc.LOGNOTICE):
	msg = py2_enc(msg)
	xbmc.log("["+addon.getAddonInfo('id')+"-"+addon.getAddonInfo('version')+"]"+msg, level)

cookie = os.path.join(temp, 'cookie.jar')
cj = LWPCookieJar()

if xbmcvfs.exists(cookie):
	cj.load(cookie, ignore_discard=True, ignore_expires=True)

opener = build_opener(HTTPCookieProcessor(cj))
baseURL="https://www.anime-on-demand.de"


class Infowindow(pyxbmct.AddonDialogWindow):
	text=""
	pos=0
	image=""
	trailer=""
	starttext=""
	def __init__(self, text=''):
		self.ueberschrift=re.compile('<h1 style="margin: 0;">(.+?)</h1>', re.DOTALL).findall(text)[0]
		try:
			self.image= re.compile('class="newspic" src="(.+?)"', re.DOTALL).findall(text)[0]
			if self.image[:4] != "http":
				self.image = baseURL+self.image
Example #26
import csv
import json
import sys
from cookielib import CookieJar
from urllib2 import build_opener, HTTPCookieProcessor
from cStringIO import StringIO
from difflib import SequenceMatcher

spreadsheet_url = 'https://docs.google.com/spreadsheet/ccc?key=13bmt8pwh4x4GFTnoctxkxjKjsxDtYwwXbGS6ZEB-ik8&output=csv'
local_json_file = 'quiz.json'

opener = build_opener(HTTPCookieProcessor(CookieJar()))
resp = opener.open(spreadsheet_url)
data = resp.read()

res = []
for index, question in enumerate(csv.DictReader(StringIO(data))):
	# ids can be nonsequential
	if question['ID']:
		res.append({
			"ID": index,
			"question": question['Android Test Question'],
			"right": [i for i in question['Right Answer(s)'].split("\n") if i],
			"wrong": [i for i in question['Wrong Answer(s)'].split("\n") if i],
			"tags": [i.strip() for i in question['Question Tag'].split(",") if i],
			"docRef" : question["Reference Link"],
		})
		print([i.strip() for i in question['Question Tag'].split(",") if i])

# cannot import local modules like
# from checked_questions import reviewed
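# The snippet fills res and defines local_json_file but never writes the
# file; the intended final step is presumably a plain JSON dump:
with open(local_json_file, 'w') as out:
	json.dump(res, out, indent=2)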
Example #27
    def download_file_with_cookiejar(self, url, file_count, total, recursion=False):
        # see if we've already download this file and if it is that it is the correct size
        download_file = os.path.basename(url).split('?')[0]
        if os.path.isfile(download_file):
            try:
                request = Request(url)
                request.get_method = lambda: 'HEAD'
                response = urlopen(request, timeout=30)
                remote_size = self.get_total_size(response)
                # Check that we were able to derive a size.
                if remote_size:
                    local_size = os.path.getsize(download_file)
                    if remote_size < (local_size+(local_size*.01)) and remote_size > (local_size-(local_size*.01)):
                        print(" > Download file {0} exists! \n > Skipping download of {1}. ".format(
                            download_file, url))
                        return None, None
                    # partial file size wasn't full file size, lets blow away the chunk and start again
                    print(" > Found {0} but it wasn't fully downloaded. Removing file and downloading again.".format(
                        download_file))
                    os.remove(download_file)

            except ssl.CertificateError as e:
                print(" > ERROR: {0}".format(e))
                print(
                    " > Could not validate SSL Cert. You may be able to overcome this using the --insecure flag")
                return False, None

            except HTTPError as e:
                if e.code == 401:
                    print(
                        " > IMPORTANT: Your user may not have permission to download this type of data!")
                else:
                    print(
                        " > Unknown Error, Could not get file HEAD: {0}".format(e))

            except URLError as e:
                print("URL Error (from HEAD): {0}, {1}".format(e.reason, url))
                if "ssl.c" in "{0}".format(e.reason):
                    print(
                        "IMPORTANT: Remote location may not be accepting your SSL configuration. This is a terminal error.")
                return False, None

        # attempt https connection
        try:
            request = Request(url)
            response = urlopen(request, timeout=30)

            # Watch for redirect
            if response.geturl() != url:

                # See if we were redirected BACK to URS for re-auth.
                if 'https://urs.earthdata.nasa.gov/oauth/authorize' in response.geturl():

                    if recursion:
                        print(" > Entering seemingly endless auth loop. Aborting. ")
                        return False, None

                    # make this easier. If there is no app_type=401, add it
                    new_auth_url = response.geturl()
                    if "app_type" not in new_auth_url:
                        new_auth_url += "&app_type=401"

                    print(
                        " > While attempting to download {0}....".format(url))
                    print(" > Need to obtain new cookie from {0}".format(
                        new_auth_url))
                    old_cookies = [cookie.name for cookie in self.cookie_jar]
                    opener = build_opener(HTTPCookieProcessor(
                        self.cookie_jar), HTTPHandler(), HTTPSHandler(**self.context))
                    request = Request(new_auth_url)
                    try:
                        response = opener.open(request)
                        for cookie in self.cookie_jar:
                            if cookie.name not in old_cookies:
                                print(" > Saved new cookie: {0}".format(
                                    cookie.name))

                                # A little hack to save session cookies
                                if cookie.discard:
                                    cookie.expires = int(
                                        time.time()) + 60*60*24*30
                                    print(
                                        " > Saving session Cookie that should have been discarded! ")

                        self.cookie_jar.save(
                            self.cookie_jar_path, ignore_discard=True, ignore_expires=True)
                    except HTTPError as e:
                        print("HTTP Error: {0}, {1}".format(e.code, url))
                        return False, None

                    # Okay, now we have more cookies! Lets try again, recursively!
                    print(" > Attempting download again with new cookies!")
                    return self.download_file_with_cookiejar(url, file_count, total, recursion=True)

                print(
                    " > 'Temporary' Redirect download @ Remote archive:\n > {0}".format(response.geturl()))

            # seems to be working
            print("({0}/{1}) Downloading {2}".format(file_count, total, url))

            # Open our local file for writing and build status bar
            tf = tempfile.NamedTemporaryFile(mode='w+b', delete=False, dir='.')
            self.chunk_read(response, tf, report_hook=self.chunk_report)

            # Reset download status
            sys.stdout.write('\n')

            tempfile_name = tf.name
            tf.close()

        # handle errors
        except HTTPError as e:
            print("HTTP Error: {0}, {1}".format(e.code, url))

            if e.code == 401:
                print(
                    " > IMPORTANT: Your user does not have permission to download this type of data!")

            if e.code == 403:
                print(" > Got a 403 Error trying to download this file.  ")
                print(" > You MAY need to log in this app and agree to a EULA. ")

            return False, None

        except URLError as e:
            print("URL Error (from GET): {0}, {1}, {2}".format(
                e, e.reason, url))
            if "ssl.c" in "{0}".format(e.reason):
                print(
                    "IMPORTANT: Remote location may not be accepting your SSL configuration. This is a terminal error.")
            return False, None

        except socket.timeout as e:
            print(" > timeout requesting: {0}; {1}".format(url, e))
            return False, None

        except ssl.CertificateError as e:
            print(" > ERROR: {0}".format(e))
            print(
                " > Could not validate SSL Cert. You may be able to overcome this using the --insecure flag")
            return False, None

        # Return the file size
        shutil.copy(tempfile_name, download_file)
        os.remove(tempfile_name)
        file_size = self.get_total_size(response)
        actual_size = os.path.getsize(download_file)
        if file_size is None:
            # We were unable to calculate file size.
            file_size = actual_size
        return actual_size, file_size
Example #28
from subprocess import Popen, PIPE

from lxml import html

COOKIES_FILE = '/usr/local/etc/bandcamp.cookies'

URL = 'https://bandcamp.com'
CDN_COVERS = 'https://f4.bcbits.com/img'

cj = LWPCookieJar()

if os.path.isfile(COOKIES_FILE):
    cj.load(COOKIES_FILE)

handler = HTTPHandler(debuglevel=0)
opener = build_opener(handler, HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Enter your own user agent!'),
                     ('Accept', '*/*'), ('Accept-Encoding', 'deflate')]

TMP_PATH = ''
TMP_FILE_PREFIX = 'tmpS_'
queue = Queue()

# Download the cover, then embed it in the downloaded music file?
ADD_COVER = 1

# Keep the cover file?
KEEP_COVER_FILE = 0

# Infinite DL?
INFINITE_DL = 1
Example #29
    def check_cookie(self):

        if self.cookie_jar is None:
            print(" > Cookiejar is bunk: {0}".format(self.cookie_jar))
            return False

        # File we know is valid, used to validate cookie
        file_check = 'https://urs.earthdata.nasa.gov/profile'

        # Apply custom Redirect Handler
        opener = build_opener(HTTPCookieProcessor(self.cookie_jar),
                              HTTPHandler(), HTTPSHandler(**self.context))
        install_opener(opener)

        # Attempt a HEAD request
        request = Request(file_check)
        request.get_method = lambda: 'HEAD'
        try:
            print(" > attempting to download {0}".format(file_check))
            response = urlopen(request, timeout=30)
            resp_code = response.getcode()
            # Make sure we're logged in
            if not self.check_cookie_is_logged_in(self.cookie_jar):
                return False

            # Save cookiejar
            self.cookie_jar.save(self.cookie_jar_path)

        except HTTPError:
            # If we get this error, again, it likely means the user has not agreed to the current EULA
            print("\nIMPORTANT: ")
            print(
                "Your user appears to lack permissions to download data from the ASF Datapool."
            )
            print(
                "\n\nNew users: you must first log into Vertex and accept the EULA. In addition, your Study Area must be set at Earthdata https://urs.earthdata.nasa.gov"
            )
            exit(-1)

        # These return codes indicate the USER has not been approved to download the data
        if resp_code in (300, 301, 302, 303):
            try:
                redir_url = response.info().getheader('Location')
            except AttributeError:
                redir_url = response.getheader('Location')

            # Funky Test env:
            if ("vertex.daac.asf.alaska.edu" in redir_url
                    and "test" in self.asf_urs4['redir']):
                print("Cough, cough. It's dusty in this test env!")
                return True

            print("Redirect ({0}) occured, invalid cookie value!".format(
                resp_code))
            return False

        # These are successes!
        if resp_code in (200, 307):
            return True

        return False
Example #30
 def __init__(self):
     proxy = ProxyHandler(PROXY)
     self.cj = CookieJar()
     opener = build_opener(HTTPCookieProcessor(self.cj), proxy)
     install_opener(opener)