Example No. 1
def getHtml(url,charset=None):
    if(charset is None):
        return getHtmlDefault(url)
    else:
        response = getResponse(url)
        if(response is None):
            return None
        else:
            if (response.headers.get('Content-Encoding') == 'gzip'):
                # decompress
                page = decompress(response.read())
            else:
                page = response.read()
            return page.decode(charset,'ignore')
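getHtml above leans on helpers that this snippet does not show: getHtmlDefault appears later (Example No. 14), while getResponse and decompress are missing. A minimal sketch of those two, purely as an assumption, with getResponse wrapping urllib.request.urlopen and returning None on failure:

# Hypothetical helpers assumed by getHtml; not part of the original example.
import gzip
import urllib.request
import urllib.error

def getResponse(url):
    try:
        return urllib.request.urlopen(url)
    except urllib.error.URLError:
        return None

def decompress(data):
    # undo the gzip Content-Encoding before the page is decoded
    return gzip.decompress(data)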
Example No. 2
 def get_sub_monitor_bill(self, main_sn):
     """get feedback subSn by mainSn.
     
     """
     sub_url = ("http://%s/ida30/jsp/svr/billdeal/common/feedback.jsp?"
     "flag=S&recordSn=&mainSn=%s" % (self.host, main_sn))
     record_sn = ""
     sub_sn = ""
     
     try:
         response = urllib.request.urlopen(sub_url)
         content = str(response.read())
         i = 0
         for m in self.members:
             i = content.find(m)
             if i > 0:
                 break
             
         if i > 0:
             i = content.rfind('value="', 0, i) + 7
             e = content.find('"', i)
             record_sn = content[i:e]
             i = content.find('">', e) + 2
             e = content.find('<', i)
             sub_sn = content[i:e]
         return record_sn + "$" + sub_sn
     except (URLError, HTTPError):
         self.write_log(("Error: Access SubSn URL fail.", main_sn, self.uid), "%s [%s] (%s)")
         return False
Example No. 3
    def fb_monitor_bill(self, main_sn):
        """feedback monitor bill automatically.
        
        """
        record_sn = self.get_sub_monitor_bill(main_sn)
        if not record_sn:
            return False
        elif record_sn != "$":
            record_sn, main_sn = record_sn.split("$")
            fb_url = ("http://%s/ida30/svr/net/CommonAction.do?"
            "method=feedback&recordSn=%s&flag=S" % (self.host, record_sn))
        else:
            fb_url = ("http://%s/ida30/svr/net/CommonAction.do?"
            "method=mainFeedback&mainSn=%s&flag=S" % (self.host, main_sn))

        data = urlencode({"percent": "0", "procCode": "99", "procCodeText": "其他", "procDesc": "网络运行一切正常。"})
        request = urllib.request.Request(fb_url, data)
        
        try:  
            response = urllib.request.urlopen(request)
            content = str(response.read())
            if content.find('flgSuc = "Y"') > 0:
                self.write_log(("Feedback OK.", main_sn, self.uid), "%s [%s] (%s)")
                return True
            else:
                self.write_log(("Error: Feedback fail.", main_sn, self.uid), "%s [%s] (%s)")
                return False
        except (URLError, HTTPError):
            self.write_log(("Error: Access Feedback URL fail.", main_sn, self.uid), "%s [%s] (%s)")
            return False
Example No. 4
def getResult(query):

    url = getUrl(query)

    import urllib.request
    with urllib.request.urlopen(url) as response:
        html = response.read()
    html = str(html, 'utf-8')
    html = html.encode(sys.stdout.encoding, errors='ignore')
    html = str(html)
    html = html.split('\\n')
    i = 0
    result = ''

    for line in html:
        if (i > 64 ):
            if ('[' in line):
                break
            if ('[23]Image' in line):
                continue
            if( 'About' in line and 'results' in line):
                continue
            result = result + '\n' + line
        i = i+1
    return result
Example No. 5
def get_url_content(website):
    headers = {'Accept-Charset': 'utf-8',
               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'}
    request = urllib.request.Request(website, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read()
    response.close()
    return html
Example No. 6
def get_content(link):
    # Crawl Website Moviebarcodes.tumblr.com/movie-index
    response = urllib.request.urlopen(link)
    str_response = unescape(response.read().decode('utf-8'))

    # debugKH(str_response)
    # use inputstring
    process_file(str_response)
    return str_response
Example No. 7
    def get_player_information(self, Player=None):
        if Player is None:
            return None

        url = Consts.URL['base'] + Consts.URL['player'] + Player.id + '-' + Player.firstName + '-' + Player.lastName
        req = urllib.request.Request(url)
        response = urllib.request.urlopen(req)
        response_data = response.read()
        return response_data
Example No. 8
def save_url_to_file(url, filename):
    """

    :rtype: Int
    """
    with urllib.request.urlopen(url) as response:
        with open(filename, 'wb') as file:
            if file.write(response.read()):
                return 0
    return 1
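A short usage sketch for save_url_to_file; the URL and filename are placeholders, and 0 is returned once at least one byte has been written:

# Usage sketch with placeholder arguments.
if save_url_to_file('https://example.com/robots.txt', 'robots.txt') == 0:
    print('saved')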
Example No. 9
def getIssueList():
    """ Return all issues for REPO_ID """
    request = urllib.request.Request(URL + "/api/v3/projects/" + REPO_ID + "/issues", headers={"PRIVATE-TOKEN" : TOKEN })
    context = ssl._create_unverified_context()
    try:
        response = urllib.request.urlopen(request,context=context)
    except HTTPError as e:
        return e.read().decode("utf-8")

    return json.loads(response.read().decode("utf-8"))
Example No. 10
    def get_personal_info(self, Employee=None):
        if Employee is None:
            return None

        url = Consts.URL['base'] + Consts.URL['employee'] + Employee.id
        req = urllib.request.Request(url)
        response = urllib.request.urlopen(req)
        responseData = response.read()

        return responseData
Example No. 11
    def get_player_transfer_history(self, Player=None):
        if Player is None:
            return None

        url = Consts.URL['base'] + Consts.URL['player'] + Player.id + '-' + Player.firstName + '-' + Player.lastName + \
              Consts.URL['history']
        req = urllib.request.Request(url)
        response = urllib.request.urlopen(req)
        responseData = response.read()

        return responseData
Example No. 12
  def _GetAuthToken(self, email, password):
    """Uses ClientLogin to authenticate the user, returning an auth token.

    Args:
      email:    The user's email address
      password: The user's password

    Raises:
      ClientLoginError: If there was an error authenticating with ClientLogin.
      HTTPError: If there was some other form of HTTP error.

    Returns:
      The authentication token returned by ClientLogin.
    """
    account_type = self.account_type
    if not account_type:

      if (self.host.split(':')[0].endswith(".google.com")
          or (self.host_override
              and self.host_override.split(':')[0].endswith(".google.com"))):

        account_type = "HOSTED_OR_GOOGLE"
      else:
        account_type = "GOOGLE"
    data = {
        "Email": email,
        "Passwd": password,
        "service": "ah",
        "source": self.source,
        "accountType": account_type
    }


    req = self._CreateRequest(
        url=("https://%s/accounts/ClientLogin" %
             os.getenv("APPENGINE_AUTH_SERVER", "www.google.com")),
        data=urllib.parse.urlencode(data))
    try:
      response = self.opener.open(req)
      response_body = response.read().decode()  # decode so the text splits below work in Python 3
      response_dict = dict(x.split("=")
                           for x in response_body.split("\n") if x)
      if os.getenv("APPENGINE_RPC_USE_SID", "0") == "1":
        self.extra_headers["Cookie"] = (
            'SID=%s; Path=/;' % response_dict["SID"])
      return response_dict["Auth"]
    except urllib.error.HTTPError as e:
      if e.code == 403:
        body = e.read().decode()
        response_dict = dict(x.split("=", 1) for x in body.split("\n") if x)
        raise ClientLoginError(req.get_full_url(), e.code, e.msg,
                               e.headers, response_dict)
      else:
        raise
Example No. 13
    def __init__(self, url):
        response = None

        request = urllib.request.Request(url)
        request.add_header("User-Agent", self.user_agent)
        # noinspection PyBroadException
        try:
            response = urllib.request.urlopen(request)
        except:
            print("Error: Invalid URL. Exiting.")
            exit()
        html_content = response.read().decode("utf8")
        self.__parse_content(html_content)
Example No. 14
def getHtmlDefault(url):
    response = getResponse(url)
    page=None
    if(response == None):
        return None
    if(response.headers.get('Content-Encoding') == 'gzip'):
        #decompress
        page = decompress(response.read())
    else:
        page = response.read()
    charset = chardet.detect(page)
    time = 0
    while charset['encoding'] is None and time < DETECT_TIME:
        charset = chardet.detect(page)
        time += 1  # "++time" is not an increment in Python

    if (charset['encoding'] != None):
        html = page.decode(charset['encoding'],'ignore')
        return html
    else:
        print("can't decode %s" % url)
        return None
Example No. 15
def urlget_Test1(msg):

  return  # NOTE: this early return disables the test below
  
  import urllib.parse
  import urllib.request
  import urllib.response

  print('urlget_Test1()')

  #url = 'http://www.someserver.com/cgi-bin/register.cgi'
  url = 'http://mycase.in.gov/default.aspx'
  user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
  values = {'name' : 'Michael Foord',
	    'location' : 'Northampton',
	    'language' : 'Python' }
  headers = { 'User-Agent' : user_agent }

  print('Header: ' + str(headers))

  data = urllib.parse.urlencode(values).encode('ascii')
  req = urllib.request.Request(url, data, headers)
  #req = urllib.request.Request(url, b'User-Agent: Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
  #req = urllib.request.Request(url, headers=headers)
  #req = urllib.request.Request(url)
  response = urllib.request.urlopen(req)
  the_page = response.read()
  print(the_page)
  exit()
  try:
    response = urllib.request.urlopen(req)
  except urllib.error.HTTPError as e:
    print(e.code)
    print(e.read())
    exit(1)
  else:  
    the_page = response.read()
    print(the_page)
  exit()  
Example No. 16
def download_manifest_as_json():
  """Download and parse the remote manifest

  The full file is downloaded and parsed as JSON; the resulting dict
  is returned.

  """
  manifest_url = os.getenv('FLOORED_MANIFEST_URL',
                           'http://blender.floored.com/blendloft/manifest.json')
  _log.info('manifest url: %s', manifest_url)
  req = urllib.request.Request(manifest_url, headers=_download_headers)
  with urllib.request.urlopen(req) as response:
    text = response.read().decode(encoding='UTF-8')
    return json.loads(text)
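download_manifest_as_json refers to two module-level names that are not shown here, _log and _download_headers. A minimal sketch of plausible definitions, stated as an assumption rather than the original code:

# Hypothetical module-level names assumed by download_manifest_as_json.
import logging

_log = logging.getLogger(__name__)
_download_headers = {'User-Agent': 'blendloft-manifest-downloader'}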
Example No. 17
 def accept_monitor_bill(self, record_sn, main_sn):
     """accept monitor bill automatically.
     
     """
     accept_url = ("http://%s/ida30/svr/net/AcceptAction.do?"
     "method=acceptBatch&recordSn=%s&mainSns=%s&businessCode=IDB_SVR_NET" % (self.host, record_sn, main_sn))
     try:
         response = urllib.request.urlopen(accept_url)
         content = str(response.read())
         if content.find('异常页面') < 0:
             self.write_log(("Accept OK.", main_sn, self.uid), "%s [%s] (%s)")
             return True
     except (URLError, HTTPError):
         self.write_log("Error: Access Accept URL fail.", "%s", log_flag=False)
         return False
def request_until_succeed(url):
    req = urllib.request.Request(url)
    success = False
    while success is False:
        try: 
            response = urllib.request.urlopen(req)
            if response.getcode() == 200: 
                success = True
        except Exception as e:
            print (e)
            time.sleep(5)
            
            print ("Error for URL %s: %s" % (url, datetime.datetime.now()))

    return response.read().decode('utf8')
Example No. 19
    def get_personal_search(self, role, page=1, min_age=0, max_age=99):
        if role is None:
            return None

        url = Consts.URL['base'] + Consts.URL['employee search'] + '&speciality=' + role + '&page=' + str(page)
        values = '&country_id=' + '&job_status=1' + '&age_min=' + str(min_age) + '&age_max='\
                 + str(max_age) + '&search=1&commit=S%C3%B8g'

        try:
            response = self.connection.open(url + values)
            responseData = response.read()
        except:
            responseData = '404'

        return responseData
Example No. 20
	def get_article(self):
		try:
			req=urllib.request.Request(self._myUrl,headers=self._headers)
			print('Obtain response from the server.......')
		except:
			print('Fail to get response.........')
		response=urllib.request.urlopen(req)
		result=response.read()
		unicodepage=result.decode('gb2312', 'ignore')
		the_time = re.search(r'<title>(.*)</title>',unicodepage,re.DOTALL).group(1)
		s_split = re.split('<div class="field-items">', unicodepage)
		thearticle = s_split[1]
		thearticle = re.sub('<img .*?/>',"",thearticle)
		thearticle = re.split('<div id="footer">',thearticle)[0]
		thearticle = "<html><head>"+self._css +"</head><body><h1>"+the_time+"</h1>"+thearticle+"</body></html>"
		return thearticle
Example No. 21
	def get_json(self):
		#Form get request
		url = 'http://www.omdbapi.com/?'
		values = {'t': self.get_title(), 'y': '', 
			'plot': 'short', 'r': 'json'}
		data = urllib.parse.urlencode(values)
		full_request = url + data
		
		#Request to server
		response = urllib.request.urlopen(full_request)
	
		#Read data in response object ad decode
		response = response.read().decode("utf-8")
	
		#Convert response to dict
		response = ast.literal_eval(response)
		return response
Example No. 22
def query(query, useragent='python-duckduckgo '+str(__version__), safesearch=True, html=False, meanings=True, **kwargs):
    """
    Query DuckDuckGo, returning a Results object.

    Here's a query that's unlikely to change:

    >>> result = query('1 + 1')
    >>> result.type
    'nothing'
    >>> result.answer.text
    '1 + 1 = 2'
    >>> result.answer.type
    'calc'

    Keyword arguments:
    useragent: UserAgent to use while querying. Default: "python-duckduckgo %d" (str)
    safesearch: True for on, False for off. Default: True (bool)
    html: True to allow HTML in output. Default: False (bool)
    meanings: True to include disambiguations in results (bool)
    Any other keyword arguments are passed directly to DuckDuckGo as URL params.
    """

    safesearch = '1' if safesearch else '-1'
    html = '0' if html else '1'
    meanings = '0' if meanings else '1'
    params = {
        'q': query,
        'o': 'json',
        'kp': safesearch,
        'no_redirect': '1',
        'no_html': html,
        'd': meanings,
        }
    params.update(kwargs)
    encparams = urllib.parse.urlencode(params)
    url = 'http://api.duckduckgo.com/?' + encparams

    request = urllib.request.Request(url, headers={'User-Agent': useragent})
    response = urllib.request.urlopen(request)
    json = j.loads(response.read().decode('utf-8'))
    response.close()

    return Results(json)
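The docstring already shows the intended call pattern; a short usage sketch based on it, assuming the same module's Results wrapper exposes the fields shown above:

# Usage sketch; relies on the Results class defined alongside query().
result = query('1 + 1')
if result.type == 'nothing':
    print(result.answer.text)   # e.g. '1 + 1 = 2'
    print(result.answer.type)   # e.g. 'calc'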
Example No. 23
    def validate_code(self):
        """get validate code from url.
        
        """
        rand_date = "%s" % datetime.now()
        rand_date = rand_date.replace(' ', '%20')
        is_change_color = "N"
        validate_url = "http://%s/validateCode?randDate=%s&isChangColor=%s" % (self.host, rand_date, is_change_color)

        try:
            response = urllib.request.urlopen(validate_url)
            code = response.read()
        except (URLError, HTTPError):
            self.write_log("Error: Access ValidateCode URL fail.", "%s", log_flag=False)
            return False
        else:
            with open("validate.png", 'wb') as image_file:
                image_file.write(code)

            img = Image.open("validate.png")
            pix = img.load()
            width, height = img.size

            code_str = ''
            for i in range(width):
                for j in range(height):
                    for p in pix[i, j]:
                        code_str = code_str + str(p)

            with sqlite3.connect("feedback3.db") as conn:
                conn.text_factory = str
                cu = conn.cursor()
                sql = "select image_code from t_v_code where image_md5='%s'"
                cu.execute(sql % hashlib.md5(code_str.encode()).hexdigest())
                image_code = cu.fetchone()
            
            if image_code is None:
                return False
            else:
                return image_code[0]
Example No. 24
def save_url_to_file_with_auth(url, filename, username, password):
    # create a password manager
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()

    password_mgr.add_password(None, url, username, password)

    handler = urllib.request.HTTPBasicAuthHandler(password_mgr)

    # create "opener" (OpenerDirector instance)
    opener = urllib.request.build_opener(handler)

    # use the opener to fetch a URL
    opener.open(url)

    # Install the opener.
    # Now all calls to urllib.request.urlopen use our opener.
    urllib.request.install_opener(opener)

    with urllib.request.urlopen(url) as response:
        with open(filename, 'wb') as file:
            if file.write(response.read()):
                return 0
    return 1
Example No. 25
#with open('SampleRequest.json') as json_file:
#   data = json.load(json_file)

#json_file = open('SampleRequest.josn')

f = open("SampleRequest.json", "r")
file = json.load(f)

print(file)

headers = {
    'Content-Type': 'application/json',
    'Ocp-Apim-Subscription-Key': '-------',
}

#body = json.dumps()

params = urllib.parse.urlencode({})

try:
    conn = http.client.HTTPSConnection('westus2.api.cognitive.microsoft.com')
    conn.request("POST", "/text/analytics/v2.1/entities?%s" % params,
                 json.dumps(file), headers)  # the request body must be a JSON string, not a dict
    response = conn.getresponse()
    data = response.read()
    print(data)
    conn.close()
except Exception as e:
    #print("[Errno {0}] {1}".format(e.errno, e.strerror))
    print("ohno")
Example No. 26
def getPage(pageNo):
    url = "http://www.xiumeim.com/albums/page-%s.html" % pageNo
    request = urllib.request.Request(url)
    response = opener.open(request)
    return str(response.read().decode())
# Daum Webtoon > 어쩌다 발견한 7월
from urllib.request import urlopen

import json

if __name__ == '__main__':
    # response = urlopen("http://webtoon.daum.net/data/pc/webtoon/view/findjuly")
    # response_byte = response.read()
    # response_json = json.loads(response_byte)

    with urlopen(
            "http://webtoon.daum.net/data/pc/webtoon/view/findjuly") as data:
        response_byte = data.read()  # read from the opened response, not the urllib.response module
    response_json = json.loads(response_byte)
    #print(json.loads(response.read()))
    # print(response_json['data']['webtoon']['webtoonEpisodes'][11]['title'])

    cartoon_titles = response_json['data']['webtoon']['webtoonEpisodes']
    for item in cartoon_titles:
        title = item['title']
        thumbnail = item['thumbnailImage']['url']
        print(title)
        print(thumbnail)
Example No. 28
#!/usr/bin/env python
#-*- encoding:utf-8 -*-
from urllib import request, response
import chardet
if __name__ == "__main__":
    response = request.urlopen("http://www.baidu.com")
    html = response.read()
    html = html.decode("utf-8")
    html = chardet.detect(bytes(html, encoding="utf-8"))
    print(html)
    

Example No. 29
import urllib.request
import urllib.response
from urllib.parse import urlencode

values = {"username": "******", "password": "******"}
data = urlencode(values)
print(data)
url = "http://quote.eastmoney.com/stocklist.html"
request = urllib.request.Request('%s?%s' % (url, data))
#request = urllib.request.Request(url,data.encode('utf-8'))
response = urllib.request.urlopen(request)
print(response.read().decode('gbk'))
Example No. 30
#!/usr/bin/python3
"""
displays the value of the X-Request-Id variable found in header of respon\
se
"""
import urllib.request
import urllib.parse
import sys
import urllib.response
import urllib.error

if __name__ == "__main__":
    try:
        req = urllib.request.Request(sys.argv[1])
        with urllib.request.urlopen(req) as response:
            the_page = response.read().decode('utf-8')
            print(the_page)
    except urllib.error.HTTPError as e:  # only HTTPError carries .read() and .code
        ResponseData = e.read().decode("utf8", 'ignore')
        print('Error code: {}'.format(e.code))
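The docstring talks about the X-Request-Id header while the body prints the page itself; a minimal sketch of reading that header instead, assuming the server actually sets it:

# Hypothetical variant that prints only the header named in the docstring.
import sys
import urllib.request

with urllib.request.urlopen(urllib.request.Request(sys.argv[1])) as response:
    print(response.headers.get('X-Request-Id'))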
Example No. 31
    def content(self):
        li = []
        for j in range(274, 275):
            for i in range(j, j + 1):
                url = 'http://yuanjian.cnki.net/Search/Result'
                print('当前页', i)
                time.sleep(random.random() * 3)
                formdata = {'Type': 1,
                            'Order': 1,
                            'Islegal': 'false',
                            'ArticleType': 1,
                            'Theme': 'XRD',
                            'searchType': 'MulityTermsSearch',
                            'ParamIsNullOrEmpty': 'true',
                            'Page': i}

                try:
                    #r = requests.post(url, data=formdata, headers=self.headers, cookies=self.cookies , params=self.param)
                    #print(111)
                    # r.raise_for_status()
                    # print(222)
                    #r.encoding = r.apparent_encoding
                    #print(333)

                    # requests.post performs the HTTP request itself; the extra
                    # "urlopen" step from the urllib examples is not needed here.
                    html = requests.post(url=url, data=formdata, headers=self.headers,
                                         cookies=self.cookies, params=self.param)
                    html.raise_for_status()
                    html.encoding = html.apparent_encoding

                    data = etree.HTML(html.text)
                    print(data)
                    # 链接列表
                    url_list = data.xpath("//*[@id='article_result']/div/div/p[1]/a[1]/@href")
                    print(url_list)
                    # 关键词列表
                    key_wordlist = []
                    all_items = data.xpath("//*[@id='article_result']/div/div")
                    print (len(all_items))
                    for i in range(1, len(all_items) + 1):
                        key_word = data.xpath("//*[@id='article_result']/div/div[%s]/div[1]/p[1]/a/text()" % i)

                        key_words = ';'.join(key_word)
                        key_wordlist.append(key_words)
                    # 来源
                    source_items = data.xpath("//*[@id='article_result']/div/div")
                    for j in range(1, len(source_items) + 1):
                        sources = data.xpath("//*[@id='article_result']/div/div/p[3]/a[1]/span/text()")
                    for index, url in enumerate(url_list):
                        items = {}
                        try:
                            print('当前链接:', url)
                            content = requests.get(url, headers=self.headers)
                            contents = etree.HTML(content.text)
                            # 论文题目
                            title = contents.xpath("//h1[@class='xx_title']/text()")[0]
                            items['titleCh'] = title
                            items['titleEn'] = ''
                            print('标题:', title)
                            # 来源
                            source = sources[index]
                            items['source'] = source
                            print('来源:', source)

                            # 关键字
                            each_key_words = key_wordlist[index]
                            print('关键字:', each_key_words)
                            items['keywordsEn'] = ''
                            items['keywordsCh'] = each_key_words
                            # 作者
                            author = contents.xpath("//*[@id='content']/div[2]/div[3]/a/text()")
                            items['authorCh'] = author
                            items['authorEn'] = ''
                            print('作者:', author)
                            # 单位
                            unit = contents.xpath("//*[@id='content']/div[2]/div[5]/a[1]/text()")
                            units = ''.join(unit).strip(';')
                            items['unitCh'] = units
                            items['unitEn'] = ''
                            print('单位:', units)
                            # 分类号
                            classify = contents.xpath("//*[@id='content']/div[2]/div[5]/text()")[-1]
                            c = ''.join(classify).split(';')
                            res = []
                            for name in c:
                                print('当前分类号:', name)
                                try:
                                    if name.find("TP391.41") != -1:
                                        print('改变分类号!')
                                        name = 'TP391.4'
                                    result = requests.get('http://127.0.0.1:5000/%s/' % name)
                                    time.sleep(5)
                                    re_classify1 = result.content
                                    string = str(re_classify1, 'utf-8')
                                    classify_result = eval(string)['classfiy']
                                    # print('文献分类导航:', classify_result)

                                except Exception as e:
                                    print(e)
                                res.append(classify_result)
                                print('文献分类导航:', res)
                            items['classify'] = res

                            # 摘要
                            abstract = contents.xpath("//div[@class='xx_font'][1]/text()")[1].strip()
                            print('摘要:', abstract)
                            items['abstractCh'] = abstract
                            items['abstractEn'] = ''
                            # 相似文献
                            similar = contents.xpath(
                                "//*[@id='xiangsi']/table[2]/tbody/tr[3]/td/table/tbody/tr/td/text()")
                            si = ''.join(similar).replace('\r\n', '').split('期')
                            po = []
                            for i in si:
                                sis = i + '期'
                                if len(sis) > 3:
                                    po.append(sis)

                            items['similar_article'] = po
                            # 参考文献
                            refer_doc = contents.xpath("//*[@id='cankao']/table[2]/tbody/tr[3]/td/table/tbody/tr/td/text()")
                            items['refer_doc'] = refer_doc

                            li.append(items)

                        except Exception as e:
                            print(e)
                        print(len(li))
                except Exception as e:
                    print(e)

        return li
Example No. 32
def download(url, headers={}):
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    response.close()
    return content
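A short usage sketch for download; the URL and User-Agent value are placeholders:

# Usage sketch with placeholder values.
html = download('https://example.com/',
                headers={'User-Agent': 'Mozilla/5.0'})
print(html[:200])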
Example No. 33
            noDESCnoMAPwriter.writeheader()

    with open("websitet14.csv", newline="") as csvfile:
        # variables added to a list

        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            info = False
            description = False
            maPrice = False
            # row[2] contains the internal part number for website
            request = urllib.request.Request(
                'http://www.website.com/search.php?search=' + row[2] +
                '&type=Part+%23')
            response = opener.open(request)
            pagedata = response.read()
            soup = BeautifulSoup(pagedata.decode('utf-8', 'ignore'))

            # Cost, Retail, Jobber, and Map area on the websitet14.csv
            # row[4], row[5], row[6], and row[7] respectively

            productrow.row["Vendor"] = row[0]

            productrow.row["Variant SKU"] = row[2]
            if row[5] != "\\N":
                # the retail price that's marked out on the webpage
                productrow.row["Variant Compare At Price"] = row[5]

            productrow.row["Variant Price"] = row[7]
            if productrow.row["Variant Price"] != "\\N":
                maPrice = True
Example No. 34
 def get_page(self, page_index):
     url = self.SiteUrl + '?page=' + str(page_index)
     request = urllib.request.Request(url)
     response = urllib.request.urlopen(request)
     return response.read().decode('gbk')
import urllib.response
import urllib.request
import urllib.parse
from urllib import response

URL = 'https://baike.baidu.com/item/'
new_url = "相声有新人/22779051"
new_url = urllib.parse.quote(new_url)
date = {"fr": "aladdin"}
date = urllib.parse.urlencode(date)
URL = URL + new_url + "?" + date
headers = {
    "user-agent":
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36"
}
request = urllib.request.Request(URL, headers=headers)
response = urllib.request.urlopen(request)
html = response.read().decode()
print(html)
Example No. 36
import urllib
from urllib import request, response
url = 'http://www.baidu.com/'

# 返回响应
response = urllib.request.urlopen(url=url)

content = response.read().decode('utf-8')
print(content)
Example No. 37
def save_url_to_file(url, filename):
    with urllib.request.urlopen(url) as response:
        with open(filename, 'wb') as file:
            if file.write(response.read()):
                return 0
    return 1
Example No. 38
#初始化
Nasdaq=''
Hetf=''
Sp=''
NasdaqSig=0
SpSig=0
HetfSig=0
urls=['http://www.qqjjsj.com/gpsc/index.html','http://www.qqjjsj.com/gpsc/index_2.html','http://www.qqjjsj.com/gpsc/index_3.html','http://www.qqjjsj.com/gpsc/index_4.html']


#打开网页
for url in urls:
    request=urllib.request.Request(url)
    response=urllib.request.urlopen(request)
    content=response.read().decode('utf-8')

##纳斯达克
    if NasdaqSig==0:
        NasdaqPattern=re.compile(r'\d+年\d+月\d+日纳斯达克.*?平均市盈率为?\d+.\d+')
        NasdaqItems=re.findall(NasdaqPattern,content)
        if NasdaqItems:
            Nasdaq=NasdaqItems[0]+'\n'
            NasdaqSig=1
            with open('Nasdaq.txt', 'a') as f:
                f.write(Nasdaq)


##标准普尔
    if SpSig==0:
        SpPattern=re.compile(r'\d+年\d+月\d+日标准普尔.*?平均市盈率为?\d+.\d+')
Example No. 39
__author__ = 'wangxiaodong'

import urllib.request
import urllib.response

#1
response = urllib.request.urlopen('http://www.baidu.com')
html = response.read()

#2
req = urllib.request.Request('http://www.baidu.com')
response = urllib.request.urlopen(req)
the_page = response.read()

#3 Sending data
import urllib.parse
import urllib.request

url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT'
values = {
    'act':'login',
    'login[email]':'*****@*****.**',
    'login[password]':'123456'
}
data = urllib.parse.urlencode(values).encode('utf-8')  # Request data must be bytes in Python 3
req = urllib.request.Request(url, data)
req.add_header('Referer', 'http://www.python.org')
response = urllib.request.urlopen(req)
the_page = response.read()

print(the_page.decode("utf8"))
#在Request对象中添加浏览器相关的头信息,把程序伪装成浏览器发送POST请求

#引入parse模块
import urllib.parse
import urllib.response
import urllib.request

url = "http://httpbin.org/post"
#设置浏览器信息
headers = {
    "User-Agent":
    "Mozilla/5.0(Macintosh;Intel Mac OS X 10_13_6) AppleWebKit/537.36(KHTML,like Gecko)Chrome/69.0.3497.100 Safari/537.36"
}
data_dict = {"word": "hello world"}
#将字典类型数据转换成bytes字节流
data = bytes(urllib.parse.urlencode(data_dict), encoding='utf8')

#创建Request对象
request_obj = urllib.request.Request(url=url,
                                     data=data,
                                     headers=headers,
                                     method="POST")
response = urllib.request.urlopen(request_obj)
print(response.read().decode("utf8"))
Example No. 41
import urllib.request, urllib.response

host = 'http://saweather.market.alicloudapi.com'
path = '/area-to-id'
method = 'GET'
appcode = '7793711114fc4bbfb424f818eef8e7e2'
querys = 'area=青岛'
bodys = {}
url = host + path + '?' + querys

request = urllib.request.Request(url)
request.add_header('Authorization', 'APPCODE ' + appcode)
response = urllib.request.urlopen(request)
content = response.read()
if (content):
    print(content)
Example No. 42
import urllib.request, urllib.response, http.cookiejar, cookiecutter
from bs4 import BeautifulSoup
url = 'https://www.cnblogs.com/zdlfb/p/6130724.html'
print('第一种方法')
response = urllib.request.urlopen(url)
print(response.getcode())
content = response.read()  # read once; a second read() on the same response returns b''
print(len(content))
print(content)
# print('第二张方法')
# request=urllib.request.request(url)
# request.add_header('user-agent','mozilla/5.0')#爬虫伪装成浏览器
# response1=urllib.request.urlopen(request)
# print(response1.getcode())
# print(len(response1.read()))
# print('第三种方法')
# cj=http.cookiejar
# opener=urllib.build_opener(urllib.HttpCookieProcessor(cj))
# urllib.install_opener(opener)
# response3=urllib.request.urlopen(url)
# print(response3.getcode())
# print(response3.read())
soup = BeautifulSoup()  #html文档字符串,html解析器,html文档编码
Example No. 43
def gethtml(url):
    response = urllib.request.urlopen(url)
    return response.read()
Example No. 44
Used for handling the robots.txt file
'''
import urllib.request, urllib.response, urllib.parse, urllib.error, urllib.robotparser
import csv
import urllib.request
import codecs

if __name__ == "__main__":
    print("urllib爬取豆瓣网数据示例")
    print("搜索下关键字: Python")

    url = "https://api.douban.com/v2/book/search?q=python"
    response = urllib.request.urlopen(url)

    # 将bytes数据流解码成string
    ebook_str = response.read().decode()

    # 将string转换成dict
    ebook_dict = eval(ebook_str)

    # print(ebook_dict)
    # print(type(ebook_dict))
    count = ebook_dict["count"]
    total = ebook_dict["total"]

    with codecs.open('books.csv', 'w', 'utf-8') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(["书名", "作者", "描述", "出版社", "价格"])
        # 写书信息
        for book in ebook_dict["books"]:
Example No. 45
    'speak',
    'params': [
        '1.1', {
            'language': 'ja',
            'text': n,
            'voiceType': "*",
            'audioType': "audio/x-wav"
        }
    ]
}

obj_command = json.dumps(tts_command)  # string to json object
obj_command = obj_command.encode('utf-8')
req = urllib.request.Request(tts_url, obj_command)
response = urllib.request.urlopen(req)
received = response.read().decode('utf-8')  # conv bytes to str by decode()
# extract wav file
obj_received = json.loads(received)
tmp = obj_received['result']['audio']  # extract result->audio
speech = base64.b64decode(tmp.encode('utf-8'))  # base64.decodestring was removed in Python 3.9

# write the decoded audio out as a .wav file
f = open("out.wav", 'wb')
f.write(speech)
f.close()

# play back the synthesized speech audio
input_filename = 'out.wav'
buffer_size = 4096
wav_file = wave.open(input_filename, 'rb')
p = pyaudio.PyAudio()
Example No. 46
    def crawl(self, link):

        tryOnce = 0
        robotParser = self.setupRobotParser(link)
        if robotParser.can_fetch("*", link):
            while True:
                try:
                    response = urllib.request.urlopen(link)
                    break
                except urllib.error.HTTPError as e:
                    if e.code == 429:
                        if tryOnce == 1:
                            print(
                                'Thread ' + str(self.crawlerID) + ': Too many requests: ' + link + ' returning.')
                            return
                        print('Thread ' + str(self.crawlerID) + ': Too many requests: ' + link + ' trying again in 120 seconds.')
                        sleep(120)
                        tryOnce = 1
                    else:
                        return
                # for handling any other url errors
                except:
                    print('Error opening link: ',link, " by thread : ", self.crawlerID)

                    return

            returnedLink = response.geturl()
            if returnedLink != link:
                print('Thread ' + str(self.crawlerID) + ': Redirection:' + link + ' to ' + returnedLink + ' returning.')
                return

            urlInfo = response.info()
            dataType = urlInfo.get_content_type()
            if 'html' not in dataType:
                print('Thread ' + str(self.crawlerID) + ': Not HTML ' + link + ' returning.')
                return

            try:
                webContent = response.read().decode(response.headers.get_content_charset('utf-8'))
            except:
                print("Incomplete Read of web content due to a defective http server.")
                webContent = None

            if(webContent):
                Crawler.webpagesLock.acquire()
                if Crawler.webpagesSaved < NUMOFPAGES:
                    Crawler.webpagesSaved += 1
                else:
                    print('Thread ' + str(self.crawlerID) + ': Page number limit reached ')
                    Crawler.webpagesLock.release()
                    return
                Crawler.webpagesLock.release()
                selector = None
                while True:
                    try:
                        selector = WebPages.select().where(WebPages.pageURL == returnedLink).exists()
                        break
                    except (OperationalError , sqlite3.OperationalError) as e:
                        if 'binding' in str(e):
                            break
                        print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable')
                    except:
                        break

                if selector:
                    print('Thread ' + str(self.crawlerID) + ': Updating webpage ' + link)

                    while True:
                        try:
                            WebPages.update(pageContent=webContent).where(
                                WebPages.pageURL == returnedLink).execute()
                            break
                        except (OperationalError, sqlite3.OperationalError) as e:
                            if 'binding' in str(e):
                                break
                            print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable')
                        except:
                            break

                else:
                    print('Thread ' + str(self.crawlerID) + ': Saving webpage ' + link )
                    try:
                        inserted = False
                        while True:
                            try:
                                if not inserted:
                                    WebPages(pageURL=returnedLink, pageContent=webContent).save()
                                    inserted =  True
                                ...
                                PageRank.create(pageURL=returnedLink).update()
                                ...
                                break
                            except (OperationalError, sqlite3.OperationalError) as e:
                                if 'binding' in str(e):
                                    break
                                print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable & PageRank')
                                sleep(randint(1,5))

                            except:
                                break
                    #should never happen
                    except:
                        print('UnexpectedException: In saving webpage WEEEEEEEEEEEEEEEEEEEEEEE')

                print('Thread ' + str(self.crawlerID) + ': Done saving webpage and starting link extraction ' + link)
                try:
                    parser = MyHTMLParser(link)
                    parser.feed(str(webContent))
                #should never happen
                except:
                    print('UnexpectedException: in parser WEEEEEEEEEEEEEEEEEEEEEEE')

                size = 999
                while True:
                    try:
                        for i in range(0, len(parser.links), size):
                            UncrawledTable.insert_many(parser.links[i:i + size]).upsert().execute()
                        break
                    except (OperationalError, sqlite3.OperationalError) as e:
                        if 'binding' in str(e):
                            break
                        print('Thread ', self.crawlerID, ': Database busy, retrying. UnCrawledTable')
                    except:
                        break

                while True:
                    try:
                        print("UNCRAWLED URLS = ", UncrawledTable.select().count(), ' Thread ' + str(self.crawlerID))
                        break
                    except (OperationalError, sqlite3.OperationalError) as e:
                        if 'binding' in str(e):
                            break
                        print('Thread ', self.crawlerID, ': Database busy, retrying. print UnCrawledTable')
                    except:
                        break

                print('Thread ' + str(self.crawlerID) + ': Done inserting links ' + link)
Example No. 47
    def getAnswer(self,answerId):
        host = "http://www.zhihu.com"
        url = host + answerId
        print(url)
        userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) Chrome/53.0.2785.116 Safari/537.36"
        #构造Header伪装浏览器
        Header = {"User-Agent" : userAgent}
        #请求该地址
        req = urllib.request.Request(url,headers=Header)
        #得到响应的内容
        try:
            response = urllib.request.urlopen(req , timeout= 20 )
            content = response.read()
            if content is None:
                print("Empty")
                return False
        except:
            print("timeOut,please try again")
            time.sleep(30)
            #try to switch proxy ip
            response = urllib.request.urlopen(req , timeout= 20)
            content = response.read()
            if content is None:
                print("Empty")
                return False

#此时已获取页面的完整代码,接着用BeautifulSoups比正则方便
        try:
            bs = BeautifulSoup(content,"lxml")
        except:
            print("BeautifulSoups Error")
            return False
        #获取该问题的标题
        title = bs.title
        if title is None:
            print("title is Empty")
            return False
        if title.string is None:
            print("string is Empty")
            return False
        fileName_old = title.string.strip()
        # 用来保存内容的文件名,因为文件名不能有一些特殊符号,所以使用正则表达式过滤掉
        fileName= re.sub('[\/:*?"<>|]', '-', fileName_old)
        self.save2file(fileName,title.string) #??????? 为什么此处的content是title.string 而不是 bs ????????????
        # 获取问题的补充内容
        detail = bs.find("div",class_ ="zm-editable-content") #class_ 是BeautifulSoup 的语法
        self.save2file(fileName, "\n\n\n\n--------------------Link %s ----------------------\n\n" % url)
        self.save2file(fileName, "\n\n\n\n--------------------Detail----------------------\n\n")
        if detail is not None:
            for i in detail.strings:
                self.save2file(fileName,i)

        #获取问题的回答

        answers = bs.find_all("div",class_="zm-editable-content clearfix")
        #定义参数
        k = 0
        index = 0
        for each_answer in answers:
            self.save2file(fileName, "\n\n-------------------------answer %s via  -------------------------\n\n" % k)
            #循环获取每一个答案的内容
            for a in each_answer.strings:
                self.save2file(fileName,a)
            k += 1
            index += 1

##################################################################
    #初始化邮箱相关参数
        smtp_server = 'smtp.126.com'
        from_mail = '*****@*****.**'
        password = '******'
        to_mail = '465731912@kindle','*****@*****.**'

        #调用发送邮件的函数
        send_kindle=MailAbout(smtp_server,from_mail,password,to_mail)
        send_kindle.mail_text(fileName)
        print(fileName)
Example No. 48
    def process_custom_monitor_list(self):
        """get and process custom monitor list from monitor URL.
        
        """
        monitor_url = ("http://%s/ida30/svr/cust/CustMonitorAction.do?"
        "method=queryPendingList&getHidFrame=Y&sysModuleId=A90AA1526CEC6022D2C7D2ABE9590308&"
        "eachpagerows=1000" % self.host)
        
        try:
            response = urllib.request.urlopen(monitor_url)
            content = str(response.read())
            start = content.find('<table id="powergrid"')
            if start > 0:
                end = content.find('</table>', start)
                content = content[start:(end + 8)]
                content = content.replace("<br>", "")
                content = content.replace("<font color='red'>", "")
                content = content.replace("<font color='red'  >", "")
                content = content.replace("</font>", "")
                content = content.replace('<img src="/ida30/images/svr/revert.gif" title="待回单">', "1")
                
                # parser monitor list.
                parser = FbHTMLParser()
                parser.feed(content)
                parser.close()
                mbs = parser.get_result()
                if len(mbs) > 0:
                    mbs.remove(mbs[0])

                # parser recordSn list.
                parser = AcceptHTMLParser()
                parser.feed(content)
                parser.close()
                record_sns = parser.get_result()
                
                i = 0
                for mb in mbs:
                    t = datetime(* time.strptime(mb[5], "%Y-%m-%d %H:%M:%S")[:6])
                    t1 = time.mktime(t.timetuple())
                    t2 = time.mktime(datetime.now().timetuple())
                    
                    if mb[9] != "1":
                        if t2 - t1 < 120:
                            self.write_log(("New custom message,Action to recieve.", mb[1], self.uid), "%s [%s] (%s)")
                        elif t2 - t1 > 600: 
                            self.accept_monitor_bill(record_sns[i], mb[1])
                            self.write_log(("Accept custom message automatically.", mb[1], self.uid), "%s [%s] (%s)")
                        elif t2 - t1 > 480:
                            self.write_log(("Accept custom message,Overtime.", mb[1], self.uid), "%s [%s] (%s)")

                    i += 1

                print("[%s] I'm here, Good luck to you! (%s)" %
                      (datetime.now().strftime("%Y-%m-%d %H:%M:%S"), self.uid))
                return self.interval
            else:
                self.write_log(("Error: Get CustomMonitorList fail.", self.uid), "%s (%s)")
                return 0
        except (URLError, HTTPError):
            self.write_log(("Error: Access CustomMonitorList URL fail.", self.uid), "%s (%s)")
            return 0
Example No. 49
if lights_flag:
    # connect to the bridge
    b = Bridge(bridge_ip)
    b.connect()
    # create a light grouping and turn them on
    lr_lamp = [1]
    command = {'on': True, 'bri': 127}
    b.set_light(lr_lamp, command)
    # print(b.get_api())

for n in range(500):

    feed = gtfs_realtime_pb2.FeedMessage()
    response = urllib.request.urlopen(
        'https://gtfs.translink.ca/v2/gtfsposition?apikey=' + API_KEY)
    feed.ParseFromString(response.read())
    green_dist = []
    red_dist = []

    for entity in feed.entity:
        if (entity.HasField('vehicle')
                and (entity.vehicle.trip.route_id == "16718")
                and (entity.vehicle.trip.direction_id == westbound)):
            # print(entity)
            lat_1 = entity.vehicle.position.latitude
            lon_1 = entity.vehicle.position.longitude
            busID = entity.vehicle.vehicle.id
            now = datetime.now()
            bus_checkin_time = datetime.fromtimestamp(
                int(entity.vehicle.timestamp))
Example No. 50
    def get(url: str) -> bytes:

        with urllib.request.urlopen(url) as response:
            return response.read()
Example No. 51
def download_file(download_url, i):
    response = request.urlopen(download_url)
    file = open("./pdf/document" + i + ".pdf", 'wb')
    file.write(response.read())
    file.close()
    print("Completed")
Example No. 52
# -*- coding:utf-8 -*-
import urllib.request
import urllib.response


class RedirectHandler(urllib.request.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        pass

    def http_error_302(self, req, fp, code, msg, headers):
        result = urllib.request.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        result.newurl = result.geturl()
        return result


opener = urllib.request.build_opener(RedirectHandler)
url = "http://www.baidu.com"
response = opener.open(url)
data = response.read().decode()
print(data)
print(response.geturl())
# Create the url to access the api
url = '{0}://{1}{2}'.format(options.protocol, hostname, api_url)

# Create the request
request = urllib.request.Request(url)

# Basic authentication...
credential = base64.b64encode(bytes('%s:%s' % (options.username, options.password), 'ascii'))
request.add_header("Authorization", "Basic %s" % credential.decode('utf-8'))

request.add_header('OCS-APIRequest', 'true')

try:
	with urllib.request.urlopen(request) as response:	
		content = response.read()

except urllib.error.HTTPError as error:      # User is not authorized (401)
    print('UNKNOWN - [WEBREQUEST] {0} {1}'.format(error.code, error.reason))
    sys.exit(3)

except urllib.error.URLError as error:	# Connection has timed out (wrong url / server down)
	print('UNKNOWN - [WEBREQUEST] {0}'.format(str(error.reason).split(']')[0].strip()))
	sys.exit(3)

try:
	# Convert the webrequest response to xml
	xml_root = xml.etree.ElementTree.fromstring(content)
except xml.etree.ElementTree.ParseError:
	print('UNKNOWN - [XML] Content contains no or wrong xml data... check the url and if the api is reachable!')
	sys.exit(3)
Example No. 54
def httpreg(softname, version, phydriverserial, regkey):
    signkey = '&key=0z#z#b#094kls#040jkas892#z#z#b#0'
    data = {}
    data["softname"] = softname
    data["version"] = version
    data["phydriverserial"] = phydriverserial
    data["regkey"] = regkey
    keys = sorted(data)
    src = ""
    for key in keys:
        if len(src):
            src = src + "&"
        src = src + key
        src = src + "="
        src = src + data[key]
    str = src + signkey
    str = str.encode("utf8")
    #phydriverserial=123&regkey=456&softname=cqsc&version=1.0.0.0&key=0z#z#b#094kls#040jkas892#z#z#b#0
    #phydriverserial=123&regkey=456&softname=cqsc&version=1.0.0.0&key=0z#z#b#094kls#040jkas892#z#z#b#0
    m.update(str)
    result = m.hexdigest()
    data = src.encode("utf8")
    url = "http://caiptong.com/share/share_registdeviceid?sign=%s" % (result)
    request = urllib.request.Request(url=url,
                                     data=data,
                                     headers=headers,
                                     method='POST')
    try:
        #response = urllib.request.urlopen(request)
        response = opener.open(request, timeout=5)
        html = response.read().decode()
    except urllib.error.HTTPError as e:
        #print('The server couldn\'t fulfill the request.')
        #print('Error code: ' + str(e.code))
        #print('Error reason: ' + e.reason)
        print("错误", "网络连接错误!")
        return False
    except urllib.error.URLError as e:
        #print('We failed to reach a server.')
        #print('Reason: ' + e.reason)
        print("错误", "网络连接错误!")
        return False
    except Exception as msg:
        print("Exception:%s" % msg)
        return False
    except:
        #print("error lineno:" + str(sys._getframe().f_lineno))
        print("错误", "网络连接错误!")
        return False
    html = html.strip()
    #print(html)
    json_data = json.loads(html)
    #{"msg":"登录成功.","success":true,"datas":{"ckregkey":false,"topics":[],"userid":175}}
    if json_data["success"] != True:
        print("错误", "账号未注册!")
        if "datas" in json_data:
            print("错误", json_data["datas"]["notice"])
        return False
    else:
        if "datas" in json_data:
            datas = json_data["datas"]
            if "ckregkey" in datas:
                if datas["ckregkey"] == True:
                    return True
        return False

with open ("mexica.txt", "r") as ff:
    story = ff.read()
print(story)

list_of_words = nltk.word_tokenize(story)

tagged_words = nltk.pos_tag(list_of_words)

list_adj = [x for x, i in tagged_words if i == "JJ"]
metaphor_dict = {}
for i in list_adj:
    url = "http://bonnat.ucd.ie/jigsaw/index.jsp?q=" + i
    with request.urlopen(url) as response:
        page_source = response.read()
    x = {i: [y for y in re.findall('longvehicle=(.*?)">', str(page_source))]}
    metaphor_dict.update(x)

connectors = ['like', 'as']
list_of_words_2 = nltk.word_tokenize(story)
for i, j in enumerate(list_of_words):
    if j in list_adj:
        if len(metaphor_dict.get(j)) == 0:
            continue
        else:
            y = str(random.choice(connectors) + " " + random.choice(metaphor_dict[str(j)]))
            for z, k in enumerate(list_of_words_2):
                if k == j:
                    list_of_words_2.insert(z+1, y)
                    break
Example No. 56
    def __init__(self):
        self.token = 'unknown'

    def blue_open(self, req):  #<protocole>_open
        print('Blue BlueSchemeHandler')
        url = req.get_full_url()
        scheme, data = url.split(':', 1)

        headers = {}
        newURL = 'https:' + data
        #newURL = urllib.parse.unquote_to_bytes(newURL)
        newReq = urllib.request.Request(newURL)
        fp = urllib.request.urlopen(newReq)

        return urllib.response.addinfourl(fp, headers, url)


myAuthProxy = DatasetAuthProxy()
blueSchemeHandler = BlueSchemeHandler()
opener = urllib.request.build_opener(myAuthProxy, blueSchemeHandler)
urllib.request.install_opener(opener)

# Client side test - with the "blue" protocol / scheme
#response = urllib.request.urlopen('blue://www.lefigaro.fr')
#print('Response:\n')
#print(response.read())

response = urllib.request.urlopen('blue://www.lefigaro.fr')
print('Response:\n')
print(response.read())
def getStockMetricsData(stockDataUrl):
    #stockMetricsData = {}
    stockValueMetricsData = {}
    stockPrice = 0
    filteredStockMetricsData=''
    try:
        req = urllib.request.Request(
            stockDataUrl, data=None,
            headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
            }
        )
        response = urllib.request.urlopen(req, timeout=10)
    except ssl.SSLError as err:
        print('SSLError: Socket Connection timed out with error: ', err)
        return ''
    except urllib.error.HTTPError as e:
        print('HTTPError: The server couldn\'t fulfill the request. Error code: ', e.code)
        return ''
    except urllib.error.URLError as e:
        print('URLError: We failed to reach a server. Reason: ', e.reason)
        return ''
    except:
        print('An error occurred while accessing: ', stockDataUrl)
        return ''

    #html = requests.get(stockDataUrl, timeout=5).read()
    #html = urllib.request.urlopen(stockDataUrl).read()
    html=response.read()
    soup = BeautifulSoup(html, features="lxml")
    data = soup.findAll(text=True)
    result = filter(visible, data)
    items_list=list(result)
    print(items_list)

    #Order of keys: DoubleSpacedKeys1 SingleSpacedKeys2 TripleSpacedPriceKeys3 DoubleSpacedKeys4 NoSpacedKeys5 DoubleSpacedKeys6
    #print(DoubleSpacedKeys1)
    #DoubleSpacedKeys1 = [' Market Cap $: ', ' Enterprise Value $: ', ' Volume: ', ' Avg Vol (1m): ']
    for keyFeature in DoubleSpacedKeys1:
        keyData=find_element_in_list(keyFeature, items_list,2)
        # print(keyFeature.replace(',', '').replace(' ', '').replace('\n', ''), keyData)
        # Add any filtering conditions here
        if (keyData != 0):
            if (keyFeature == DoubleSpacedKeys1[0] and keyData < MIN_MARKET_CAP_MIL):
                return ''
            if (keyFeature == DoubleSpacedKeys1[2] and keyData < MIN_TRADE_VOL):
                return ''
            if (keyFeature == DoubleSpacedKeys1[3] and keyData < MIN_AVG_TRADE_VOL):
                return ''

        filteredStockMetricsData += str(keyData) + ','
        #stockMetricsData[keyFeature.replace(',', '').replace(':', '').replace(' ', '').replace('\n', '')] = keyData

    # SingleSpaced fields: '\nPrice: ', ' $3.76\n',
    #print(SingleSpacedKeys2)
    for keyFeature in SingleSpacedKeys2:
        keyData=find_element_in_list(keyFeature, items_list,1)
        #Extra field for PriceChange to be added manually right before Price field
        if keyFeature == '\nPrice: ':
            # '\nWipro Ltd\n$\n3.76\n', ' -0.04 (-1.05%)\n', ' ', ' ', '\n', ' ', ' Volume: ',
            extraFeature = ' Volume: '
            if (keyData == 0):
                keyData = find_element_in_list(extraFeature, items_list, -6)
            stockPrice = keyData
            priceChangePercent = find_element_in_list(extraFeature, items_list, -5)
            # print('1DayPriceChange %', priceChangePercent)
            filteredStockMetricsData += str(priceChangePercent) + ','
            #stockMetricsData['1DayPriceChangePercent'] = priceChangePercent

        # print(keyFeature.replace(',', '').replace(' ', '').replace('\n', ''), keyData)
        # Add any filtering conditions here
        if (keyData != 0) and (keyFeature == SingleSpacedKeys2[0]) and (keyData < MIN_STOCK_PRICE):
            return ''

        filteredStockMetricsData += str(keyData)+','
        #stockMetricsData[keyFeature.replace(',', '').replace(':', '').replace(' ', '').replace('\n', '')] = keyData

    #print(TripleSpacedPriceKeys3)
    #['\nEarnings Power Value\n', '\nNet Current Asset Value\n', '\nTangible Book\n','\nMedian P/S Value\n',
    # '\nGraham Number\n','\nPeter Lynch Value\n','\nDCF (Earnings Based)\n','\nDCF (FCF Based)\n','\nProjected FCF\n']
    for keyFeature in TripleSpacedPriceKeys3:
        keyData=find_element_in_list(keyFeature, items_list,3)
        stockValueMetricsData[keyFeature] = keyData
        # print(keyFeature.replace(',', '').replace(' ', '').replace('\n', ''), keyData)
        # Add any filtering conditions here
        #filteredStockMetricsData += str(keyData) + ','
        #stockMetricsData[keyFeature.replace(',', '').replace(':', '').replace(' ', '').replace('\n', '')] = keyData

    #print(DoubleSpacedKeys4)
    #['\nPB Ratio\n', '\nPrice-to-Tangible-Book\n', '\nPS Ratio\n', '\nPrice-to-Median-PS-Value\n', '\nPrice-to-Graham-Number\n', '\nPrice-to-Peter-Lynch-Fair-Value\n',
    # 6: '\nPrice-to-Intrinsic-Value-DCF (Earnings Based)\n','\nPrice-to-Intrinsic-Value-DCF (FCF Based)\n','\nPrice-to-Intrinsic-Value-Projected-FCF\n','\nPrice-to-Earnings-Power-Value\n',
    # 10: '\nPEG Ratio\n', '\nCurrent Ratio\n', '\nQuick Ratio\n', '\nCash-To-Debt\n', '\nEquity-to-Asset\n',
    # 15: '\nDebt-to-Equity\n', '\nDebt-to-EBITDA\n','\nOperating Margin %\n', '\nNet Margin %\n', '\nROE %\n', '\nROA %\n']
    # When a ratio is missing, derive it as price / value from a previously extracted metric
    ratioFallbacks = {
        DoubleSpacedKeys4[1]: TripleSpacedPriceKeys3[2],
        DoubleSpacedKeys4[3]: TripleSpacedPriceKeys3[3],
        DoubleSpacedKeys4[4]: TripleSpacedPriceKeys3[4],
        DoubleSpacedKeys4[5]: TripleSpacedPriceKeys3[5],
        DoubleSpacedKeys4[6]: TripleSpacedPriceKeys3[6],
        DoubleSpacedKeys4[7]: TripleSpacedPriceKeys3[7],
        DoubleSpacedKeys4[8]: TripleSpacedPriceKeys3[8],
        DoubleSpacedKeys4[9]: TripleSpacedPriceKeys3[0],
    }
    for keyFeature in DoubleSpacedKeys4:
        keyData = find_element_in_list(keyFeature, items_list, 2)
        #print(keyFeature.replace(',', '').replace(' ', '').replace('\n', ''), keyData)

        # Use previously extracted data as a fallback when one of the metrics is not available
        if keyData == 0 and keyFeature in ratioFallbacks:
            fallbackValue = stockValueMetricsData[ratioFallbacks[keyFeature]]
            if fallbackValue != 0:
                keyData = stockPrice / fallbackValue

        # Add any filtering conditions here
        if (keyData != 0):
            if (keyFeature == DoubleSpacedKeys4[0]) and (keyData < MIN_PB_RATIO or keyData > MAX_PB_RATIO):
                return ''
            # Not reliable, so disabling
            if (keyFeature == DoubleSpacedKeys4[1]) and (keyData < MIN_PRICE_TO_TANGIBLE_BOOK_RATIO or keyData > MAX_PRICE_TO_TANGIBLE_BOOK_RATIO):
                return ''
            if (keyFeature == DoubleSpacedKeys4[2]) and (keyData < MIN_PS_RATIO or keyData > MAX_PS_RATIO):
                return ''
            if (keyFeature == DoubleSpacedKeys4[3]) and (keyData < MIN_PRICE_TO_MEDIAN_PS_VALUE or keyData > MAX_PRICE_TO_MEDIAN_PS_VALUE):
                return ''
            if (keyFeature == DoubleSpacedKeys4[4]) and (keyData < MIN_PRICE_TO_GRAHAM_NUMBER or keyData > MAX_PRICE_TO_GRAHAM_NUMBER):
                return ''
            if (keyFeature == DoubleSpacedKeys4[5]) and (keyData < MIN_PRICE_TO_PETER_LYNCH_VALUE or keyData > MAX_PRICE_TO_PETER_LYNCH_VALUE):
                return ''
            if (keyFeature == DoubleSpacedKeys4[6]) and (keyData < MIN_PRICE_TO_INTRINSIC_VALUE_EARNINGS_DCF or keyData > MAX_PRICE_TO_INTRINSIC_VALUE_EARNINGS_DCF):
                return ''
            if (keyFeature == DoubleSpacedKeys4[7]) and (keyData < MIN_PRICE_TO_INTRINSIC_VALUE_FCF_DCF or keyData > MAX_PRICE_TO_INTRINSIC_VALUE_FCF_DCF):
                return ''
            if (keyFeature == DoubleSpacedKeys4[8]) and (keyData < MIN_PRICE_TO_PROJECTED_FCF_VALUE or keyData > MAX_PRICE_TO_PROJECTED_FCF_VALUE):
                return ''
            if (keyFeature == DoubleSpacedKeys4[9]) and (keyData < MIN_PRICE_TO_EARNINGS_POWER_VALUE or keyData > MAX_PRICE_TO_EARNINGS_POWER_VALUE):
                return ''
            if (keyFeature == DoubleSpacedKeys4[10]) and (keyData > MAX_PEG_RATIO):
                return ''
            if (keyFeature == DoubleSpacedKeys4[11]) and (keyData < MIN_CURRENT_RATIO):
                return ''
            if (keyFeature == DoubleSpacedKeys4[12]) and (keyData < MIN_QUICK_RATIO):
                return ''
            if (keyFeature == DoubleSpacedKeys4[13]) and (keyData < MIN_CASH_TO_DEBT):
                return ''
            if (keyFeature == DoubleSpacedKeys4[15]) and (keyData > MAX_DEBT_TO_EQUITY):
                return ''
            if (keyFeature == DoubleSpacedKeys4[16]) and (keyData > MAX_DEBT_TO_EBITDA):
                return ''
            if (keyFeature == DoubleSpacedKeys4[17]) and (keyData < MIN_OPERATING_MARGIN_PERCENT):
                return ''
            if (keyFeature == DoubleSpacedKeys4[18]) and (keyData < MIN_NET_MARGIN_PERCENT):
                return ''
            if (keyFeature == DoubleSpacedKeys4[19]) and (keyData < MIN_ROE_PERCENT):
                return ''
            if (keyFeature == DoubleSpacedKeys4[20]) and (keyData < MIN_ROA_PERCENT):
                return ''

        filteredStockMetricsData += str(keyData)+','
        #stockMetricsData[keyFeature.replace(',', '').replace(':', '').replace(' ', '').replace('\n', '')] = keyData

    #print(QudrapleSpacedPriceKeys5)
    # Example: '\nWACC vs ROIC\n', ' ', ' ', ' ', '\nROIC 28.08%\n', '\nWACC 2.85%\n',
    for keyFeature in QudrapleSpacedPriceKeys5:
        keyData=find_element_in_list(keyFeature, items_list,4)
        # print('ROIC', keyData)
        # Add any filtering conditions here
        filteredStockMetricsData += str(keyData) + ','
        #stockMetricsData['ROIC'] = keyData

        keyData = find_element_in_list(keyFeature, items_list, 5)
        # print('WACC', keyData)
        #Add any filtering conditions here
        filteredStockMetricsData += str(keyData) + ','
        #stockMetricsData['WACC'] = keyData

    #print(DoubleSpacedKeys6)
    # 0: ['\nShiller PE Ratio\n','\nPE Ratio\n','\nForward PE Ratio\n','\nPE Ratio without NRI\n','\nEV-to-EBIT\n','\nEV-to-EBITDA\n','\nEV-to-Revenue\n',
    # 7:  '\n3-Year Revenue Growth Rate\n','\n3-Year EBITDA Growth Rate\n','\n3-Year EPS without NRI Growth Rate\n',
    # 10: '\nPrice-to-Owner-Earnings\n','\nPrice-to-Free-Cash-Flow\n','\nPrice-to-Operating-Cash-Flow\n',
    # 13: '\nPiotroski F-Score\n', '\nAltman Z-Score\n', '\nBeneish M-Score\n', '\nFinancial Strength\n','\nProfitability Rank\n', '\nValuation Rank\n']
    for keyFeature in DoubleSpacedKeys6:
        keyData=find_element_in_list(keyFeature, items_list,2)
        # print(keyFeature.replace(',', '').replace(' ', '').replace('\n', ''), keyData)
        # Add any filtering conditions here
        if keyData != 0:
            if (keyFeature == DoubleSpacedKeys6[0]) and (keyData > MAX_SHILLER_PE_RATIO):
                return ''
            if (keyFeature == DoubleSpacedKeys6[1]) and (keyData > MAX_PE_RATIO):
                return ''
            if (keyFeature == DoubleSpacedKeys6[2]) and (keyData > MAX_FORWARD_PE_RATIO):
                return ''
            if (keyFeature == DoubleSpacedKeys6[4]) and (keyData < MIN_EV_to_EBIT or keyData > MAX_EV_to_EBIT) :
                return ''
            if (keyFeature == DoubleSpacedKeys6[7]) and (keyData < MIN_3YR_REV_GROWTH_RATE):
                return ''
            if (keyFeature == DoubleSpacedKeys6[10]) and (keyData > MAX_PRICE_TO_OWNER_EARNINGS):
                return ''
            if (keyFeature == DoubleSpacedKeys6[11]) and (keyData > MAX_PRICE_TO_FREE_CASH_FLOW):
                return ''
            if (keyFeature == DoubleSpacedKeys6[12]) and (keyData > MAX_PRICE_TO_OPERATING_CASH_FLOW):
                return ''

        filteredStockMetricsData += str(keyData) + ','
        #stockMetricsData[keyFeature.replace(',', '').replace(':', '').replace(' ', '').replace('\n', '')] = keyData

    return filteredStockMetricsData
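# find_element_in_list and the *Keys* lists are defined elsewhere in this collection.
# Below is a minimal, hypothetical sketch of the lookup behaviour the function above relies
# on, inferred only from how the helper is called (not the original implementation): find
# the key text in the scraped token list and return the token `offset` positions away parsed
# as a number, or 0 when the key or a usable value is missing. The original presumably also
# handles labelled tokens such as '\nROIC 28.08%\n'; this sketch only parses plain numbers.
def find_element_in_list_sketch(element, items_list, offset):
    try:
        index = items_list.index(element)
        if index + offset < 0:
            return 0
        raw = str(items_list[index + offset])
        # strip thousands separators, currency/percent signs and whitespace before parsing
        cleaned = raw.replace(',', '').replace('$', '').replace('%', '').strip()
        return float(cleaned)
    except (ValueError, IndexError):
        return 0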
Exemplo n.º 58
0
Used for handling the robots.txt file
'''
import urllib.request, urllib.response, urllib.parse, urllib.error, urllib.robotparser
import csv
import codecs
import json

if __name__ == "__main__":
    print("urllib example: crawling Douban data")
    print("Search keyword: Python")

    url = "https://api.douban.com/v2/book/search?q=python"
    response = urllib.request.urlopen(url)

    # decode the bytes stream into a string
    ebook_str = response.read().decode()

    # parse the JSON string into a dict (json.loads instead of eval)
    ebook_dict = json.loads(ebook_str)

    # print(ebook_dict)
    # print(type(ebook_dict))
    count = ebook_dict["count"]
    total = ebook_dict["total"]

    with codecs.open('books.csv', 'w', 'utf-8') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow(["Title", "Author", "Description", "Publisher", "Price"])
        # write each book's info
        for book in ebook_dict["books"]:
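            # The original listing is cut off here; this is a minimal sketch of the loop body.
            # The field names below are assumptions about the Douban v2 book API, not taken
            # from the original code.
            spamwriter.writerow([
                book.get("title", ""),
                " / ".join(book.get("author", [])),
                book.get("summary", ""),
                book.get("publisher", ""),
                book.get("price", ""),
            ])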
Exemplo n.º 59
0
import urllib.request
import urllib.response

url = 'http://www.yahoo.com'

request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
the_page = response.read()
print(the_page.decode())


#shortened version...
the_page = urllib.request.urlopen('http://www.yahoo.com').read()


from urllib.parse import urlparse
result = urlparse('https://docs.python.org/3/library/index.html')
print(result)
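# urlparse returns a ParseResult named tuple; its standard attributes can be read directly:
print(result.scheme)   # 'https'
print(result.netloc)   # 'docs.python.org'
print(result.path)     # '/3/library/index.html'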
Exemplo n.º 60
0
    def __init__(self, code):
        """
        Constructor.
        :param code: the authorization code returned by QQ's OAuth redirect
        """
        self.__response = {}
        try:
            # exchange the code for an access_token
            response = urllib.request.urlopen(
                'https://graph.qq.com/oauth2.0/token?' +
                    'grant_type=authorization_code' +
                    '&client_id=' + qq_param['client_id'] +
                    '&client_secret=' + qq_param['client_secret'] +
                    '&code=' + code +
                    '&redirect_uri=' + qq_param['callback']
            )
            # extract the access_token
            access_token = response.read().decode('utf-8').split('&')[0].split('=')[1]
            # use the access_token to get the user's openid
            response = urllib.request.urlopen(
                'https://graph.qq.com/oauth2.0/me?' +
                    'access_token=' + access_token
            )

            # decode the JSONP response into a Python object
            json_obj = json.loads(response.read().decode('utf-8').replace('callback( ', '').replace(' );', ''))
            open_id = json_obj['openid']

            # use the openid to fetch the user's profile info
            response = urllib.request.urlopen(
                'https://graph.qq.com/user/get_user_info?' +
                    'access_token=' + access_token +
                    '&oauth_consumer_key=' + qq_param['client_id'] +
                    '&openid=' + open_id
            )
            # decode the response into a Python object
            user_info = json.loads(response.read().decode('utf-8'))

            # use the nickname in user_info to check whether login succeeded
            if user_info['nickname']:
                self.__response['success'] = True
                # query the database to see whether this user already exists
                if KUser.objects.filter(user_type='qq', uid=open_id).exists():
                    # if so, fetch the existing user
                    k_user = KUser.objects.get(user_type='qq', uid=open_id)
                else:
                    # otherwise, save the new user to the database
                    k_user = KUser(
                        user_type='qq',
                        uid=open_id,
                        nickname=user_info['nickname'],
                        avatar=user_info['figureurl_qq_1'],
                        is_admin=False
                    )
                    k_user.save()
                self.__response['user_info'] = {
                    'user_type': k_user.user_type,
                    'uid': k_user.uid,
                    'nickname': k_user.nickname,
                    'avatar': k_user.avatar,
                    'is_admin': k_user.is_admin,
                    'pk': k_user.pk
                }
            else:
                self.__response['success'] = False
        except Exception:
            self.__response['success'] = False
            return