Example #1
def you_get(url, print_info, extra_args):
    try:
        command = ['you-get', '-u']
        if print_info:
            command.append('-i')
        if extra_args:
            command.append(extra_args)
        command.append(url)
        process = subprocess.Popen(command, stdout=subprocess.PIPE)
        try:
            output = process.communicate()[0]
            # chardet returns {'encoding': None, ...} when it cannot detect an encoding,
            # so fall back to utf-8 explicitly (a dict.get default would never apply here)
            output = output.decode(chardet.detect(output)['encoding'] or 'utf-8', 'replace')
        except KeyboardInterrupt:
            process.terminate()
            return '', []
        if print_info:
            print(output)
            return '', []
        name_match = re.compile(r'title:\s*(.*?)(\r|\n)').search(output)
        name = name_match.group(1) if name_match else 'Unknown'
        url_re = re.compile(r'(http.*?)(\r|\n)')
        url_match = url_re.search(output)
        video_url = []
        while url_match:
            video_url.append(url_match.group(1))
            url_match = url_re.search(output, url_match.end(0))
        return name, video_url
    except Exception as e:
        logger.error('parse video failed {}'.format(e))
        return '', []
Example #2
def get_content_ip(url, proxy, data=None):
    """使用代理IP获取网页中的html代码"""
    header = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    }

    # Create a ProxyHandler for the given proxy
    proxy_support = request.ProxyHandler(proxy)
    # Build an opener that uses the proxy
    opener = request.build_opener(proxy_support)
    # Install the opener globally
    request.install_opener(opener)
    # Random timeout (seconds)
    timeout = random.choice(range(80, 180))
    while True:
        try:
            response = request.urlopen(url, timeout=timeout)
            html = response.read()
            c = chardet.detect(html)
            #             print(c)
            print(response.status)
            #             html = html.decode(c["encoding"])
            break
        # Request errors: wait a bit and retry
        except error.HTTPError as e:
            print('HTTPError: NoResource', e)
            time.sleep(random.choice(range(10, 30)))
        except error.URLError as e:
            print('URLError: NoSiteExist', e)
            time.sleep(random.choice(range(10, 30)))

    return html
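Note that the decode line above is commented out, so get_content_ip returns raw bytes even though chardet has already been run. A caller could apply the detection itself, roughly like this (an illustrative sketch; fetch_text_via_proxy is a name introduced here, not part of the original example):

# Hypothetical wrapper around get_content_ip: decode with the detected encoding,
# falling back to utf-8 when chardet cannot determine one.
def fetch_text_via_proxy(url, proxy):
    raw = get_content_ip(url, proxy)
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    return raw.decode(encoding, errors='replace')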
Example #3
def check_public_ip_info():
    url = 'https://checkip.amazonaws.com'
    try:
        opener = util.http_opener()
        response = opener.open(url, timeout=2000)

        # Raw response bytes
        resp_bytes = response.read()
        # print("A\n")
        # print(resp_bytes)

        charset = chardet.detect(resp_bytes)['encoding']
        # print("B\n")
        # print(charset)

        # Decode to a string
        # html = resp_bytes.decode(charset)
        content = str(resp_bytes, charset)
        if content is not None:
            _content = content.replace('\n', '')
            if is_ipv4_address(_content):
                return _content

    except Exception as e:
        print(e)
        logger.log(e)
Example #4
def get_content(url, data=None):
    """获取网页中的html代码"""
    header = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    }

    # Random timeout (seconds)
    timeout = random.choice(range(80, 180))
    while True:
        try:
            req = request.Request(url, headers=header)
            response = request.urlopen(req, timeout=timeout)
            html = response.read()
            c = chardet.detect(html)
            #             print(c)
            print(response.status)
            html = html.decode(c["encoding"])
            break
        # Request errors: wait a bit and retry
        except error.HTTPError as e:
            print('HTTPError: NoResource', e)
            time.sleep(random.choice(range(10, 30)))
        except error.URLError as e:
            print('URLError: NoSiteExist', e)
            time.sleep(random.choice(range(10, 30)))

    return html
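The while True loop above retries forever on request errors. If an upper bound is wanted, the same fetch-and-decode pattern can be capped; a sketch reusing the url, header and timeout names from the example above (max_tries is introduced here for illustration only):

# Bounded-retry variant of the loop above (illustrative sketch).
max_tries = 5
for attempt in range(max_tries):
    try:
        req = request.Request(url, headers=header)
        raw = request.urlopen(req, timeout=timeout).read()
        html = raw.decode(chardet.detect(raw)['encoding'] or 'utf-8')
        break
    except (error.HTTPError, error.URLError) as e:
        print('request failed (%d/%d):' % (attempt + 1, max_tries), e)
        time.sleep(random.choice(range(10, 30)))
else:
    raise RuntimeError('giving up after %d attempts' % max_tries)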
Example #5
def get_public_ip_info():
    """
    Query ip138 for the public IP address and its location info

    :return:
    """
    url = 'http://2019.ip138.com/ic.asp'

    try:
        opener = util.http_opener()
        response = opener.open(url, timeout=2000)

        # Raw response bytes
        resp_bytes = response.read()

        charset = chardet.detect(resp_bytes)['encoding']

        # Decode to a string
        html = resp_bytes.decode(charset)

        if 'html' in response.getheader('Content-Type'):
            info_reg = r'(?<=<center>).*?(?=</center>)'
            info_all_match = re.findall(info_reg, html, re.S | re.M)
            info = info_all_match[0]
            # logger.log(info)

            ip_reg = r'(?<=\[).*?(?=\])'
            ip_matched = re.findall(ip_reg, info, re.S | re.M)

            result = {'ip': ip_matched[0], 'info': info}
            return result

    except Exception as e:
        print(e)
        logger.log(e)
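Servers usually declare a charset in the Content-Type header; a common refinement is to prefer that value and only fall back to chardet when it is missing. A sketch using the response and resp_bytes names from the example above:

# Prefer the charset declared by the server; fall back to chardet detection.
charset = response.headers.get_content_charset()
if not charset:
    charset = chardet.detect(resp_bytes)['encoding'] or 'utf-8'
html = resp_bytes.decode(charset, errors='replace')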
Example #6
def getPlayerInfo(playerid):
    html = getHtml('www.csgola.com', '/player/' + playerid)
    q = pyq(html)
    avatar = q('img.avatar.center-block.img-responsive').attr('src')
    playername = q('.personaname').text()
    statTit = q('.col-md-10 .title').text().encode('utf-8')
    statVal = q('.col-md-10 .datala')
    chartVal = q('.polar-detail .datala')
    statiscsName = q('.list-group .list-group-item span.stats-title')
    staticsData = q('.list-group .list-group-item span.stats-count.pull-right')
    print(pyq(statiscsName[0]).text())
    print(chardet.detect(pyq(statiscsName[0]).text().encode('utf-8')))
    json = {
        'error': 0,
        'playerinfo': {
            'avatar': avatar,
            'name': playername,
        },
        'stats': {
            'jishashu':       pyq(statVal[0]).text().encode('utf-8'),
            'baotoulv':       pyq(statVal[1]).text().encode('utf-8'),
            'kd':             pyq(statVal[2]).text().encode('utf-8'),
            'shenglv':        pyq(statVal[3]).text().encode('utf-8'),
            'zhengwangshu':   pyq(statVal[4]).text().encode('utf-8'),
            'mingzhonglv':    pyq(statVal[5]).text().encode('utf-8'),
            'juanzengwuqi':   pyq(statVal[6]).text().encode('utf-8'),
            'mvpcishu':       pyq(statVal[7]).text().encode('utf-8'),
        },
        'chart': {
            'zonghe':          pyq(chartVal[0]).text().encode('utf-8'),
            'kd':              pyq(chartVal[1]).text().encode('utf-8'),
            'mingzhonglv':     pyq(chartVal[2]).text().encode('utf-8'),
            'baotoulv':        pyq(chartVal[3]).text().encode('utf-8'),
            'shenglv':         pyq(chartVal[4]).text().encode('utf-8'),
        },
        'staData': {
            # Map each stat title to its value (19 rows, as in the original literal)
            pyq(statiscsName[i]).text(): pyq(staticsData[i]).text()
            for i in range(19)
        }
    }
    return json
Example #7
def getProvince(mainUrl):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    req = urllib2.Request(mainUrl)
    resp = urllib2.urlopen(req)
    respHtml = resp.read()
    char_type = chardet.detect(respHtml)
    #
    print char_type
    respHtml = unicode(respHtml, "GBK").encode("utf8")
    # pattern = re.compile(u'<ul class="interval01-list">')
    # results=respHtml[respHtml.rfind('<ul class="interval01-list">') +1 :respHtml.rfind('<!--有参数配置 start-->')]
    print respHtml

print getProvince("https://car.autohome.com.cn/price/brand-25.html")
Example #8
def get_public_ip_info_2():
    url = 'http://ip.chinaz.com'
    try:
        opener = util.http_opener()
        response = opener.open(url, timeout=2000)

        # Raw response bytes
        resp_bytes = response.read()
        # print("A\n")
        # print(resp_bytes)

        charset = chardet.detect(resp_bytes)['encoding']
        # print("B\n")
        # print(charset)

        # Decode to a string
        html = resp_bytes.decode(charset)
        # print(html)

        if 'html' in response.getheader('Content-Type'):
            # Parse the HTML
            # re_comp = re.compile('(?<=<dl class="IpMRig-tit">).*?(?=</dl>)')
            # all_match = re_compfindall(html)

            info_reg = r'(?<=<dl class="IpMRig-tit">).*?(?=</dl>)'
            info_all_match = re.findall(info_reg, html, re.S | re.M)
            info = info_all_match[0]
            # logger.log(info)

            ip_reg = r'(?<=<dd class="fz24">).*?(?=</dd>)'
            ip_matched = re.findall(ip_reg, info, re.S | re.M)

            result = {'ip': ip_matched[0], 'info': info}
            return result

    except Exception as e:
        print(e)
        logger.log(e)
Example #9
    def craw(self):
        start_time = time.time()

        domain = self.data.get_domain(self.domainpath)
        if domain is None:
            self.filelogs.writeLogs('domain is None')
            exit()

        self._domain_array(domain)

        if self.domain_id and self.domain_id is not None:
            self.filelogs.writeFile(self.domainpath, self.domain_id)

        self.urls.add_new_url(self.domain)
        self.filelogs.writeLogs('采集开始  ID:%d' % (self.domain_id))

        #print domain
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()

                # Complete the URL with its http prefix
                new_full_url = self.parser.get_http_url(new_url)

                #new_full_url = 'http://www.lingdiankanshu.com/html/1/1966/'
                #new_full_url = 'http://www.wn001.com/book/2284/'
                #new_full_url = 'http://www.zhukeshu.com'

                self.filelogs.writeLogs('下载页面开始')
                html_cont = self.downloader.download(new_full_url)
                self.filelogs.writeLogs('下载页面结束')

                #html_cont = self.downloader.download('http://www.wn001.com')
                if chardet.detect(
                        html_cont)['encoding'] == 'GB2312' or chardet.detect(
                            html_cont)['encoding'] == 'gb2312':
                    iconv_type = 'gbk'
                else:
                    iconv_type = chardet.detect(html_cont)['encoding']
                #print chardet.detect(html_cont)
                #str = unicode(html_cont,'gbk')
                html_cont_coding = html_cont.decode(iconv_type).encode('utf-8')

                # If the URL is a novel index URL, crawl the novel and save it to the database
                book_preg = urlparse.urljoin(new_full_url, self.book_regular)

                # Is this a novel index URL?
                if self.parser.is_book_link(new_full_url, book_preg):
                    # Get the book_id
                    book_id = self.parser.get_book_id(new_full_url, book_preg)
                    book_id = int(book_id)
                    # Check whether this book_id has already been crawled
                    if self.data.is_collect(
                            book_id, self.domain_id) is False:  # not crawled yet
                        # Get the novel title
                        book_name = self.parser.get_book_name(
                            html_cont_coding, self.bookname_regular)

                        # Get the author name
                        author = self.parser.get_author(
                            html_cont_coding, self.author_regular)

                        # Get the synopsis
                        descript = self.parser.get_descript(
                            html_cont_coding, self.descript_regular)

                        if book_name is not None and author is not None and descript is not None:
                            # Save the novel info to the database
                            self.data.save_book(self.domain_id, book_id,
                                                book_name, author, descript)

                self.filelogs.writeLogs('解析内容')
                new_urls = self.parser.parse(
                    new_url, html_cont_coding,
                    self.parser.get_http_url(self.domain))

                self.urls.add_new_urls(new_urls)
                #                 self.filelogs.writeLogs(str(count))
                #                 if count == 50:
                #                     break
                #                 count = count + 1
                #exit()
                if (time.time() - start_time) > self.outtime:
                    self.filelogs.writeLogs('采集结束')
                    break
            except Exception as e:
                self.filelogs.writeLogs('Failed: %s' % e)
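The encoding check in the snippet above runs chardet.detect on the same bytes up to three times; since detection scans the whole byte string, the usual pattern is to detect once and branch on the cached result. An equivalent sketch:

# Detect once, then reuse the result (same logic as the if/else above).
encoding = chardet.detect(html_cont)['encoding']
if encoding in ('GB2312', 'gb2312'):
    iconv_type = 'gbk'   # GB2312 pages are generally safer to decode as the GBK superset
else:
    iconv_type = encoding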
Example #10
from datetime import datetime

header = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
}

url = "http://www.weather.com.cn/weather/101020100.shtml"
# Random timeout (seconds)
timeout = random.choice(range(80, 180))
while True:
    try:
        req = request.Request(url, headers=header)
        response = request.urlopen(req, timeout=timeout)
        html = response.read()
        c = chardet.detect(html)
        #             print(c)
        print("Status:", response.status)
        html = html.decode(c["encoding"])
        break
    # Request errors: wait a bit and retry
    except error.HTTPError as e:
        print('HTTPError: NoResource', e)
        time.sleep(random.choice(range(10, 30)))
    except error.URLError as e:
        print('URLError: NoSiteExist', e)
        time.sleep(random.choice(range(10, 30)))

final_info = []

# Create a bs4 (BeautifulSoup) instance
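The snippet is cut off after this comment; what typically follows is parsing the decoded page with BeautifulSoup. A sketch of the likely continuation, not part of the original code:

# Likely continuation: parse the decoded HTML with BeautifulSoup (bs4 assumed installed).
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
# ...extract the forecast items into final_info here...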
Example #11
    def craw(self):
        start_time = time.time()
        
        domain = self.data.get_domain(self.domainpath)
        if domain is None:
            self.filelogs.writeLogs('domain is None')
            exit()
        
        self._domain_array(domain)
        
        if self.domain_id and self.domain_id is not None:
            self.filelogs.writeFile(self.domainpath, self.domain_id)
        
        self.urls.add_new_url(self.domain)
        self.filelogs.writeLogs('采集开始  ID:%d'%(self.domain_id))
        
        #print domain
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                
                # Complete the URL with its http prefix
                new_full_url = self.parser.get_http_url(new_url)
                
                #new_full_url = 'http://www.lingdiankanshu.com/html/1/1966/'
                #new_full_url = 'http://www.wn001.com/book/2284/'
                #new_full_url = 'http://www.zhukeshu.com'
                
                self.filelogs.writeLogs('下载页面开始')
                html_cont = self.downloader.download(new_full_url)
                self.filelogs.writeLogs('下载页面结束')
                
                #html_cont = self.downloader.download('http://www.wn001.com')
                if chardet.detect(html_cont)['encoding'] == 'GB2312' or chardet.detect(html_cont)['encoding'] == 'gb2312':
                    iconv_type = 'gbk'
                else:
                    iconv_type = chardet.detect(html_cont)['encoding']
                #print chardet.detect(html_cont)
                #str = unicode(html_cont,'gbk')
                html_cont_coding =  html_cont.decode(iconv_type).encode('utf-8')
                 
                # If the URL is a novel index URL, crawl the novel and save it to the database
                book_preg = urlparse.urljoin(new_full_url, self.book_regular)
                 
                # Is this a novel index URL?
                if self.parser.is_book_link(new_full_url, book_preg):
                    # Get the book_id
                    book_id = self.parser.get_book_id(new_full_url, book_preg)
                    book_id = int(book_id)
                    # Check whether this book_id has already been crawled
                    if self.data.is_collect(book_id, self.domain_id) is False:  # not crawled yet
                        # Get the novel title
                        book_name = self.parser.get_book_name(html_cont_coding, self.bookname_regular)
                         
                        # Get the author name
                        author = self.parser.get_author(html_cont_coding, self.author_regular)
                         
                        # Get the synopsis
                        descript = self.parser.get_descript(html_cont_coding, self.descript_regular)
                         
                        if book_name is not None and author is not None and descript is not None:
                            # Save the novel info to the database
                            self.data.save_book(self.domain_id, book_id, book_name, author, descript)
                    
                    
                self.filelogs.writeLogs('解析内容')
                new_urls = self.parser.parse(new_url, html_cont_coding, self.parser.get_http_url(self.domain))  
                 
                self.urls.add_new_urls(new_urls)
#                 self.filelogs.writeLogs(str(count))
#                 if count == 50:
#                     break
#                 count = count + 1
                #exit()
                if (time.time() - start_time) > self.outtime:
                    self.filelogs.writeLogs('采集结束')
                    break
            except Exception as e:
                self.filelogs.writeLogs('Failed: %s' % e)
Example #12
# Common third-party module: chardet
# (imported here via pip's vendored copy; a regular install would just use: import chardet)
from pip._vendor.requests.packages import chardet

result = chardet.detect(b'Hello, world!')
print(result)

# gbk
data = '离离原上草,一岁一枯荣'.encode('gbk')
result = chardet.detect(data)
print(result)

#utf-8
data = '离离原上草,一岁一枯荣'.encode('utf-8')
result = chardet.detect(data)
print(result)

# Detect Japanese text
data = '最新の主要ニュース'.encode('euc-jp')
result = chardet.detect(data)
print('日文:%s' % result)
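chardet.detect also reports a confidence score (and, in chardet 3.x and later, a language guess); the score can be used to decide whether to trust the detection before decoding. A small sketch (the 0.5 threshold is an arbitrary choice for illustration):

# Use the confidence score before trusting the detected encoding.
data = '离离原上草,一岁一枯荣'.encode('gbk')
result = chardet.detect(data)
encoding = result['encoding'] if result['encoding'] and result['confidence'] > 0.5 else 'utf-8'
print(data.decode(encoding, errors='replace'))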
Example #13
print(qiuhe(100))

# with request.urlopen('https://api.douban.com/v2/book/2129650') as f:
#     data = f.read()
#     print("Status:", f.status, f.reason)
#     for k, v in f.getheaders():
#         print('%s:%s' % (k, v))
#     d = json.loads(data.decode('utf-8'))
#     for p, s in d.items():
#         print('%s:%s' % (p, s))

# r = requests.get('https://www.douban.com/')
# print(r.text)


d = chardet.detect(b'hello world !')

print(d)

data = '哈哈'.encode('gbk')
print(type(data))
d = chardet.detect(data)

print(d)

s = data.decode('gbk')
print(type(s))
print(s)

print(psutil.cpu_times())
Example #14

r3 = requests.get('https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20weather.forecast%20where%20woeid%20%3D%202151330&format=json')
print(r3.json())
print(r3.json()['query']['created'])

# POST
# r4 = requests.post('https://accounts.douban.com/login', data={'form_email': '*****@*****.**', 'form_password': '******'})
# Passing JSON data
# params = {'key': 'value'}
# r = requests.post(url, json=params) # automatically serialized to JSON internally

# Uploading a file via the files parameter
# upload_files = {'file': open('report.xls', 'rb')}
# r = requests.post(url, files=upload_files)

# To send cookies with a request, just pass a dict via the cookies parameter:
# cs = {'token': '12345', 'status': 'working'}
# r = requests.get(url, cookies=cs)

# To set a timeout, pass the timeout parameter in seconds:
# r = requests.get(url, timeout=2.5) # times out after 2.5 seconds


# The third-party chardet library comes in handy here: use it to detect the encoding
print(chardet.detect(b'Hello, world!'))

data = '离离原上草,一岁一枯荣'.encode('utf-8')
print(data)
print(chardet.detect(data))
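When using requests (as in the commented-out snippets above), calling chardet directly is often unnecessary: Response.apparent_encoding exposes the same kind of detection, and assigning it to r.encoding makes r.text decode with it. A brief sketch:

# requests exposes charset detection as Response.apparent_encoding.
r = requests.get('https://www.douban.com/')
r.encoding = r.apparent_encoding   # decode r.text with the detected encoding
print(r.text[:200])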
Example #15
 def craw(self):
     start_time = time.time()
     
     domain = self.data.get_domain(self.domainpath)
     if domain is None:
         self.filelogs.writeLogs('domain is None')
         exit()
     
     self._domain_array(domain)
     
     if self.domain_id and self.domain_id is not None:
         self.filelogs.writeFile(self.domainpath, self.domain_id)
     
     self.filelogs.writeLogs('采集开始  ID:%d'%(self.domain_id))
     
     # Count consecutive failures
     fail_count = 0
     
     #print domain
     while (time.time() - start_time) <= self.outtime:
         new_url = self.book_regular
         
         time.sleep(5)
         
         # Complete the URL with its http prefix
         domian_host = self.parser.get_http_url(self.domain)
         
         self.start_id = self.data.get_book_id(self.domain_id, self.start_id)
         
         # Assemble the URL for this book_id
         full_url = self.parser.fill_url_book_id(new_url, self.start_id, self.book_mark_id)
         
         if full_url and full_url != '':
             self.filelogs.writeLogs('下载页面开始')
             html_cont = self.downloader.download(full_url)
             self.filelogs.writeLogs('下载页面结束')
             
             if html_cont is None:
                 fail_count = fail_count + 1
             else:
         
                 if chardet.detect(html_cont)['encoding'] == 'GB2312' or chardet.detect(html_cont)['encoding'] == 'gb2312':
                     iconv_type = 'gbk'
                 else:
                     iconv_type = chardet.detect(html_cont)['encoding']
            
                 html_cont_coding =  html_cont.decode(iconv_type,'ignore').encode('utf-8')
              
              
                 # Get the book_id
                 book_id = self.start_id
                 book_id = int(book_id)
                 # Get the novel title
                 book_name = self.parser.get_book_name(html_cont_coding, self.bookname_regular)
                  
                 # Get the author name
                 author = self.parser.get_author(html_cont_coding, self.author_regular)
                  
                 # Get the synopsis
                 descript = self.parser.get_descript(html_cont_coding, self.descript_regular)
                 # Strip leading whitespace from the synopsis
                 if descript is not None:
                     descript = self.parser.del_space(descript)
                  
                 if book_name is not None and author is not None:
                     fail_count = 0
                     # Save the novel info to the database
                     self.data.save_book(self.domain_id, book_id, book_name, author, descript)
                 else:
                     fail_count = fail_count + 1
             
             # Stop crawling after 50 consecutive failures
             if fail_count == 50:
                 self.filelogs.writeLogs('失败次数:'+str(fail_count))
                 break
             
         if (time.time() - start_time) > self.outtime:
             break
         
     self.filelogs.writeLogs('采集结束')