def verify_http(self):
    # Collect "host:port" entries from the port scan plus every domain/IP,
    # then probe each one to see whether it answers over HTTP(S).
    self.port()
    http = []
    http_ok = [200, 302, 404, 500]
    f = open('/root/project/info/port/portscan.txt', 'r')
    f_http = open('/root/project/info/port/http.txt', 'w')
    f_no_http = open('/root/project/info/port/no_http.txt', 'w')
    f_domain = open('/root/project/info/domain_all.txt', 'r')
    f_ip = open('/root/project/info/domain_all.txt', 'r')  # re-reads domain_all.txt, as in the original
    for line in f:
        http.append(line.strip())
    for line in f_domain:
        http.append(line.strip())
    for line in f_ip:
        http.append(line.strip())
    for line in http:
        if '443' in line:
            url = 'https://' + line + '/'
        else:
            url = 'http://' + line + '/'
        try:
            res = req.get(url)  # assumes "import requests as req" at module level
            if res.status_code in http_ok:
                print("{} http is ok".format(url))
                self.http.append(url)
                f_http.write(url + '\n')
        except Exception:
            print("sorry, this is not http")
            f_no_http.write(line + '\n')
def run(av_id):
    # Fetch the video page HTML
    AV_URL = r"https://www.bilibili.com/video/av{}"
    av_url = AV_URL.format(av_id)
    res = get(av_url)  # HTML

    # 1. Extract the basic info and the cid
    (title, class1, class2, time, rank, uid) = getInfo(res)
    if title is None:
        print("Title NOT FOUND")
        return
    cid = getCid(res)

    # Fetch the stats API page
    # 2. Total plays, cumulative danmaku count, replies, favourites, coins,
    #    shares, current rank, highest historical rank, likes, dislikes,
    #    number of parts, copyright flag
    A_URL = r"https://api.bilibili.com/archive_stat/stat?aid={}"
    a_url = A_URL.format(av_id)
    data = get(a_url, decode=False).json()["data"]

    # Fetch the comment (danmaku) XML
    COMMENT_URL = "https://comment.bilibili.com/{}.xml"
    comment_url = COMMENT_URL.format(cid)
    res_d = get(comment_url, decode=False).content  # danmaku XML

    # 3. Parse the danmaku
    d = getDannmaku(res_d)

    # 4. Save everything
    name = "{}".format(av_id)
    s = {
        "title": title,
        "class1": class1,
        "class2": class2,
        "time": time,
        "rank": rank,
        "uid": uid,
        "data": data,
        "av_id": av_id,
        "cid": cid,
        "d": d
    }
    save(name, s)
    print("{} Save Successful!".format(av_id))
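# A minimal, hypothetical usage sketch: the helper functions (get, getInfo, getCid,
# getDannmaku, save) are assumed to come from the surrounding project, and the av
# numbers below are arbitrary placeholders rather than values taken from the source.
if __name__ == "__main__":
    for av_id in [170001, 170002]:  # placeholder ids
        run(av_id)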
import os
import cx_Oracle as oci
from bs4 import BeautifulSoup
from myrequest import get

# Implementation plan
# 1. Open the list page.
# 2. Find the links to the destination pages.
#    (repeat for each link)
# 3. Open the page found in step 2.
# 4. Crawl its text and/or images.
# 5. Insert the data from step 4 into the database.

domain = 'http://info.hanatour.com'
# e.g. http://info.hanatour.com/dest/list/all/1?page=3
total = 'http://info.hanatour.com/dest/list/all/1'
totalHtml = get(total)
totalSoup = BeautifulSoup(totalHtml, 'html.parser')
print(len(totalSoup.select('.listArea a')))

# DB connection: have cx_Oracle handle Korean text as UTF-8
os.environ["NLS_LANG"] = ".AL32UTF8"
START_VALUE = u"Unicode \u3042 3".encode('utf-8')
END_VALUE = u"Unicode \u3042 6".encode('utf-8')
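# Steps 3-5 of the plan are not implemented in the snippet above; the loop below is
# a rough sketch under assumptions: that '.listArea a' yields relative detail-page
# links, and that the DEST(title, content) table and the connection string are
# hypothetical placeholders, not taken from the source.
conn = oci.connect('scott/tiger@localhost:1521/xe')  # placeholder credentials/DSN
cursor = conn.cursor()

for a in totalSoup.select('.listArea a'):
    href = a.get('href')
    if not href:
        continue
    detailHtml = get(domain + href)                         # 3. open the detail page
    detailSoup = BeautifulSoup(detailHtml, 'html.parser')
    title = a.get_text().strip()                            # 4. text to store
    content = detailSoup.get_text().strip()[:4000]
    cursor.execute("INSERT INTO dest (title, content) VALUES (:1, :2)",
                   (title, content))                        # 5. save to the DB

conn.commit()
cursor.close()
conn.close()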
# ex13.py
from myrequest import get
from bs4 import BeautifulSoup

html = get('http://naver.com')
soup = BeautifulSoup(html, 'html.parser')

result = soup.find_all('a')
print(type(result))
print(len(result))

for link in result:
    print(link.get_text(), '-', link.get('href'))
# ex15.py
# http://weather.naver.com
# import requests
from myrequest import get        # fetch the page source
from bs4 import BeautifulSoup    # parse the source

url = 'https://weather.naver.com/period/weeklyFcast.nhn'
html = get(url)
# print(html)
soup = BeautifulSoup(html, 'html.parser')

# Search method 1 - find a single element by tag name
h5 = soup.find('h5')
print(h5)

h6 = soup.find('h6')
print(h6)

strong = soup.find('strong')
print(strong)

dd = soup.find('dd')
print(dd)

td = soup.find('td')
print(td)
# ex12.py
from myrequest import get
from bs4 import BeautifulSoup

html = get('http://www.example.com/')

# Analysing and splitting up the source > parsing
soup = BeautifulSoup(html, 'html.parser')
print(type(soup))

result = soup.find("h1")
print(type(result))
print(result)
print(result.get_text())
# ex23.py
# Crawling static data
from myrequest import get
from bs4 import BeautifulSoup

# Right-click > View page source
html = get('http://211.63.89.31:8088/python/data.do')
soup = BeautifulSoup(html, 'html.parser')

staticdata = soup.select('.staticdata')  # 4 <li> elements
print(len(staticdata))

for sdata in staticdata:
    print(sdata.get_text())

print(soup.select('#name')[0].get_text())
print(soup.select('#age')[0].get_text())
print(soup.select('#address')[0].get_text())
print(soup.select('#gender')[0].get_text())

print('-----------------------')

# Crawling dynamic data
dynamicdata = soup.select('.dynamicdata')
print(len(dynamicdata))

for ddata in dynamicdata:
    print('data : ', ddata.get_text())