def runDailySam(qtid):
    # Print today's QT (Quiet Time) passage from 365qt.com together with the
    # matching bible verses.
    url = "http://www.365qt.com/TodaysQT.asp?QTID={}".format(qtid)
    pageString = crawl(url)
    result = parse(pageString)
    addr = result['addr']
    print(result['date'], addr)
    statements = findBetween(addr['book'], addr['chapter'],
                             addr['verseFrom'], addr['verseTo'])
    for statement in statements:
        print(statement['index'], statement['text'])
    print(result['content'])
    print(result['bx2'])
    print(result['date'])

def getBible(versionCode):
    # Collect every book of the given bible version into {book number: verse list}.
    bible = {}
    for index in range(len(bookInfos)):
        bookInfo = bookInfos[index]
        print(bookInfo)
        ci = index + bookInfo['firstCi']
        vr = "GAE"  # kjv: 0, b_gae: 9
        if bookInfo['totalPage'] != 1:
            url = "http://www.holybible.or.kr/{}/cgi/bibleftxt.php?VR={}&CI={}&CV=99".format(
                versionCode, vr, ci)
        else:
            url = bookInfo['url']
        pageString = crawl(url)
        statements = parse(pageString)
        print(len(statements))
        bible[index + 1] = statements
    return bible

def getBookInfo(versionCode):
    bookInfo = []
    for vl in range(1, 67):
        url = "http://www.holybible.or.kr/{}/cgi/bibleftxt.php?VR=0&VL={}&CN=1&CV=99&FR=".format(versionCode, vl)
        pageString = crawl(url)
        try:
            # table index: 7 for kjv, 8 for b_gae
            result = parse(pageString, 8)
            result['versionCode'] = versionCode
            if result['totalPage'] == 1:
                result['url'] = url
            bookInfo.append(result)
            print(result)
        except Exception as e:
            print("--error--", e)
    return bookInfo

from libs.crawler import crawl
from bs4 import BeautifulSoup


def getProducts(string):
    # Parse the Coupang category page and inspect one product <li> for now.
    bsObj = BeautifulSoup(string, "html.parser")
    ul = bsObj.find("ul", {"id": "productList"})
    lis = ul.findAll("li")
    li = lis[1]
    print(li)
    return []


url = "https://www.coupang.com/np/categories/186764?page=1"
pageString = crawl(url)
print(getProducts(pageString))

from sites.unipass import parse
from libs.crawler import crawl

url = "https://unipass.customs.go.kr/csp/index.do"
result = crawl(url)
parse(result)

import requests
from bs4 import BeautifulSoup
from libs.crawler import crawl

url = "http://www.holybible.or.kr/BIBLE_hkjv/"


def parse(pageString):
    # The book-list page keeps the Old and New Testament tables at fixed indexes.
    bsObj = BeautifulSoup(pageString, "html.parser")
    tables = bsObj.findAll("table")
    oldTable = tables[8]
    newTable = tables[11]
    print(newTable)
    oldTk3s = oldTable.findAll("td", {"class": "tk3"})
    newTk3s = newTable.findAll("td", {"class": "tk3"})
    kjvUrl = "http://www.holybible.or.kr/BIBLE_hkjv/"
    for td in newTk3s:
        href = td.find("a")['href']
        print("{}{}".format(kjvUrl, href))


result = parse(crawl(url))

def parse(pageString, bookName=""):
    # NOTE: assumed header — the original fragment starts inside the function,
    # so the BeautifulSoup setup and the `lis` selector are reconstructions.
    bsObj = BeautifulSoup(pageString, "html.parser")
    lis = bsObj.findAll("li")
    statements = []
    for index in range(len(lis)):
        text = lis[index].text
        textSplitted = text.split("\n")[0]
        statement = {
            "bookName": bookName,
            "idx": index + 1,
            "text": textSplitted
        }
        statements.append(statement)
    return statements


# for index in range(1, bookInfo['totalPage'] + 1):
bible = {}
for index in range(len(bookInfos)):
    bookInfo = bookInfos[index]
    print(bookInfo)
    ci = index + bookInfo['firstCi']
    url = "http://www.holybible.or.kr/BIBLE_hkjv/cgi/bibleftxt.php?VR=0&CI={}&CV=99&FR=H".format(ci)
    statements = parse(crawl(url))
    print(len(statements))
    bible[index + 1] = statements

print(bible)
# save(bible, "bible.json")
# Hmm. Let's run the collection again.

import json

from libs.crawler import crawl
from libs.instagram.specificPageParser import parse

# Load the previously collected Instagram post URLs and attach each post's
# datetime by crawling its page.
file = open("../dangstagram_urls")
jsonObj = json.loads(file.read())
print(len(jsonObj))

datetimeParsedInfo = []
for info in jsonObj[0:1000]:
    try:
        pageString = crawl(info['key'])
        datetime = parse(pageString)
        info['datetime'] = datetime
        datetimeParsedInfo.append(info)
    except Exception as e:
        print(e)

print(datetimeParsedInfo)

file = open("./0to1000.json", "w+")
file.write(json.dumps(datetimeParsedInfo))
file.close()

def getPageUrls(pageNo):
    url = "https://www.coupang.com/np/categories/194690?page={}".format(pageNo)
    pageString = crawl(url)
    urls = parse(pageString)
    return urls

# chapter01_crawl.py
# The requests-based crawling step is factored out into a module
# and imported here for use.
from libs.crawler import crawl

# URL of the Instagram hashtag page to collect (#아이즈원)
url = 'https://www.instagram.com/explore/tags/%EC%95%84%EC%9D%B4%EC%A6%88%EC%9B%90/'

page_string = crawl(url)
print(page_string)

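# A minimal sketch of what libs/crawler.py might look like — the module itself
# is not shown in this section, so the User-Agent header and the lack of error
# handling below are assumptions, not the actual implementation.
import requests


def crawl(url):
    # Fetch the page with a browser-like User-Agent and return the HTML text.
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    return response.text
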
def printHello(url, num):
    # Crawl the page and save its HTML under ./pages2/<num>.html (EUC-KR pages).
    pageString = crawl(url)
    file = open("./pages2/{}.html".format(num), "w+", encoding="euc-kr")
    file.write(str(pageString))
    file.close()

def getSubPageUrls(pageNum):
    url = "https://www.coupang.com/np/categories/186764?page={}".format(pageNum)
    pageString = crawl(url)
    subPageUrls = parse(pageString)
    return subPageUrls

from libs.crawler import crawl
from bs4 import BeautifulSoup

url = "https://search.shopping.naver.com/search/all.nhn?query=%EC%88%A8%EC%85%94%EB%B0%94%EC%9A%94&cat_id=&frm=NVSHATC"
string = crawl(url)
# print(string)

bsObj = BeautifulSoup(string, "html.parser")
print(bsObj)

goodsList = bsObj.find("ul", {"class": "goods_list"})
print(goodsList)

# Chapter01_crawl.py
# Up to now everything lived in a single file; from here on the code is split
# into modules and called through imports, the way you would in Java.
# * The requests-based crawling step is factored out into a module,
# * and imported here for use.

# Import crawl() from the crawler module in the libs package.
from libs.crawler import crawl

# URL of the Instagram hashtag page to collect (explore/tags/고양이, the cat tag)
url = 'https://www.instagram.com/explore/tags/%EA%B3%A0%EC%96%91%EC%9D%B4/'

# From now on crawl() does the requests work; the call is delegated to the
# crawler module.
pageString = crawl(url)
print(pageString)

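# A usage sketch (not part of the original script): the page string returned
# by crawl() can be written to disk for later parsing, mirroring the
# printHello() helper earlier in this section. The file name and encoding
# below are illustrative assumptions.
def savePage(pageString, path):
    # Persist the crawled HTML so it can be re-parsed without crawling again.
    file = open(path, "w+", encoding="utf-8")
    file.write(str(pageString))
    file.close()


savePage(pageString, "./cat_tag.html")
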