Example no. 1
import os
import re

from libs.jsonFileSaver import save

# Verse-reference pattern at the start of each line; the original defines
# `pattern` elsewhere, so this exact regex is an assumption.
pattern = r"[0-9]+:[0-9]+"


def fileToList(fileName):
    # Read the whole book file; `with` closes it afterwards.
    with open(fileName) as file:
        lines = file.readlines()

    result = []
    for line in lines:
        try:
            # The reference (e.g. "3:16") becomes the index; stripping it
            # leaves just the verse text.
            index = re.search(pattern, line).group(0)
            replaced = re.sub(pattern, "", line)
            statement = {
                "fileName": fileName,
                "index": index,
                "text": replaced.replace("\n", "")
            }
            result.append(statement)
        except Exception as e:
            print(line)
            print("---error---", e)

    return result


result = {}
for root, dirs, files in os.walk("./books/"):
    for filename in files:
        lines = fileToList(os.path.join(root, filename))
        result[filename] = lines

save(result, "gae.json")
Example no. 2
                "verse": int(index.split(":")[1]),
                "text": replaced.replace("\n", "")
            }
            result.append(statement)
        except Exception as e:
            print(line)
            print("---error---", e)

    return result
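
The split used below is not Python's built-in but presumably a project helper that splits a filename around a regex match. A minimal sketch consistent with how it is called here (an assumption, not code from the source):

import re

def split(string, pattern):
    # e.g. split("1-01Genesis.txt", "[0-9]{1}-[0-9]{2}")
    #      -> ["1-01", "Genesis.txt"]: the match first, the remainder second
    match = re.search(pattern, string).group(0)
    return [match, re.sub(pattern, "", string)]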


result = []
for root, dirs, files in os.walk("./books/"):
    for filename in files:
        print(filename)
        lines = fileToList(os.path.join(root, filename))
        # File names look like "1-01<book name>.txt"; split off the prefix.
        splittedFileName = split(filename, "[0-9]{1}-[0-9]{2}")
        oldAndNew = int(splittedFileName[0].split("-")[0])  # 1 = Old, 2 = New Testament (assumed)
        chapter = int(splittedFileName[0].split("-")[1])    # book's order within the testament (assumed)

        bookName = splittedFileName[1].replace(".txt", "")
        obj = {"bookName": bookName, "oldAndNew": oldAndNew, "lines": lines}
        result.append(obj)

# Flatten every book's lines into a single list.
lines = []
for ee in result:
    print(ee['bookName'])
    lines = lines + ee['lines']

save(lines, "./gaeLines.json")
Example no. 3
'''
Once the URLs have been figured out,
it is fine to look at 02_parse_book_info first.
'''
from bs4 import BeautifulSoup
from libs.crawler import crawl
from libs.jsonFileSaver import save

url = "http://www.holybible.or.kr/BIBLE_hkjv/"
url = "http://www.holybible.or.kr/B_GAE/"


def parse(pageString):
    bsObj = BeautifulSoup(pageString, "html.parser")
    tables = bsObj.findAll("table")
    oldTable = tables[8]   # Old Testament book list
    newTable = tables[11]  # New Testament book list

    oldTk3s = oldTable.findAll("td", {"class": "tk3"})
    newTk3s = newTable.findAll("td", {"class": "tk3"})

    # Build the absolute URL of every book link and return the list so it
    # can be saved below.
    urls = []
    for td in oldTk3s + newTk3s:
        href = td.find("a")['href']
        urls.append("{}{}".format(url, href))
    return urls


result = parse(crawl(url))
print(result)

save(result, "개혁개정url.json")
Example no. 4
        }
        statements.append(statement)
    return statements
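
getBible below iterates bookInfos, which this fragment never defines; it is presumably built from the URL list saved in Example no. 3. Judging by the fields accessed, each entry looks roughly like this (illustrative, assumed values):

bookInfos = [
    {
        "url": "http://www.holybible.or.kr/B_GAE/...",  # hypothetical direct URL for a single-page book
        "firstCi": 1,      # CI of the book's first page on the site (assumed meaning)
        "totalPage": 50,   # number of pages the book spans
    },
    # ... one entry per book
]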


# for index in range(1, bookInfo['totalPage'] + 1 ):
def getBible(versionCode):
    bible = {}
    for index, bookInfo in enumerate(bookInfos):
        print(bookInfo)
        # CI identifies the book on the server, offset by the first book's CI.
        ci = index + bookInfo['firstCi']

        vr = "GAE"  # site version code (original note: kjv:0, b_gae:9)
        if bookInfo['totalPage'] != 1:
            url = "http://www.holybible.or.kr/{}/cgi/bibleftxt.php?VR={}&CI={}&CV=99".format(
                versionCode, vr, ci)
        else:
            url = bookInfo['url']
        pageString = crawl(url)
        statements = parse(pageString)
        print(len(statements))
        bible[index + 1] = statements
    return bible


bible = getBible("B_GAE")
save(bible, "B_GAE_bible.json")

# Hmm. Let's run the collection once more.
Example no. 5
from libs.naver_shopping.crawler import crawl
from libs.naver_shopping.parser import parse
from libs.jsonFileSaver import save

results = []
keywords = ["숨셔바요", "탈취제", "애완동물 냄세"]

for keyword in keywords:
    pageString = crawl(keyword)
    products = parse(pageString)
    results = results + products
print(len(results))

save(results, "./products.json")
# Leftover sketch of the save helper (presumably what jsonFileSaver does):
# file = open(fileName, "w+")
# file.write(json.dumps(content))