示例#1
0
def parse(pageString):
    """Extract the date, address and text sections from a page's HTML.

    Args:
        pageString: HTML markup; parsed with BeautifulSoup's "html.parser".

    Returns:
        A dict with the keys 'box2Content', 'content' and 'bx2'. The keys
        'date' and 'addr' are added only when the ``div#qtDay`` element
        could be parsed; that step is best-effort and never raises.

    Raises:
        AttributeError: if the ``div.box2Content``, ``div#content`` or
            ``div.bx2`` element is absent (``find`` yields ``None``).
        IndexError: if ``div#content`` holds fewer than five ``<p>`` tags.
    """
    result = {}
    bsObj = BeautifulSoup(pageString, "html.parser")

    qtDayText2 = bsObj.find("div", {"id": "qtDay"})
    try:
        # Grab everything from the first "201" to the end of the text
        # (presumably a 201x date stamp -- confirm against real pages).
        extDat = findMatchedTexts(qtDayText2.text, r"201[\s\S]+")
        # The match is expected to be split in two by a CRLF followed by
        # eight spaces of markup indentation.
        ss = extDat[0].split("\r\n        ")
        ss2 = "{} {}".format(ss[0].replace("\n", ". "), ss[1])
        result['date'] = ss2
        # The address is derived from the first parenthesised fragment.
        res = findMatchedTexts(qtDayText2.text, r"\(.+\)")
        result['addr'] = getAddr(res[0])
    except Exception as e:
        # Best-effort: a missing or oddly shaped qtDay block only costs the
        # 'date'/'addr' keys. Note 'date' survives if only the address
        # lookup fails, because it is stored before that lookup runs.
        print(e)

    box2Content = bsObj.find("div", {"class": "box2Content"})
    result['box2Content'] = box2Content.text

    content = bsObj.find("div", {"id": "content"})
    ps = content.findAll("p")
    # Only the fifth paragraph of the content block is kept.
    result['content'] = ps[4].text

    bx2 = bsObj.find("div", {"class": "bx2"})

    guideText = bx2.text
    result['bx2'] = addLine(guideText)

    return result
示例#2
0
def get_row(tr):
    """Build a summary dict from one table row.

    Each field is extracted independently and on a best-effort basis: when
    an extraction step fails, a marker line is printed and that field keeps
    its empty default.

    Args:
        tr: A row element exposing ``find_all('td')`` (presumably a
            BeautifulSoup ``<tr>`` tag -- confirm against the caller).

    Returns:
        A dict with the keys:
            'api_id': digits taken from the ``javascript:view('...`` href of
                the link inside the first cell's ``<h4>``, or ''.
            'title': second line of the first cell's ``<a>`` markup, before
                the ``tit_info`` span, or ''.
            'subtitle': content of the ``tit_info`` span, or ''.
            'count': text of the fourth cell.
            'service_types': texts of the ``<span>`` tags inside the sixth
                cell's ``div.datatype``, or [].

    Raises:
        IndexError: if the row has fewer than four ``<td>`` cells; the
            first and fourth cells are read outside any ``try``.
    """
    tds = tr.find_all('td')
    # Work on the raw markup of the link: the part before the span carries
    # the title, the part after it carries the subtitle.
    atag = str(tds[0].find('a')).split('<span class="tit_info">')

    first = ''
    try:
        # Drop tab-delimited runs, then keep the second line of the markup.
        first = re.sub(r'\t.*\t', '', atag[0]).split('\n')[1]
        first = first.replace('R&amp;amp;amp;amp;D ', '')
    except Exception:
        print('----------')

    second = ''
    try:
        second = atag[1].split('</span>')[0]
        second = second.replace('R&amp;amp;D ', '')
    except Exception:
        print('---------')

    api_id = ''
    try:
        id_a = tds[0].find('h4').find('a')['href']
        api_id = findMatchedTexts(id_a, r"javascript:view\('[0-9]+")[0]
        # Strip the prefix so only the numeric id remains.
        api_id = api_id.replace("javascript:view('", "")
    except Exception:
        print('----api id exception -----')

    service_types = []
    try:
        service_types_spans = tds[5].find('div', {
            'class': 'datatype'
        }).find_all('span')
        service_types = [span.text for span in service_types_spans]
    except Exception:
        print('----- serivce types exception -------')

    return {
        'api_id': api_id,
        'title': first,
        'subtitle': second,
        'count': tds[3].text,
        'service_types': service_types
    }
from libs.crawler import crawl
from bs4 import BeautifulSoup
from libs.patternMatcher import findMatchedTexts

# Company-name autocomplete endpoint; the textCrpNm query parameter is the
# percent-encoded UTF-8 form of the search term "셀트리온".
url = "http://dart.fss.or.kr/corp/searchAutoComplete.do?textCrpNm=%EC%85%80%ED%8A%B8%EB%A6%AC%EC%98%A8&_=1561171426973"

pageString = crawl(url)

bsObj = BeautifulSoup(pageString, "html.parser")

# Collect every name starting with the search term, followed by any run of
# Hangul syllables, digits or ASCII letters. The class must be "A-Z": the
# range "A-z" would also accept the punctuation between 'Z' and 'a'
# ([ \ ] ^ _ `), letting delimiters leak into the matched names.
names = findMatchedTexts(bsObj.text, r"셀트리온[가-힣0-9a-zA-Z]*")

# Print the whole result once, then one name per line.
print(names)

for name in names:
    print(name)