예제 #1
0
    def update(self, itemlist=None, renew=False):
        """Rebuild the feed's item list from *itemlist* (item page URLs).

        Each URL is fetched and the fields configured in ``self.config``
        are extracted into a dict; the resulting dicts replace the current
        contents of ``self``.

        :param itemlist: list of item page URLs (defaults to empty list).
        :param renew: force a rebuild even when *itemlist* is unchanged.
        :return: True when the list was rebuilt, False when skipped.
        """
        # Bug fix: the original default ``itemlist=list()`` was a shared
        # mutable default argument.
        itemlist = [] if itemlist is None else itemlist

        if self.linklist == itemlist and not renew:
            return False

        items = []

        for url in itemlist:
            item = {}

            # link and guid are emitted verbatim inside CDATA sections.
            item['link'] = item['guid'] = ''.join(["<![CDATA[", url, "]]>"])
            con = http_opener.openPage(url)
            page = page_parser.parse(con, encoding=self.info['encoding'])

            for key, selector in self.config.items():
                if 'Format' in key:  # e.g. pub_date_format: not a selector
                    continue

                val = page.select(selector)[0]

                if 'desc' in key:  # description: keep the markup
                    item[key] = ''.join([
                        "<![CDATA[",
                        val.prettify(formatter="minimal"), "]]>"
                    ])

                elif 'pub_date' not in key:  # plain text fields
                    item[key] = "<![CDATA[" + val.get_text() + "]]>"

                else:  # pub_date: parse site-local time, emit in UTC
                    rawtime = val.getText().encode('utf8')
                    # Render "now" with the configured format to learn how
                    # many characters of the raw text form the date portion.
                    length = datetime.now().strftime(
                        self.config['pub_date_format'])
                    # Assume KST (+0900) site-local time — TODO confirm
                    # this holds for every configured feed.
                    time1 = datetime.strptime(
                        (rawtime[0:len(length)] + b" +0900").decode(),
                        self.config['pub_date_format'] + ' %z')
                    item[key] = time.strftime("%a, %d %b %Y, %H:%M:%S",
                                              time1.utctimetuple())

            items.append(item)

        self[:] = items

        self.linklist = itemlist
        return True
예제 #2
0
def getFilelist(mode=None,
                data=str(),
                parseRoot=str(),
                rootAttr=None,
                element=str(),
                attr=None,
                encode='utf8'):
    """Return the list of elements parsed out of *data*.

    With ``mode=None`` the mode is auto-detected: "rss" when the parsed
    page has a top-level <rss> element, "html" otherwise.  In RSS mode the
    <description> payload of every item is HTML-unescaped and re-parsed
    recursively with the caller's selectors.

    :param mode: "rss", "html", "item" or None for auto-detection.
    :param data: page content; encoded with *encode* before parsing
        unless *encode* is None.
    :param parseRoot: root selector used for HTML parsing.
    :param rootAttr: attribute filter narrowing the root selector.
    :param element: element selector within the root.
    :param attr: attribute filter narrowing the element selector.
    :param encode: codec used to encode *data*, or None to skip encoding.
    :return: list of matched elements.
    """
    # TODO: rename this function.
    # TODO: return a list of [title, elements] pairs so the title can be
    #       used as a folder name.

    # Bug fix: the original signature used mutable dict() defaults, which
    # are shared between calls.
    rootAttr = dict() if rootAttr is None else rootAttr
    attr = dict() if attr is None else attr

    logger.debug("-* length of data: {}".format(len(data)))
    if encode is not None:
        data = data.encode(encode)

    page = page_parser.parse(data)
    root1 = parseRoot
    rootAttr1 = rootAttr
    element1 = element
    attr1 = attr

    if mode is None:
        mode = "rss" if page.page.rss is not None else "html"

    if mode == "rss":
        # RSS items are located by tag name alone.
        root1 = ''
        rootAttr1 = dict()
        element1 = "item"
        attr1 = dict()

    # Bug fix: logger.debug(root1, rootAttr1, ...) treated root1 as a
    # %-format string and the rest as its arguments, which fails at
    # log-record formatting time.
    logger.debug("%s %s %s %s", root1, rootAttr1, element1, attr1)
    results = page.findElementsExt(root1, rootAttr1, element1, attr1)
    logger.debug("-* length of parse: {}".format(len(results)))

    if mode == "rss":
        targets = results
        results = list()
        for target in targets:
            logger.debug("-* string of description : {}".format(
                target.description.string))
            payload = unescape(target.description.string)
            results.extend(
                getFilelist("item",
                            payload,
                            parseRoot,
                            rootAttr,
                            element,
                            attr,
                            encode=None))

    # Bug fix: results[0] raised IndexError when nothing matched.
    if results:
        logger.debug("-* type of result : ({})[0]{}".format(
            len(results), type(results[0])))

    return results
예제 #3
0
def get_pageitems(rule, maxpage=1, encoding='utf-8'):
    """Collect item links from a (possibly paginated) list page.

    Follows ``rule.nextpagelink`` for up to *maxpage* pages, accumulating
    the anchors matched by ``rule.itemlink`` without duplicates.

    :param rule: ruleset exposing listUrl, itemlink, nextpagelink, encoding.
    :param maxpage: maximum number of list pages to visit.
    :param encoding: expected page encoding.  For non-UTF-8 pages the
        raw <meta ... charset ...> tag is cut out of the bytes so the
        parser cannot override the declared encoding.
    :return: (item_list, listpage) where listpage is the last parsed page,
        or None when the loop never ran (maxpage < 1).
    """
    list_url = rule.listUrl
    item_list = list()
    # Bug fix: listpage was referenced after the loop but never
    # initialised, raising UnboundLocalError when maxpage < 1.
    listpage = None
    page_count = 0

    # end loop : 1 < 1 -> False
    while page_count < maxpage:
        logger.debug("*- loop {}/{} - encoding: ({}) {}".format(
            page_count + 1, maxpage, len(rule.encoding), rule.encoding))

        _page = openPage(list_url)  # bytes
        if encoding != 'utf-8':
            # Cut the whole <meta ... charset ...> tag out of the bytes.
            _t1 = _page.rfind(b"<meta", 0, _page.find(b"charset"))
            _t2 = _page.find(b'>', _t1) + 1
            page = _page[:_t1] + _page[_t2:]
        else:
            page = _page

        logger.info(
            "*- list(encoding:{encoding}, listpage: {list_len}(|{listpage}|))".
            format_map({
                'encoding': encoding,
                'list_len': len(_page),
                'listpage': _page[:13].decode(encoding),
            }))

        listpage = page_parser.parse(page)
        items = listpage.select(str(rule.itemlink))
        # De-duplicate while preserving first-seen order.
        item_list.extend([item for item in items if item not in item_list])

        logger.info(
            "*- item {item_len}, (listpage loaded: {listpage}, get_itemlink: {getlink})"
            .format_map({
                'listpage': len(str(listpage.page)) > 0,
                'getlink': len(items) > 0,
                'item_len': len(item_list)
            }))

        nextpages = listpage.select(str(rule.nextpagelink))
        logger.debug("*- nextpage:{} - {}({})".format(str(rule.nextpagelink),
                                                      (len(nextpages) > 0),
                                                      nextpages))
        if nextpages:  # relative url -> absolute url
            _listurl = get_absolute_anchor_reference(str(rule.listUrl),
                                                     nextpages[0])
            if list_url == _listurl:
                break  # next page resolves to the same url: stop
            list_url = _listurl
        else:
            break

        page_count += 1
    return item_list, listpage
예제 #4
0
def update(feed):
    """Refresh an already-registered feed.

    Opens the feed's list page, collects the per-item links and delegates
    the rebuild to ``feedupdate``; when a rebuild actually happened the
    feed's lastBuildDate is refreshed.  pub_date is always refreshed.

    # create a connection for the page
    # obtain cookies for the created connection
    # parse the page to obtain the item list
    # parse each listed page to obtain the feed items
    """
    # The server deploy/refresh timestamp is set automatically by the
    # instance refresh method - do not set it here.

    # Authenticated (login) feeds are not supported yet.
    http_opener.makeOpenerWithCookie(feed.config['link'], {})

    listpage = page_parser.parse(http_opener.openPage(feed.config['link']))
    anchors = listpage.select(feed.config['searchitems'])

    # Make every item link absolute with respect to the feed link.
    itemlist = [
        http_opener.make_absolute_uri(feed.config['link'],
                                      anchor.find('a').attrs['href'])
        for anchor in anchors
    ]

    feed.feeditems.info['encoding'] = listpage.original_encoding

    param = [itemlist]
    if 'lastBuildDate' not in feed.info:
        param.append(True)  # first build: force a renew
    if feedupdate(feed.feeditems, param):
        feed.info['lastBuildDate'] = time.strftime('%a, %d %b %Y, %H:%M:%S',
                                                   time.gmtime())

    # Bug fix: a stray trailing comma used to turn this value into a
    # 1-tuple instead of the formatted time string.
    feed.info['pub_date'] = time.strftime('%a, %d %b %Y, %H:%M:%S',
                                          time.gmtime())
예제 #5
0
def update_feeditem(feedid, iter, rset):
    """Fetch one item page and build a feed ``Item`` from it.

    The mandatory fields (title, link, description) come from the
    ruleset's selectors; author, category, guid and pub_date are filled in
    only when the ruleset provides them.

    :param feedid: numeric feed id the item belongs to.
    :param iter: URL of the item page (parameter name kept for caller
        compatibility even though it shadows the builtin).
    :param rset: ruleset with selectors / format strings for each field.
    :return: the populated Item, or False when a mandatory selector
        matched nothing on the page.
    """
    con = openPage(iter)

    # Fall back to UTF-8 unless the ruleset carries a non-empty encoding.
    if rset.encoding and len(rset.encoding):
        encoding = str(rset.encoding)
    else:
        encoding = 'utf-8'

    page = page_parser.parse(con, encoding=str(encoding))
    logger.debug("*- page<{encoding}> : ({pagelength}){itempage}".format_map({
        'encoding': encoding,
        'pagelength': len(con),
        'itempage': con[:14].decode(encoding)
    }))
    # NOTE: browser inspectors show <tbody> even when the raw markup has
    # none - selectors copied from an inspector may not match the page.
    # See https://hexfox.com/p/having-trouble-extracting-the-tbody-element-during-my-web-scrape/

    try:
        title = make_CDATA(page.select(rset.itemtitle)[0].get_text())
        logger.debug("*- title: <{}> {}".format(str(rset.itemtitle), title))
        link = make_CDATA(iter)
        logger.debug("*- link: {}".format(link))
        desc = make_CDATA(
            page.select(rset.itemdescription)[0].prettify(formatter="minimal"))
        logger.debug("*- desc: <{}>({}) {}".format(rset.itemdescription,
                                                   type(desc), len(desc)))
    except IndexError:
        # A mandatory selector matched nothing: the page layout changed or
        # the ruleset is wrong.
        return False

    feeditem = Item(
        feedid=int(feedid),  # int
        title=title,  # str
        link=link,  # str
        description=desc,  # str
    )

    # The remaining fields are optional:
    # for a StrField, missing or empty string -> len(<object>) == 0
    # for a StrField, missing -> not(<object>) is True
    def _add(item, fieldname: str, field: str, parse=True, usingCDATA=True):
        """Set *fieldname* on *item* when *field* is non-empty.

        With parse=True *field* is a selector evaluated against the page;
        otherwise it is stored as the literal value.  Returns True when
        the attribute was set.
        """
        logger.debug("-*|{fieldname} : ({len}){field}".format_map({
            'fieldname': fieldname,
            # Bug fix: field may be None (guid fall-through below) and
            # len(None) raised TypeError before the emptiness check ran.
            'len': len(field) if field else 0,
            'field': field
        }))
        if field and (len(field) > 0):
            logger.debug(
                "-*|{fieldname} not Null(None|zero length)".format_map(
                    {'fieldname': fieldname}))
            if parse:
                value = page.select(field)[0].get_text()
            else:
                value = field

            if usingCDATA:
                setattr(item, fieldname, make_CDATA(value))
            else:
                setattr(item, fieldname, value)

            return True
        return False

    _add(feeditem, "author", rset.itemauthor)
    _add(feeditem, "category", rset.itemcategory)

    if not (rset.itemguidtype) or len(rset.itemguidtype) == 0:
        logger.debug("*- guid is not setted.")
        guidtype = None
    else:
        guidtype = str(rset.itemguidtype)

    if guidtype == "find":
        _add(feeditem, "guid", rset.itemguid, parse=True, usingCDATA=False)
    elif guidtype == "regular":
        # itemguid holds "<pattern>\!\<replacement>" fed into regsub.
        if rset.itemguidfrom == "item":
            guid = regsub(iter, *str(rset.itemguid).split("\\!\\"))
        elif rset.itemguidfrom == "list":
            # Consistency fix: wrap itemguid in str() as the branch above
            # does, so field objects without .split() are handled alike.
            guid = regsub(
                Ruleset.return_item(feedid).listurl,
                *str(rset.itemguid).split("\\!\\"))
        else:
            guid = None
        _add(feeditem, "guid", guid, parse=False, usingCDATA=True)
    else:  # guidtype is None
        pass

    if rset.itempub_date and len(rset.itempub_date) > 0:
        # TODO: the web UI should show an example when a date format is set.
        _rawtime = page.select(str(
            rset.itempub_date))[0].get_text().encode().strip()
        # Render "now" with the configured format to learn how many bytes
        # of the raw text belong to the date portion.
        _length = datetime.now().strftime(str(
            rset.itempub_date_format)).encode()
        # Assume KST (+0900) site-local time; emit as GMT for the feed.
        _converted = datetime.strptime(
            (_rawtime[0:len(_length)] + b" +0900").decode(),
            str(rset.itempub_date_format) + ' %z')
        pub_date = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                 _converted.utctimetuple())
        _add(feeditem, "pub_date", pub_date, parse=False, usingCDATA=False)

    return feeditem