def update(self, itemlist=None, renew=False):
    """Rebuild the feed's item list from *itemlist* (item page URLs).

    Each URL is fetched and scraped with the selectors configured in
    ``self.config``; the resulting item dicts replace the container's
    contents (``self[:] = items``).

    :param itemlist: URLs of item pages to scrape.  ``None`` means an
        empty list (BUG FIX: the original used a mutable default
        ``itemlist=list()``; the ``None`` sentinel keeps the contract).
    :param renew: force a rebuild even when the URL list is unchanged.
    :return: ``True`` when items were rebuilt, ``False`` when skipped.
    """
    if itemlist is None:
        itemlist = []
    # Skip the expensive rebuild when nothing changed and no forced renew.
    if (self.linklist == itemlist) and not renew:
        return False
    items = []
    for link in itemlist:
        item = {}
        # link/guid are emitted verbatim into XML, hence the CDATA wrapper.
        item['link'] = item['guid'] = ''.join(["<![CDATA[", link, "]]>"])
        con = http_opener.openPage(link)
        page = page_parser.parse(con, encoding=self.info['encoding'])
        for key, selector in self.config.items():
            if key.find('Format') != -1:
                # e.g. 'pub_date_format' — a format string, not a selector.
                continue
            val = page.select(selector)[0]
            if key.find('desc') != -1:
                # Description: keep the markup, CDATA-wrapped.
                item[key] = ''.join([
                    "<![CDATA[", val.prettify(formatter="minimal"), "]]>"
                ])
            elif key.find('pub_date') == -1:
                # Any other plain-text field.
                item[key] = "<![CDATA[" + val.get_text() + "]]>"
            else:
                # pub_date: site-local time, assumed +0900 (KST) —
                # TODO confirm this assumption for non-Korean sites.
                rawtime = val.getText().encode('utf8')
                # A sample timestamp tells how many leading bytes of the
                # raw text belong to the date portion.
                length = datetime.now().strftime(
                    self.config['pub_date_format'])
                time1 = datetime.strptime(
                    (rawtime[0:len(length)] + b" +0900").decode(),
                    self.config['pub_date_format'] + ' %z')
                item[key] = time.strftime("%a, %d %b %Y, %H:%M:%S",
                                          time1.utctimetuple())
        items.append(item)
    self[:] = items
    self.linklist = itemlist
    return True
def getFilelist(mode=None, data='', parseRoot='', rootAttr=None,
                element='', attr=None, encode='utf8'):
    # type: (Optional[str], str, str, Optional[dict], str, Optional[dict], Optional[str]) -> list
    """Parse *data* and return the elements matched by the selectors.

    When the document is RSS, each ``<item>``'s ``<description>`` payload
    is HTML-unescaped and re-parsed recursively with the caller-supplied
    selectors (mode="item").

    :param mode: "rss", "html", "item" or ``None`` (auto-detect: RSS when
        the parsed page has an ``rss`` root, else HTML).
    :param data: raw document text; encoded with *encode* before parsing
        unless *encode* is ``None``.
    :param parseRoot: root selector, filtered by *rootAttr*.
    :param element: element selector, filtered by *attr*.
    :param encode: codec used to encode *data*; ``None`` skips encoding.
    :return: list of matched elements.
    """
    # TODO: rename this function.
    # TODO: return [[title, elements], ...] so the title can be used as a
    #   folder name.
    # BUG FIX: the original used mutable default arguments (dict()).
    rootAttr = {} if rootAttr is None else rootAttr
    attr = {} if attr is None else attr
    logger.debug("-* length of data: {}".format(len(data)))
    if encode is not None:
        data = data.encode(encode)
    page = page_parser.parse(data)
    root1, rootAttr1, element1, attr1 = parseRoot, rootAttr, element, attr
    if mode is None:
        mode = "rss" if page.page.rss is not None else "html"
    if mode == "rss":
        # RSS items are located structurally; the caller's selectors are
        # applied to the unescaped <description> in the recursion below.
        root1, rootAttr1, element1, attr1 = '', {}, "item", {}
    # BUG FIX: logger.debug(root1, rootAttr1, element1, attr1) treated the
    # extra positional args as %-format args, so they were never logged.
    logger.debug("{} {} {} {}".format(root1, rootAttr1, element1, attr1))
    results = page.findElementsExt(root1, rootAttr1, element1, attr1)
    logger.debug("-* length of parse: {}".format(len(results)))
    if mode == "rss":
        targets = results
        results = []
        for target in targets:
            logger.debug("-* string of description : {}".format(
                target.description.string))
            payload = unescape(target.description.string)
            results.extend(
                getFilelist("item", payload, parseRoot, rootAttr, element,
                            attr, encode=None))
    logger.debug("-* type of result : ({})[0]{}".format(
        len(results), type(results[0])))
    return results
def get_pageitems(rule, maxpage=1, encoding='utf-8'):
    """Collect item links by walking up to *maxpage* list pages of *rule*.

    Follows the rule's next-page link until it is absent, repeats, or the
    page budget is exhausted.

    :param rule: ruleset providing ``listUrl``, ``itemlink``,
        ``nextpagelink`` and ``encoding``.
    :param maxpage: maximum number of list pages to visit.
    :param encoding: page encoding; for non-UTF-8 pages the
        ``<meta ... charset ...>`` tag is stripped so the parser honours
        the encoding declared by the rule instead of the tag.
    :return: ``(item_list, listpage)`` — de-duplicated links and the last
        parsed list page (``None`` when ``maxpage < 1``).
    """
    list_url = rule.listUrl
    item_list = []
    # BUG FIX: listpage was unbound at the return when maxpage < 1.
    listpage = None
    page_count = 0
    # End condition example: 1 < 1 -> False.
    while page_count < maxpage:
        logger.debug("*- loop {}/{} - encoding: ({}) {}".format(
            page_count + 1, maxpage, len(rule.encoding), rule.encoding))
        _page = openPage(list_url)  # raw bytes
        if encoding != 'utf-8':
            # Strip the <meta ... charset ...> tag so the declared charset
            # cannot override the encoding we handle explicitly.
            _t1 = _page.rfind(b"<meta", 0, _page.find(b"charset"))
            _t2 = _page.find(b'>', _t1) + 1
            page = _page[:_t1] + _page[_t2:]
        else:
            page = _page
        # NOTE: item_list is intentionally not reset between pages — it
        # accumulates links across the whole loop.
        logger.info(
            "*- list(encoding:{encoding}, listpage: {list_len}(|{listpage}|))".
            format_map({
                'encoding': encoding,
                'list_len': len(_page),
                'listpage': _page[:13].decode(encoding),
            }))
        listpage = page_parser.parse(page)
        items = listpage.select(str(rule.itemlink))
        # Keep first-seen order while dropping duplicates.
        item_list.extend([it for it in items if it not in item_list])
        logger.info(
            "*- item {item_len}, (listpage loaded: {listpage}, get_itemlink: {getlink})"
            .format_map({
                'count': page_count + 1,
                'listpage': len(str(listpage.page)) > 0,
                'getlink': len(items) > 0,
                'item_len': len(item_list)
            }))
        nextpages = listpage.select(str(rule.nextpagelink))
        logger.debug("*- nextpage:{} - {}({})".format(
            str(rule.nextpagelink), (len(nextpages) > 0), nextpages))
        if len(nextpages):
            # Relative URL -> absolute URL.
            _listurl = get_absolute_anchor_reference(str(rule.listUrl),
                                                     nextpages[0])
            if list_url == _listurl:
                # Next-page link points back at the current page: stop.
                break
            list_url = _listurl
        else:
            break
        page_count = page_count + 1
    return item_list, listpage
def update(feed):
    """Refresh an already-registered feed.

    Regenerates the feed's data:
      - opens a connection to the feed's link (cookies obtained via the
        opener; member-login authentication is not supported yet)
      - parses the list page to collect the item URLs
      - hands the URLs to the feed-items updater

    On a successful update, stamps ``lastBuildDate`` and ``pub_date``.
    """
    # Deployment/update timestamps are set automatically by the instance
    # update method — do not set them here.
    http_opener.makeOpenerWithCookie(feed.config['link'], {})
    listpage = page_parser.parse(http_opener.openPage(feed.config['link']))
    anchors = listpage.select(feed.config['searchitems'])
    itemlist = [
        http_opener.make_absolute_uri(feed.config['link'],
                                      anchor.find('a').attrs['href'])
        for anchor in anchors
    ]
    feed.feeditems.info['encoding'] = listpage.original_encoding
    param = [itemlist]
    if 'lastBuildDate' not in feed.info:
        # First build: force a renew.
        param.append(True)
    # NOTE(review): `param` is passed as ONE positional argument; verify
    # feedupdate's signature — it may expect the list unpacked (*param).
    if feedupdate(feed.feeditems, param):
        feed.info['lastBuildDate'] = time.strftime('%a, %d %b %Y, %H:%M:%S',
                                                   time.gmtime())
        # BUG FIX: a trailing comma previously made pub_date a 1-tuple
        # instead of a string.
        feed.info['pub_date'] = time.strftime('%a, %d %b %Y, %H:%M:%S',
                                              time.gmtime())
def update_feeditem(feedid, iter, rset):
    """Fetch one item page and build an ``Item`` for feed *feedid*.

    :param feedid: numeric id of the owning feed.
    :param iter: URL of the item page (parameter name kept for caller
        compatibility even though it shadows the builtin).
    :param rset: ruleset carrying the per-field selectors/formats.
    :return: the populated ``Item``, or ``False`` when a mandatory
        selector (title/description) matches nothing on the page.
    """
    con = openPage(iter)
    # Fall back to UTF-8 when the ruleset does not specify an encoding.
    if not rset.encoding or len(rset.encoding) == 0:
        encoding = 'utf-8'
    else:
        encoding = str(rset.encoding)
    page = page_parser.parse(con, encoding=str(encoding))
    logger.debug("*- page<{encoding}> : ({pagelength}){itempage}".format_map({
        'encoding': encoding,
        'pagelength': len(con),
        'itempage': con[:14].decode(encoding)
    }))
    # NOTE: browsers inject <tbody> in the inspector even when the real
    # HTML has none — selectors copied from the inspector may not match.
    # See https://hexfox.com/p/having-trouble-extracting-the-tbody-element-during-my-web-scrape/
    try:
        title = make_CDATA(page.select(rset.itemtitle)[0].get_text())
        logger.debug("*- title: <{}> {}".format(str(rset.itemtitle), title))
        link = make_CDATA(iter)
        logger.debug("*- link: {}".format(link))
        desc = make_CDATA(
            page.select(rset.itemdescription)[0].prettify(formatter="minimal"))
        logger.debug("*- desc: <{}>({}) {}".format(rset.itemdescription,
                                                   type(desc), len(desc)))
    except IndexError:
        # A mandatory selector matched nothing: cannot build the item.
        return False
    feeditem = Item(
        feedid=int(feedid),  # int
        title=title,  # str
        link=link,  # str
        description=desc,
    )

    # The remaining fields are optional: for a StrField, absent or empty
    # means len(...) == 0 / falsy, and _add() skips it.
    def _add(item, fieldname: str, field, parse=True, usingCDATA=True):
        """Set *fieldname* on *item* from *field*; return True when set.

        :param field: CSS selector (parse=True) or literal value
            (parse=False); falsy values are skipped.
        """
        # BUG FIX: len(field) was evaluated before the falsy guard, so a
        # None field (guidtype "regular" fallthrough) raised TypeError.
        logger.debug("-*|{fieldname} : ({len}){field}".format_map({
            'fieldname': fieldname,
            'len': 0 if not field else len(field),
            'field': field
        }))
        if field and (len(field) > 0):
            logger.debug(
                "-*|{fieldname} not Null(None|zero length)".format_map(
                    {'fieldname': fieldname}))
            value = page.select(field)[0].get_text() if parse else field
            setattr(item, fieldname,
                    make_CDATA(value) if usingCDATA else value)
            return True
        return False

    _add(feeditem, "author", rset.itemauthor)
    _add(feeditem, "category", rset.itemcategory)
    if not rset.itemguidtype or len(rset.itemguidtype) == 0:
        logger.debug("*- guid is not setted.")
        guidtype = None
    else:
        guidtype = str(rset.itemguidtype)
    if guidtype == "find":
        # guid is selected directly from the item page.
        _add(feeditem, "guid", rset.itemguid, parse=True, usingCDATA=False)
    elif guidtype == "regular":
        # guid derived by regex substitution; the rule string carries
        # "pattern\!\replacement".
        if rset.itemguidfrom == "item":
            guid = regsub(iter, *str(rset.itemguid).split("\\!\\"))
        elif rset.itemguidfrom == "list":
            guid = regsub(Ruleset.return_item(feedid).listurl,
                          *str(rset.itemguid).split("\\!\\"))
        else:
            guid = None  # unknown source: _add() skips a falsy guid
        _add(feeditem, "guid", guid, parse=False, usingCDATA=True)
    # pub_date is optional as well.
    if rset.itempub_date and len(rset.itempub_date) > 0:
        # TODO: the web UI should show an example of the date format.
        _rawtime = page.select(str(
            rset.itempub_date))[0].get_text().encode().strip()
        # A sample timestamp tells how many leading bytes form the date.
        _length = datetime.now().strftime(str(
            rset.itempub_date_format)).encode()
        # Site-local time assumed to be +0900 (KST) — TODO confirm.
        _converted = datetime.strptime(
            (_rawtime[0:len(_length)] + b" +0900").decode(),
            str(rset.itempub_date_format) + ' %z')
        pub_date = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                 _converted.utctimetuple())
        _add(feeditem, "pub_date", pub_date, parse=False, usingCDATA=False)
    return feeditem