Exemplo n.º 1
0
def _gnewsItem2Page(item):
    """Build a page dict from a single news search result item.

    The returned dict always carries the keys ``title``, ``url``,
    ``content``, ``publisher`` and ``published``. ``title`` and ``content``
    are passed through ``htmlutil.getTextContent``; ``published`` is the
    result of ``dateutil.jsDate2utc14`` applied to ``publishedDate``.
    An ``img`` sub-dict (``url``, ``width``, ``height``) is added only when
    the item has a truthy ``image`` entry.

    NOTE(review): ``dateutil`` here appears to be a project-local helper
    module rather than the third-party python-dateutil package -- confirm.
    """
    page = {
        "title": htmlutil.getTextContent(item.get("title")),
        "url": item.get("unescapedUrl"),
        "content": htmlutil.getTextContent(item.get("content")),
        "publisher": item.get("publisher"),
        "published": dateutil.jsDate2utc14(item.get("publishedDate")),
    }

    image = item.get("image")
    if image:
        # Only the thumbnail dimensions (tbWidth / tbHeight) are copied.
        page["img"] = {
            "url": image.get("url"),
            "width": image.get("tbWidth"),
            "height": image.get("tbHeight"),
        }
    return page
Exemplo n.º 2
0
def getItemFromScript(element):
    """Extract a ``url``/``title`` item from an anchor element.

    Returns ``None`` when ``element.tag`` is not ``'a'``. Otherwise returns
    a dict that contains ``'url'`` (the ``href`` attribute) and ``'title'``
    only when the respective value is truthy, so the dict may be empty.
    The title is obtained from ``lxmlutil.getScriptConstantString`` and,
    when truthy, passed through ``htmlutil.getTextContent``.
    """
    if element.tag != 'a':
        return None

    href = element.get('href')
    raw_title = lxmlutil.getScriptConstantString(element)
    # A falsy raw title is kept as-is and filtered out below.
    text = htmlutil.getTextContent(raw_title) if raw_title else raw_title

    candidates = (('url', href), ('title', text))
    return {key: value for key, value in candidates if value}
Exemplo n.º 3
0
def getItemFromScript(element):
    """Build an item dict from an ``<a>`` element and its script string.

    For an element whose ``tag`` is ``"a"``, the ``href`` attribute becomes
    ``"url"`` and the string returned by
    ``lxmlutil.getScriptConstantString`` (run through
    ``htmlutil.getTextContent`` when truthy) becomes ``"title"``. Falsy
    values are left out, so the result may be an empty dict. Any other tag
    yields ``None``.
    """
    if element.tag == "a":
        link = element.get("href")
        label = lxmlutil.getScriptConstantString(element)
        if label:
            label = htmlutil.getTextContent(label)
        # Keep insertion order url -> title; drop falsy entries.
        fields = {"url": link, "title": label}
        return {name: value for name, value in fields.items() if value}
    return None
Exemplo n.º 4
0
def _googleItem2page(item):
    """Build a page dict from a single web search result item.

    The result has exactly three keys: ``title`` and ``content`` (each
    passed through ``htmlutil.getTextContent``) and ``url`` (the item's
    ``unescapedUrl`` value, unchanged). Missing keys on ``item`` are read
    with ``.get`` and therefore arrive as ``None``.
    """
    return {
        'title': htmlutil.getTextContent(item.get('title')),
        'url': item.get('unescapedUrl'),
        'content': htmlutil.getTextContent(item.get('content')),
    }