예제 #1
0
파일: spider.py 프로젝트: aviarypl/venus
def httpThread(thread_index, input_queue, output_queue, log):
    import httplib2
    from httplib import BadStatusLine

    h = httplib2.Http(config.http_cache_directory())
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', 
            feedparser.FeedParserDict({'status':'500'}))
        try:
            # map IRI => URI
            try:
                if isinstance(uri,unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
            except:
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']

            headers["user-agent"] = "Venus (+%s)" % config.link()

            # issue request
            (resp, content) = h.request(idna, 'GET', headers=headers)

            # unchanged detection
            resp['-content-hash'] = md5(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                    feed_info.feed['planet_content_hash'] == \
                    resp['-content-hash']:
                    resp.status = 304

            # build a file-like object
            feed = StringIO(content) 
            setattr(feed, 'url', resp.get('content-location', uri))
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d",
                uri, thread_index)
        except httplib2.HttpLib2Error, e:
            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
        except socket.gaierror, e:
            log.error("socket.gaierror: %s - %s (thread %d)", uri, str(e[1]), thread_index)
예제 #2
0
파일: spider.py 프로젝트: aviarypl/venus
def httpThread(thread_index, input_queue, output_queue, log):
    import httplib2
    from httplib import BadStatusLine

    h = httplib2.Http(config.http_cache_directory())
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', feedparser.FeedParserDict({'status': '500'}))
        try:
            # map IRI => URI
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
            except:
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']

            headers["user-agent"] = "Venus (+%s)" % config.link()

            # issue request
            (resp, content) = h.request(idna, 'GET', headers=headers)

            # unchanged detection
            resp['-content-hash'] = md5(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                    feed_info.feed['planet_content_hash'] == \
                    resp['-content-hash']:
                    resp.status = 304

            # build a file-like object
            feed = StringIO(content)
            setattr(feed, 'url', resp.get('content-location', uri))
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d", uri,
                      thread_index)
        except httplib2.HttpLib2Error, e:
            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
        except socket.gaierror, e:
            log.error("socket.gaierror: %s - %s (thread %d)", uri, str(e[1]),
                      thread_index)
예제 #3
0
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([
        config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']
    ])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'): sub_ids.append(data.feed.id)
        if not data.feed: continue
        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s"
             xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    new_feed_items = config.new_feed_items()
    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids: continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1
                    if new_feed_items and count[id] > new_feed_items: continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids: continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids: continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items: break
        except:
            log.error("Error parsing %s", file)

    if index: index.close()

    return doc
예제 #4
0
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")
        if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items=max([config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())    
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.pubsubhubbub_hub():
        hub = doc.createElement('link')
        hub.setAttribute('rel', 'hub')
        hub.setAttribute('href', config.pubsubhubbub_hub())
        feed.appendChild(hub)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet',planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data=feedparser.parse(filename(sources,sub))
        if data.feed.has_key('id'): sub_ids.append(data.feed.id)
        if not data.feed: continue

        # warn on missing links
        if not data.feed.has_key('planet_message'):
            if not data.feed.has_key('links'): data.feed['links'] = []

            for link in data.feed.links:
              if link.rel == 'self': break
            else:
              log.debug('missing self link for ' + sub)

            for link in data.feed.links:
              if link.rel == 'alternate' and 'html' in link.type: break
            else:
              log.debug('missing html link for ' + sub)

        xdoc=minidom.parseString('''<planet:source xmlns:planet="%s"
             xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    atomNS='http://www.w3.org/2005/Atom'
    new_feed_items = config.new_feed_items()
    for mtime,file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids: continue

        try:
            entry=minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagNameNS(atomNS, 'source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id,0) + 1
                    if new_feed_items and count[id] > new_feed_items: continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids: continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                          log.warn('Skipping: ' + id)
                        if id not in sub_ids: continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items: break
        except:
            log.error("Error parsing %s", file)

    if index: index.close()

    return doc
예제 #5
0
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.logger

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir = [(os.stat(file).st_mtime, file) for file in glob.glob(cache + "/*")
           if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items = max([
        config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']
    ])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.pubsubhubbub_hub():
        hub = doc.createElement('link')
        hub.setAttribute('rel', 'hub')
        hub.setAttribute('href', config.pubsubhubbub_hub())
        feed.appendChild(hub)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet', planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if data.feed.has_key('id'):
            sub_ids.append(data.feed.id)
        if not data.feed:
            continue

        # warn on missing links
        if not data.feed.has_key('planet_message'):
            if not data.feed.has_key('links'):
                data.feed['links'] = []

            for link in data.feed.links:
                if link.rel == 'self':
                    break
            else:
                log.debug('missing self link for ' + sub)

            for link in data.feed.links:
                if link.rel == 'alternate' and 'html' in link.type:
                    break
            else:
                log.debug('missing html link for ' + sub)

        xdoc = minidom.parseString('''<planet:source xmlns:planet="%s"
             xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    count = {}
    atomNS = 'http://www.w3.org/2005/Atom'
    new_feed_items = config.new_feed_items()

    posted_urls = set()
    if config.post_to_twitter():
        if os.path.exists(posted_urls_file):
            try:
                with open(posted_urls_file, 'rb') as f:
                    posted_urls = pickle.load(f)
            except Exception as ex:
                log.error("Error reading posted_urls %s", ex)
#    print(posted_urls)

    for mtime, file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids:
                continue

        try:
            entry = minidom.parse(file)

            # verify that this entry is currently subscribed to and that the
            # number of entries contributed by this feed does not exceed
            # config.new_feed_items
            entry.normalize()
            sources = entry.getElementsByTagNameNS(atomNS, 'source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids:
                    id = ids[0].childNodes[0].nodeValue
                    count[id] = count.get(id, 0) + 1
                    if new_feed_items and count[id] > new_feed_items:
                        continue

                    if id not in sub_ids:
                        ids = sources[0].getElementsByTagName('planet:id')
                        if not ids:
                            continue
                        id = ids[0].childNodes[0].nodeValue
                        if id not in sub_ids:
                            log.warn('Skipping: ' + id)
                        if id not in sub_ids:
                            continue

# Twitter integration
            if config.post_to_twitter():
                url = None
                twitter = None
                title = "Untitled post..."
                links = entry.getElementsByTagName('link')
                if links:
                    for link in links:
                        if link.hasAttribute('rel') and link.hasAttribute(
                                'type') and link.hasAttribute('href'):
                            if (link.getAttribute('rel') == 'alternate' and
                                    link.getAttribute('type') == 'text/html'):
                                url = link.getAttribute('href')
                                break

                titles = entry.getElementsByTagName('title')
                if titles:
                    title = unicode(
                        titles[0].firstChild.nodeValue.encode('utf-8'),
                        'utf-8').strip()
                handles = entry.getElementsByTagName('planet:twitter')
                if (handles):
                    twitter = unicode(
                        handles[0].firstChild.nodeValue.encode('utf-8'),
                        "utf-8")

                if url is not None and url not in posted_urls:
                    #		    log.debug("Going to post URL to Twitter: twitter='{}' title='{}', url='{}'".format(twitter, title, url))
                    txt_append = u''
                    if twitter:
                        txt_append = u" (by @" + twitter.encode(
                            'utf-8').strip() + u")"
                    max_title_len = 280 - 20 - len(txt_append)
                    if (len(title) > max_title_len):
                        title = title[:max_title_len]
                    txt = title + txt_append + u"\n" + url

                    log.debug(u"Text to post '{}'".format(txt))
                    try:
                        posted_urls.add(url)
                        config.twitter_api.update_status(txt)
                    except Exception as ex:
                        log.error(u"Error posting to Twitter: %s", ex)

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items:
                break
        except Exception as ex:
            log.error("Error parsing %s: %s", file, ex)
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_exception(exc_type,
                                      exc_value,
                                      exc_traceback,
                                      limit=2,
                                      file=sys.stdout)

    if config.post_to_twitter():
        with open(posted_urls_file, 'wb') as f:
            pickle.dump(posted_urls, f, protocol=pickle.HIGHEST_PROTOCOL)

    if index:
        index.close()

    return doc
예제 #6
0
def splice():
    """ Splice together a planet from a cache of entries """
    import planet
    log = planet.getLogger(config.log_level(),config.log_format())

    log.info("Loading cached data")
    cache = config.cache_directory()
    dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")
        if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    max_items=max([config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    feed = doc.documentElement

    # insert feed information
    createTextElement(feed, 'title', config.name())
    date(feed, 'updated', time.gmtime())    
    gen = createTextElement(feed, 'generator', config.generator())
    gen.setAttribute('uri', config.generator_uri())

    author = doc.createElement('author')
    createTextElement(author, 'name', config.owner_name())
    createTextElement(author, 'email', config.owner_email())
    feed.appendChild(author)

    if config.feed():
        createTextElement(feed, 'id', config.feed())
        link = doc.createElement('link')
        link.setAttribute('rel', 'self')
        link.setAttribute('href', config.feed())
        if config.feedtype():
            link.setAttribute('type', "application/%s+xml" % config.feedtype())
        feed.appendChild(link)

    if config.link():
        link = doc.createElement('link')
        link.setAttribute('rel', 'alternate')
        link.setAttribute('href', config.link())
        feed.appendChild(link)

    # insert subscription information
    sub_ids = []
    feed.setAttribute('xmlns:planet',planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data=feedparser.parse(filename(sources,sub))
        if data.feed.has_key('id'): sub_ids.append(data.feed.id)
        if not data.feed: continue
        xdoc=minidom.parseString('''<planet:source xmlns:planet="%s"
             xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

    index = idindex.open()

    # insert entry information
    items = 0
    for mtime,file in dir:
        if index != None:
            base = os.path.basename(file)
            if index.has_key(base) and index[base] not in sub_ids: continue

        try:
            entry=minidom.parse(file)

            # verify that this entry is currently subscribed to
            entry.normalize()
            sources = entry.getElementsByTagName('source')
            if sources:
                ids = sources[0].getElementsByTagName('id')
                if ids and ids[0].childNodes[0].nodeValue not in sub_ids:
                    ids = sources[0].getElementsByTagName('planet:id')
                    if not ids: continue
                    if ids[0].childNodes[0].nodeValue not in sub_ids: continue

            # add entry to feed
            feed.appendChild(entry.documentElement)
            items = items + 1
            if items >= max_items: break
        except:
            log.error("Error parsing %s", file)

    if index: index.close()

    return doc