def scrape_articles(found, extract, opts):
    """Scrape a list of articles and return error counts.

    found -- list of article contexts to scrape
    extract -- extract function
    opts:
        max_errors -- tolerated number of errors before bailing
        test
        force_rescrape
        etc...
    """
    extralogging = False
    max_errors = getattr(opts, 'max_errors', 0)
    expected_journo = getattr(opts, 'expected_journo', None)

    # remove dupes (eg often articles appear in more than one RSS feed)
    found = unique_articles(found)

    # randomise the order of articles, so that if the scraper does abort
    # due to too many errors, successive runs should be able to pick up
    # all the scrapable articles.
    random.shuffle(found)

    #assert(len(found) > 0)
    ukmedia.DBUG2("%d articles to scrape\n" % (len(found)))

    if opts.test:
        ukmedia.DBUG("DRY RUN\n")

    store = ArticleDB.ArticleDB()

    failcount = 0
    abortcount = 0
    newcount = 0
    had_count = 0
    rescrape_count = 0

    for context in found:
        try:
            known_urls = set((context['srcurl'], context['permalink']))
            got = store.find_article(known_urls)
            if len(got) > 0:
                if extralogging:
                    for article_id in got:
                        ukmedia.DBUG(u"already got %s [a%s] (attributed to: %s)\n" % (
                            context['srcurl'], article_id, GetAttrLogStr(article_id)))
                if not opts.force_rescrape:
                    had_count += 1
                    continue    # skip it - we've already got it
                else:
                    assert(len(got) == 1)

            #ukmedia.DBUG2(u"fetching %s\n" % (context['srcurl']))
            resp = urllib2.urlopen(context['srcurl'])

            # is the server sending a charset encoding?
            kwargs = {}
            content_type = resp.info().getheader('Content-Type', '')
            m = re.compile(r';\s*charset\s*=\s*([^;]*)', re.I).search(content_type)
            if m:
                kwargs['encoding'] = m.group(1)

            # grab the content
            html = resp.read()

            # add any URLs we were redirected via...
            for code, url in resp.redirects:
                known_urls.add(url)
                if code == 301:
                    # permanent redirect
                    context['permalink'] = url

            # check html for a rel="canonical" link
            canonical_url = extract_canonical_url(html, context['permalink'])
            if canonical_url is not None:
                known_urls.add(canonical_url)
                context['permalink'] = canonical_url

            # strip off "?rss=yes" etc from permalink
            tidied_url = tidy_url(context['permalink'])
            if tidied_url != context['permalink']:
                context['permalink'] = tidied_url
                known_urls.add(tidied_url)

            context['urls'] = known_urls

            # check that all urls are OK (eg express.co.uk have a habit of
            # publishing borked ones for blogs)
            for url in known_urls:
                url.encode('utf-8')     # will raise an exception if dud

            # repeat url-based existence check with the urls we now have
            # TODO: if so, add any new urls... maybe rescrape and update article?
            article_id = None
            got = store.find_article(known_urls)
            if len(got) > 0:
                if extralogging:
                    for article_id in got:
                        ukmedia.DBUG(u"already got %s [a%s] (attributed to: %s)\n" % (
                            context['srcurl'], article_id, GetAttrLogStr(article_id)))
                if not opts.force_rescrape:
                    had_count += 1
                    continue    # skip it - we've already got it
                else:
                    assert(len(got) == 1)
                    article_id = got[0]

            # some extra, last minute context :-)
            context['lastscraped'] = datetime.now()

            art = extract(html, context, **kwargs)

            if art:
                # set the srcorg id for the article
                if 'srcorgname' in art and art['srcorgname'] is not None:
                    srcorg = Misc.GetOrgID(art['srcorgname'])
                else:
                    # no publication specified - look up using domain name
                    o = urlparse.urlparse(art['permalink'])
                    domain = o[1].lower()
                    srcorg = Publication.find_or_create(domain)
                art['srcorg'] = srcorg

                # resolve bylined authors to journo ids
                authors = Byline.CrackByline(art['byline'])
                attributed = []
                for author in authors:
                    attributed.append(Journo.find_or_create(author, art, expected_journo))
                art['journos'] = attributed

                if opts.test:
                    ukmedia.PrettyDump(art)

                if article_id:
                    # rescraping existing article
                    art['id'] = article_id
                    article_id = store.upsert(art)
                    rescrape_count += 1
                else:
                    # new article
                    article_id = store.upsert(art)
                    newcount += 1

            if opts.test:
                DB.conn().rollback()
            else:
                DB.conn().commit()

        except Exception, err:
            DB.conn().rollback()

            # always just bail out upon ctrl-c
            if isinstance(err, KeyboardInterrupt):
                raise

            failcount += 1
            # TODO: phase out NonFatal! just get scraper to print out a warning message instead
            if isinstance(err, ukmedia.NonFatal):
                continue

            report = traceback.format_exc()
            if 'title' in context:
                msg = u"FAILED (%s): '%s' (%s)" % (err, context['title'], context['srcurl'])
            else:
                msg = u"FAILED (%s): (%s)" % (err, context['srcurl'])
            ukmedia.DBUG(msg + "\n")
            ukmedia.DBUG2(report + "\n")
            ukmedia.DBUG2('-' * 60 + "\n")

            abortcount += 1
            if abortcount > max_errors:
                print >>sys.stderr, "Too many errors - ABORTING"
                raise
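
# Illustrative sketch only (not part of the scraping pipeline): a hypothetical
# driver showing the attributes scrape_articles() reads from its opts object.
# The _ExampleOpts class and _example_scrape_run() name are invented for this
# example; in normal use opts presumably comes from the tool's option parser.
def _example_scrape_run(found, extract):
    """Hypothetical helper: run scrape_articles() with explicit options.

    found -- list of article context dicts (each needs 'srcurl' and 'permalink')
    extract -- callable invoked as extract(html, context, **kwargs); kwargs may
               carry an 'encoding' hint taken from the HTTP Content-Type header
    """
    class _ExampleOpts:
        test = True             # dry run: DB work is rolled back, articles are pretty-dumped
        force_rescrape = False  # skip articles already in the database
        max_errors = 10         # abort the run once more than this many hard errors occur
        expected_journo = None  # optional hint passed through to Journo.find_or_create()
    return scrape_articles(found, extract, _ExampleOpts())
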
def upsert(self, art):
    """Insert or update an article"""

    # if no separate 'urls' set, create it
    if 'urls' not in art:
        art['urls'] = set((art['permalink'], art['srcurl']))

    # fill in some defaults if missing
    if 'lastscraped' not in art:
        art['lastscraped'] = datetime.now()
    if 'lastseen' not in art:
        art['lastseen'] = datetime.now()
    if 'description' not in art:
        art['description'] = ukmedia.FirstPara(art['content'])

    CheckArticle(art)

    # send text to the DB as utf-8
    title = art['title'].encode('utf-8')
    byline = art['byline'].encode('utf-8')
    description = art['description'].encode('utf-8')
    pubdate = "%s" % (art['pubdate'])
    lastscraped = "%s" % (art['lastscraped'])
    lastseen = "%s" % (art['lastseen'])
    firstseen = lastseen    # it's a new entry
    srcurl = art['srcurl']
    permalink = art['permalink']
    srcorg = art['srcorg']

    # phasing out srcid...
    if 'srcid' in art:
        srcid = art['srcid']
    else:
        srcid = art['permalink']

    wordcount = None
    content = None
    # does article include content?
    if 'content' in art:
        content = art['content'].encode('utf-8')
        # noddy wordcount
        txt = ukmedia.StripHTML(art['content'])
        wordcount = len(txt.split())

    # send to db!
    cursor = DB.conn().cursor()

    updating = False
    if 'id' in art:
        updating = True

    if updating:
        # update existing
        article_id = art['id']
        q = 'UPDATE article SET (title, byline, description, lastscraped, pubdate, lastseen, permalink, srcurl, srcorg, srcid, wordcount, last_comment_check) = (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) WHERE id=%s'
        cursor.execute(q, (title, byline, description, lastscraped, pubdate, lastseen, permalink, srcurl, srcorg, srcid, wordcount, lastscraped, article_id))
    else:
        # insert new
        q = 'INSERT INTO article (title, byline, description, lastscraped, pubdate, firstseen, lastseen, permalink, srcurl, srcorg, srcid, wordcount, last_comment_check) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        cursor.execute(q, (title, byline, description, lastscraped, pubdate, firstseen, lastseen, permalink, srcurl, srcorg, srcid, wordcount, lastscraped))
        # get the newly-allocated id
        cursor.execute("select currval('article_id_seq')")
        article_id = cursor.fetchone()[0]

    # add the known urls for the article
    if updating:
        cursor.execute("DELETE FROM article_url WHERE article_id=%s", (article_id,))
    for url in set(art['urls']):
        cursor.execute("INSERT INTO article_url (url,article_id) VALUES (%s,%s)", (url, article_id))

    # update content, if included
    if content is None:
        insert_content = False
    else:
        insert_content = True
        if updating:
            # TODO: keep multiple revisions to track changes
            # has the content actually changed?
            cursor.execute("SELECT id FROM article_content WHERE article_id=%s AND content=%s", (article_id, content))
            foo = cursor.fetchall()
            # gah... couldn't get cursor.rowcount to work...
            if len(foo) >= 1:
                # no change, so just leave it as is
                insert_content = False

    if insert_content:
        cursor.execute("DELETE FROM article_content WHERE article_id=%s", (article_id,))
        q = 'INSERT INTO article_content (article_id, content, scraped) VALUES (%s,%s,%s)'
        cursor.execute(q, (article_id, content, lastscraped))
        # queue it for xapian indexing
        cursor.execute("DELETE FROM article_needs_indexing WHERE article_id=%s", (article_id,))
        cursor.execute("INSERT INTO article_needs_indexing (article_id) VALUES (%s)", (article_id,))

    # if there was a scraper error entry for this article, delete it now
    cursor.execute("DELETE FROM error_articlescrape WHERE srcid=%s", (srcid,))

    # if there were images, add them too
    if updating:
        cursor.execute("DELETE FROM article_image WHERE article_id=%s", (article_id,))
    if 'images' in art:
        for im in art['images']:
            cap = im['caption'].encode('utf-8')
            cred = ''
            if 'credit' in im:
                cred = im['credit'].encode('utf-8')
            cursor.execute("INSERT INTO article_image (article_id,url,caption,credit) VALUES (%s,%s,%s,%s)", (article_id, im['url'], cap, cred))

    # if there were commentlinks, add them too
    if 'commentlinks' in art:
        for c in art['commentlinks']:
            c['source'] = art['srcorgname']
            CommentLink.upsert(article_id, c)

    # add tags
    Tags.generate(article_id, art['content'])

    # attribute journos
    assert 'journos' in art
    cursor.execute("DELETE FROM journo_attr WHERE article_id=%s", (article_id,))
    for journo_id in art['journos']:
        cursor.execute("INSERT INTO journo_attr (journo_id,article_id) VALUES (%s,%s)", (journo_id, article_id))
        # make sure journo activates if they meet the criteria
        Journo.update_activation(journo_id)
        # also clear the html cache for that journo's page
        cachename = 'j%s' % (journo_id)
        cursor.execute("DELETE FROM htmlcache WHERE name=%s", (cachename,))

    op = 'update' if updating else 'new'
    if insert_content:
        op += ' meta+content'
    else:
        op += ' meta'

    ukmedia.DBUG2(u"%s: %s [a%s %s ] ('%s' %s)\n" % (
        art['srcorgname'] if 'srcorgname' in art else srcorg,
        op,
        article_id,
        art['srcurl'],
        art['byline'],
        ','.join(['[j%s]' % (j) for j in art['journos']])))

    return article_id
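
# Illustrative sketch only: roughly the kind of dict upsert() expects, with
# made-up values. The keys are inferred from the code above; CheckArticle()
# may enforce further constraints not shown here.
#
#   art = {
#       'title': u'Example headline',
#       'byline': u'By A. Reporter',
#       'content': u'<p>Body text...</p>',    # used for wordcount, article_content and tags
#       'pubdate': datetime.now(),
#       'permalink': 'http://www.example.com/news/example-headline',
#       'srcurl': 'http://www.example.com/news/example-headline?rss=yes',
#       'srcorg': 1,       # publication id (scrape_articles() fills this in)
#       'journos': [],     # journo ids resolved from the byline
#       # 'urls', 'description', 'lastscraped' and 'lastseen' are defaulted
#       # by upsert() if missing
#   }
#   article_id = ArticleDB.ArticleDB().upsert(art)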