示例#1
0
    def upsert( self, art ):
        """Insert or update an article"""


        # if no separate 'urls' set, create it
        if not 'urls' in art:
            art['urls'] = set((art['permalink'], art['srcurl']))

        # fill in some defaults if missing
        if 'lastscraped' not in art:
            art['lastscraped'] = datetime.now()
        if 'lastseen' not in art:
            art['lastseen'] = datetime.now()
        if 'description' not in art:
            art['description'] = ukmedia.FirstPara(art['content'])

        CheckArticle( art )

        # send text to the DB as utf-8
        title = art['title'].encode( 'utf-8' )
        byline = art[ 'byline' ].encode( 'utf-8' )
        description = art['description'].encode( 'utf-8' )
        pubdate = "%s" %(art['pubdate'])
        lastscraped = "%s" % (art['lastscraped'])
        lastseen = "%s" % (art['lastseen'])
        firstseen = lastseen    # it's a new entry
        srcurl = art['srcurl']
        permalink = art['permalink']
        srcorg = art['srcorg']

        # phasing out srcid...
        if 'srcid' in art:
            srcid = art['srcid']
        else:
            srcid = art['permalink']

        wordcount = None
        content = None
        # does article include content?
        if 'content' in art:
            content = art['content'].encode( 'utf-8' )
            # noddy wordcount
            txt = ukmedia.StripHTML( art['content'] )
            wordcount = len( txt.split() );

        # send to db!
        cursor = DB.conn().cursor()



        updating = False
        if 'id' in art:
            updating = True

        if updating:
            # update existing
            article_id = art['id']
            q = 'UPDATE article SET (title, byline, description, lastscraped, pubdate, lastseen, permalink, srcurl, srcorg, srcid, wordcount, last_comment_check) = (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) WHERE id=%s'
            cursor.execute(q, (title, byline, description, lastscraped, pubdate, lastseen, permalink, srcurl, srcorg, srcid, wordcount, lastscraped, article_id))
        else:
            # insert new
            q = 'INSERT INTO article (title, byline, description, lastscraped, pubdate, firstseen, lastseen, permalink, srcurl, srcorg, srcid, wordcount, last_comment_check) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            cursor.execute( q, ( title, byline, description, lastscraped, pubdate, firstseen, lastseen, permalink, srcurl, srcorg, srcid, wordcount, lastscraped ) )
            # get the newly-allocated id
            cursor.execute( "select currval('article_id_seq')" )
            article_id = cursor.fetchone()[0]


        # add the known urls for the article
        if updating:
            cursor.execute( "DELETE FROM article_url WHERE article_id=%s", (article_id,))
        for url in set(art['urls']):
            cursor.execute( "INSERT INTO article_url (url,article_id) VALUES (%s,%s)", (url,article_id))

        # update content, if included
        if content is None:
            insert_content = False
        else:
            insert_content = True
            if updating:
                # TODO: keep multiple revisions to track changes
                # has the content actually changed?
                cursor.execute("SELECT id FROM article_content WHERE article_id=%s AND content=%s", (article_id,content))
                foo = cursor.fetchall()     # gah... couldn't get cursor.rowcount to work...
                if len(foo)>=1:
                    # no change, so just leave it as is
                    insert_content = False

        if insert_content:
            cursor.execute("DELETE FROM article_content WHERE article_id=%s", (article_id,))
            q = 'INSERT INTO article_content (article_id, content,scraped) VALUES ( %s,%s,%s )'
            cursor.execute(q, (article_id, content, lastscraped))

        # queue it for xapian indexing
        cursor.execute("DELETE FROM article_needs_indexing WHERE article_id=%s", (article_id,))
        cursor.execute("INSERT INTO article_needs_indexing (article_id) VALUES (%s)", (article_id,))

        # if there was a scraper error entry for this article, delete it now
        cursor.execute( "DELETE FROM error_articlescrape WHERE srcid=%s", (srcid,) )

        # if there were images, add them too
        if updating:
            cursor.execute("DELETE FROM article_image WHERE article_id=%s", (article_id,))
        if 'images' in art:
            for im in art['images']:
                cap = im['caption'].encode('utf-8')
                cred = ''
                if 'credit' in im:
                    cred = im['credit'].encode('utf-8')
                cursor.execute("INSERT INTO article_image (article_id,url,caption,credit) VALUES (%s,%s,%s,%s)",
                    (article_id, im['url'], cap, cred))

        # if there were commentlinks, add them too
        if 'commentlinks' in art:
            for c in art['commentlinks']:
                c['source'] = art['srcorgname']
                CommentLink.upsert(article_id, c)

        # add tags
        Tags.generate(article_id, art['content'])

        # attribute journos
        assert 'journos' in art
        cursor.execute("DELETE FROM journo_attr WHERE article_id=%s", (article_id,))
        for journo_id in art['journos']:
            cursor.execute("INSERT INTO journo_attr (journo_id,article_id) VALUES (%s,%s)", (journo_id,article_id))

            # make sure journo activates if they meet the criteria
            Journo.update_activation(journo_id)

            # also clear the html cache for that journos page
            cachename = 'j%s' % (journo_id)
            cursor.execute( "DELETE FROM htmlcache WHERE name=%s", (cachename,) )


        op = 'update' if updating else 'new'
        if insert_content:
            op += ' meta+content'
        else:
            op += ' meta'

        ukmedia.DBUG2( u"%s: %s [a%s %s ] ('%s' %s)\n" % (
            art['srcorgname'] if 'srcorgname' in art else srcorg,
            op,
            article_id,
            art['srcurl'],
            art['byline'],
            ','.join( [ '[j%s]'%(j) for j in art['journos'] ] )
            ))
        return article_id