pvldb-announce.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys
import re
import urllib
import logging
import pytz
import time
import argparse
import sqlite3
import twitter
from datetime import datetime
from datetime import tzinfo
from pprint import pprint
from feedgen.feed import FeedGenerator
from bs4 import BeautifulSoup

## ==============================================
## LOGGING
## ==============================================
LOG = logging.getLogger(__name__)
LOG_handler = logging.StreamHandler()
LOG_formatter = logging.Formatter(fmt='%(asctime)s [%(funcName)s:%(lineno)03d] %(levelname)-5s: %(message)s',
                                  datefmt='%m-%d-%Y %H:%M:%S')
LOG_handler.setFormatter(LOG_formatter)
LOG.addHandler(LOG_handler)
LOG.setLevel(logging.INFO)

## ==============================================
## CONFIGURATION
## ==============================================

RSS_TITLE = 'PVLDB Paper Announcements'
RSS_AUTHOR = {'name':'Andy Pavlo','email':'pavlo@cs.cmu.edu'}
RSS_SUBTITLE = 'Generated by the Carnegie Mellon Database Group'
RSS_FILE = "pvldb-rss.xml"
RSS_URL = "http://db.cs.cmu.edu/files/" + RSS_FILE
BASE_URL = "http://www.vldb.org/pvldb/"

DB_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pvldb.db")

TWITTER_SLEEP_TIME = 1200 # seconds

dateFormat = "%B %Y"
dateRe = re.compile("Volume ([\d]+), No\.[\s]?([\d]+), ([A-Z][a-z]+ [\d]{4})")

SKIP = set([ "vol%d.html" % x for x in xrange(1, 5) ])

VOLUME_LABELS = { }

## ==============================================
## getVolumeUrls
## ==============================================
def getVolumeUrls(url):
    volumes = [ ]
    r = urllib.urlopen(url).read()
    soup = BeautifulSoup(r, "lxml")
    regex = re.compile("vol[\d]+\.html")
    for a in soup.find_all('a'):
        m = regex.match(a["href"])
        if m and not a["href"] in volumes:
            if a["href"] in SKIP:
                logging.warn("Skipping '%s'" % a["href"])
                continue
            volumes.append(a["href"])
    ## FOR
    return [ BASE_URL + x for x in volumes ]
## DEF

## ==============================================
## getPapers
## ==============================================
def getPapers(vol_url):
    LOG.debug("Retreiving papers for %s" % vol_url)
    
    r = urllib.urlopen(vol_url).read()
    soup = BeautifulSoup(r, "lxml")
    
    papers = { }
    for s in soup.find_all('h2'):
        sectionDate = None
        m = dateRe.match(s.text)
        LOG.debug("Processing header '%s'" % s.text)
        if m:
            sectionDate = datetime.strptime(m.groups()[2], dateFormat)
            volume = int(m.groups()[0])
            number = int(m.groups()[1])
            VOLUME_LABELS[sectionDate] = (volume, number)
        assert not sectionDate is None
        
        if not sectionDate in papers:
            papers[sectionDate] = [ ]
        
        ul_search = s.find_next('ul')
        # VOL6 Fix
        if vol_url.find("vol6.html") != -1:
            ul_search = ul_search.find_next('ul')
            
        for u in ul_search:
            skip = False
            try:
                for p in u.parent.find_all('li'):
                    if p.text.find("Editors-in-Chief") != -1:
                        skip = True
                    if skip: continue
                        
                    url = None
                    title = None
                    authors = None
                    
                    link = p.find("a")
                    if link and not p.next_element is None:
                        if link.text.find("Front Matter") != -1:
                            continue
                        authors = p.next_element
                        try:
                            authors = authors.strip()[:-1]
                        except:
                            LOG.error("authors=" + str(type(authors)))
                            LOG.error("authors.text=" + str(authors.text))
                            raise
                        url = link["href"]
                        if not url.startswith("http://"):
                            url = BASE_URL + url
                        title = link.__dict__["contents"][0].replace("\n", " ").strip()
                        
                        #print pprint(dir(link))
                        #print
                    #print
                    if url is None: continue
                    papers[sectionDate].append({
                        "authors":      authors,
                        "title":        title,
                        "volume":       volume,
                        "number":       number,
                        "link":         url,
                        "published":    sectionDate.replace(tzinfo=pytz.utc)
                    })
                ## FOR
                break
            except:
                LOG.error("Unexpected error for section '" + s.text + "'")
                LOG.error("link=" + str(link))
                LOG.error("p=" + str(p))
                #LOG.error("authors=" + str(authors))
                raise
        ## FOR
    ## FOR
    return (papers)
## DEF

## ==============================================
## writeRSS
## ==============================================
def writeRSS(papers, output):
    fg = FeedGenerator()
    fg.id(RSS_URL)
    fg.title(RSS_TITLE)
    fg.subtitle(RSS_SUBTITLE)
    fg.author(RSS_AUTHOR)
    fg.link( href='http://www.vldb.org/pvldb/', rel='alternate' )
    fg.language('en')
    
    for p in papers:
        summary = "%(title)s\nAuthors: %(authors)s\nPVLDB Volume %(volume)d, Number %(number)d" % p
        
        fe = fg.add_entry()
        fe.author(name=p["authors"])
        fe.title(p["title"])
        fe.link(href=p["link"]) 
        fe.id(p["link"])
        fe.published(published=p["published"])
        fe.description(description=summary, isSummary=True)
    ## FOR
    
    atomfeed = fg.atom_str(pretty=True) # Get the ATOM feed as string
    atom_file = os.path.join(output, 'pvldb-atom.xml')
    fg.atom_file(atom_file) # Write the ATOM feed to a file
    LOG.info("Created ATOM '%s'" % atom_file)
    
    rssfeed  = fg.rss_str(pretty=True) # Get the RSS feed as string
    rss_file = os.path.join(output, RSS_FILE)
    fg.rss_file(rss_file) # Write the RSS feed to a file
    LOG.info("Created RSS '%s'" % rss_file)
## DEF

## ==============================================
## postTwitter
## ==============================================
def postTwitter(args, db, paper):
    LOG.info("Posting paper '%s' to twitter!" % paper["title"])
    
    api = twitter.Api(consumer_key=args["twitter_consumer_key"],
                      consumer_secret=args["twitter_consumer_secret"],
                      access_token_key=args["twitter_access_token"],
                      access_token_secret=args["twitter_access_secret"])
    
    #paper["separator"] = u"→".encode('unicode-escape')
      
    tweet = u"Vol:%(volume)d No:%(number)d → %(title)s" % paper
    if len(tweet)+24 > 140:
        remaining = 140 - (len(tweet)+24)
        tweet = tweet[:remaining-3] + u"..."
    tweet += " " + paper["link"]
    
    LOG.debug("%s [Length=%d]" % (tweet, len(tweet)))

    status = api.PostUpdate(tweet)
    LOG.info("Posted tweet [status=%s]", str(status))
    
    cur = db.cursor()
    sql = "UPDATE papers SET twitter = 1 WHERE link = ?"
    cur.execute(sql, (paper["link"], ))
    db.commit()
    
## DEF


## ==============================================
## createDatabase
## ==============================================
def createDatabase():
    db = sqlite3.connect(DB_PATH)
    cur = db.cursor()
    
    sql = """
    CREATE TABLE papers (
        link VARCHAR(255) PRIMARY KEY,
        title TEXT NOT NULL,
        authors TEXT NOT NULL,
        volume INT NOT NULL,
        number INT NOT NULL,
        published DATE NOT NULL,
        twitter INT NOT NULL DEFAULT 0
    );"""
    cur.execute(sql)
    db.commit()
    db.close()
    
## FOR


## ==============================================
## main
## ==============================================
if __name__ == '__main__':
    aparser = argparse.ArgumentParser(description='PVLDB Announcements Script')
    aparser.add_argument('dbpath', help='Database Path')
    aparser.add_argument("--debug", action='store_true')

    ## Collection Parameters
    agroup = aparser.add_argument_group('Collection Parameters')
    agroup.add_argument('--collect', action='store_true', help='Collect results from PVLDB website')

    ## RSS Parameters
    agroup = aparser.add_argument_group('RSS Parameters')
    agroup.add_argument('--rss', action='store_true', help='Genereate RSS/Atom file')
    agroup.add_argument('--rss-path', type=str, help='RSS output directory')

    ## Twitter Parameters
    agroup = aparser.add_argument_group('RSS Parameters')
    agroup.add_argument('--twitter', action='store_true', help='Post announcements on Twitter')
    agroup.add_argument('--twitter-consumer-key', type=str, help='Twitter Consumer Key')
    agroup.add_argument('--twitter-consumer-secret', type=str, help='Twitter Consumer Secret')
    agroup.add_argument('--twitter-access-token', type=str, help='Twitter Access Token Key')
    agroup.add_argument('--twitter-access-secret', type=str, help='Twitter Access Token Secret')
    
    args = vars(aparser.parse_args())

    ## ----------------------------------------------
    
    if args['debug']:
        LOG.setLevel(logging.DEBUG)

    # If they want to post to twitter, make sure they give us all the info
    # that we need to do this
    if args["twitter"]:
        LOG.debug("Checking twitter input arguments")
        for k in args.keys():
            if k.startswith("twitter") and args[k] is None:
                LOG.error("Missing '%s' input parameter for Twitter" % k)
                sys.exit(1)
        ## FOR
    ## IF

    ## ----------------------------------------------
    
    # Create the database if we don't have it
    if not os.path.exists(DB_PATH):
        createDatabase()
    db = sqlite3.connect(DB_PATH)
    cur = db.cursor()
        
    # Get the volume URLs
    if args["collect"]:
        volumes = getVolumeUrls(BASE_URL)
        papers = { }
        for v in volumes:
            try:
                p = getPapers(v)
                papers.update(p)
            except:
                LOG.error("Unexpected error for " + v)
                raise
    
        # Figure out what papers are new
        for d in reversed(sorted(papers.keys())):
            for p in papers[d]:
                sql = "SELECT * FROM papers WHERE link = ?"
                cur.execute(sql, (p["link"],))
                row = cur.fetchone()
                if row is None:
                    LOG.debug("Adding %s" % p["link"])
                    
                    sql = """INSERT INTO papers (
                                link, title, authors, volume, number, published
                            ) VALUES (
                                ?, ?, ?, ?, ?, ?)"""
                    cur.execute(sql, (p["link"], p["title"], p["authors"], p["volume"], p["number"], p["published"],))
            ## FOR
        ## FOR
        db.commit()
    ## IF

    ## Post new papers to Twitter
    if args["twitter"]:
        sql = "SELECT * FROM papers WHERE twitter = 0 ORDER BY link"
        new_papers = [ ]
        for row in cur.execute(sql):
            paper = {
                "link":     row[0],
                "title":    row[1],
                "authors":  row[2],
                "volume":   row[3],
                "number":   row[4],
                "published":row[5],
            }
            new_papers.append(paper)
        ## FOR
        for paper in new_papers:
            postTwitter(args, db, paper)
            LOG.warn("Sleeping for %d seconds..." % TWITTER_SLEEP_TIME)
            time.sleep(TWITTER_SLEEP_TIME)
        ## FOR
    ## IF

    # Always create the RSS files from scratch
    if args["rss"]:
        assert args["rss_path"]
        
        sql = "SELECT * FROM papers ORDER BY volume DESC, number DESC, link"
        papers = [ ]
        for row in cur.execute(sql):
            paper = {
                "link":     row[0],
                "title":    row[1],
                "authors":  row[2],
                "volume":   row[3],
                "number":   row[4],
                "published":row[5],
            }
            papers.append(paper)
        ## FOR
        writeRSS(papers, args["rss_path"])
    ## IF
    
    db.close()
## MAIN