Example #1
def upsert_rss_pub(article_url, article_slug, value):
  if not db.sismember('article_set', article_url):
    # add it to the set
    db.sadd('article_set', article_url)
    
  key = "%s:article" % article_slug
  db.set(key, value)
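These snippets all reference a module-level redis client named db. A minimal sketch of that setup with redis-py, assuming a local instance (the host, port, and database number here are placeholders, not the original project's configuration):

import redis

# hypothetical connection settings; the original project creates and
# shares its client elsewhere as a module-level `db`
db = redis.StrictRedis(host='localhost', port=6379, db=0)

# example call, with illustrative arguments
upsert_rss_pub('http://example.com/post', 'example-post', '{"title": "Example"}')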
Example #2
def upsert_url(article_url, article_slug, data_source, config):
  if not db.sismember('article_set', article_url):
    # add it to the set
    db.sadd('article_set', article_url)

    # insert metadata
    ts = current_timestamp(config)
    value = json.dumps({
      "url" : article_url,
      "slug": article_slug,
      "timestamp" : ts,
      "data_source": data_source
      })
    
    db.zadd('article_sorted_set', ts, value)
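Note that db.zadd('article_sorted_set', ts, value) uses the pre-3.0 redis-py calling convention (name, score, member); redis-py 3.x instead takes a mapping from member to score. A sketch of the equivalent call under redis-py 3.x:

# redis-py >= 3.0 equivalent of db.zadd('article_sorted_set', ts, value)
db.zadd('article_sorted_set', {value: ts})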
Example #3
def insert_new_post(post_arg_set):
  """
  insert new post into redis
  """
  api, post_data, acct_data, page_id, config = post_arg_set

  try:
    post_id = post_data['id'] if post_data.has_key('id') else None
  except Exception as e:
    print e
  else:
    if is_insights(page_id, config):
      insights_value = get_insights_data(api, page_id, post_id)
    else:
      insights_value = {}
    
    # parse date
    if post_data.has_key('created_time') and post_data['created_time'] is not None:  
      dt = datetime.strptime(post_data['created_time'], FB_DATE_FORMAT)
      date_time = tz_adj(dt, config)
      time_bucket = round_datetime(date_time, config)
      raw_timestamp = int(date_time.strftime("%s"))
    
    else:
      time_bucket = None
      raw_timestamp = None
    
    # extract the message so we can find links within it that aren't in the url field
    article_urls = [get_fb_link(post_data, config, unshorten=True)]
    message = post_data['message'].encode('utf-8') if post_data.has_key('message') else None
    message_urls = get_message_urls(article_urls, message, config)

    # detect article links, unshorten and parse
    article_urls = [
      parse_url(unshorten_link(article_url, config))
      for article_url in article_urls + message_urls
      if article_url is not None and is_article(article_url, config)
    ]

    if article_urls:
      for article_url in set(article_urls):

        # sluggify url
        article_slug = sluggify(article_url)

        # format data
        post_value = {
          'article_slug': article_slug,
          'article_url': article_url,
          'time_bucket': time_bucket,
          'fb_post_created': time_bucket,
          'raw_timestamp': raw_timestamp,
          'fb_raw_link' : get_fb_link(post_data, config=config),
          'fb_page_id': page_id,
          'fb_post_id': post_id,
          'fb_page_likes': acct_data['likes'] if acct_data.has_key('likes') else None,
          'fb_page_talking_about': acct_data['talking_about_count'] if acct_data.has_key('talking_about_count') else None,
          'fb_type': post_data['type'] if post_data.has_key('type') else None,
          'fb_status_type': post_data['status_type'] if post_data.has_key('status_type') else None,
          'fb_message': message
        }
          
        # always insert insights data
        if is_insights(page_id, config):
          print "INFO\tINSIGHTS\tAdding data from %s re: %s" % (page_id, article_slug)
          # name the insights data source for this page
          data_source = "facebook_insights_%s" % page_id
          # upsert url
          upsert_url(article_url, article_slug, data_source, config)

          # insert id
          db.sadd('facebook_post_ids', post_id)

          # strip any 'time_bucket' keys from both dicts before merging;
          # the record is scored by the current time bucket below instead
          current_time_bucket = gen_time_bucket(config)
          insights_value.pop('time_bucket', None)
          post_value.pop('time_bucket', None)
          
          value = json.dumps({
            data_source : dict(post_value.items() + insights_value.items())
          })

          # upload data to redis
          db.zadd(article_slug, current_time_bucket, value)        
            
        # only insert new posts
        elif not db.sismember('facebook_post_ids', post_id):
          
          print "INFO\tFACEBOOK\tnew post %s re: %s" % (post_id, article_slug)
          
          # insert id
          db.sadd('facebook_post_ids', post_id)     
          
          # upsert url
          data_source = "facebook_%s" % page_id
          upsert_url(article_url, article_slug, data_source, config)

          value = json.dumps({
            data_source : dict(post_value.items() + insights_value.items())
          })

          # upload data to redis
          db.zadd(article_slug, time_bucket, value)
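This example leans on Python 2 idioms: has_key(), print statements, and merging dicts by concatenating .items() lists. Under Python 3 that merge fails because dict.items() returns a view; one equivalent rewrite of the merge-and-serialize step:

# Python 3 equivalent of dict(post_value.items() + insights_value.items())
merged = dict(post_value)
merged.update(insights_value)
value = json.dumps({data_source: merged})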
Example #4
def parse_tweet(tweet_arg_set):
  slug, t, config = tweet_arg_set

  # check if id exists
  twt_id = t.id_str
  if not db.sismember('twitter_twt_ids', twt_id):

    # if not, add id to id_set
    db.sadd('twitter_twt_ids', twt_id)
    
    # check for relevant urls
    raw_urls = [u['expanded_url'] for u in t.entities['urls']]

    # parse urls
    article_urls = set([parse_url(unshorten_link(u, config)) for u in raw_urls])

    if any([is_article(u, config) for u in article_urls]):

      # parse dates
      # sometimes t.created_at is a datetime object
      if isinstance(t.created_at, datetime):
        dt = t.created_at
      else:
        dt = datetime.strptime(t.created_at, TWT_DATE_FORMAT)
      
      date_time = tz_adj(dt, config)
      time_bucket = round_datetime(date_time, config) if date_time is not None else None
      
      raw_timestamp = int(date_time.strftime('%s')) if date_time is not None else None
      

      for article_url in article_urls:
        # sluggify url
        article_slug = sluggify(article_url)
        screen_name = t.user.screen_name
        log.info("TWITTER\tNew Tweet %s/%s\t%s" % (screen_name, twt_id, article_url))

        # format data
        value = {
          'article_slug': article_slug,
          'article_url': article_url,
          'time_bucket': time_bucket,
          'raw_timestamp' :  raw_timestamp,
          'twt_list' : slug,
          'twt_post_created': raw_timestamp,
          'twt_id': twt_id,
          'twt_screen_name': t.user.screen_name,
          'twt_text': t.text,
          'twt_followers': t.author.followers_count,
          'twt_friends': t.author.friends_count,
          'twt_lang': t.lang,    
          'twt_raw_links': raw_urls,
          'twt_hashtags': t.entities['hashtags'],
          'twt_user_mentions': t.entities['user_mentions'],
          'twt_in_reply_to_screen_name': t.in_reply_to_screen_name,
          'twt_in_reply_to_status_id_str': t.in_reply_to_status_id_str
        }
        
        data_source = "twitter_%s" % slug
        
        # upsert url
        upsert_url(article_url, article_slug, data_source, config)

        value = json.dumps({ data_source : value})
        
        # add data to redis
        db.zadd(article_slug, time_bucket, value)
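A caveat on the timestamp logic: strftime('%s') is a platform-specific format code (glibc supports it, Windows does not), so the epoch conversion above is not portable. A sketch of a portable alternative, assuming tz_adj returns a timezone-aware datetime:

import calendar

# portable epoch seconds; utctimetuple() converts an aware datetime to UTC
raw_timestamp = calendar.timegm(date_time.utctimetuple())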