예제 #1
0
def tumblr_scraper(base_url,db_name,num_images,start_offset=0,limit=20,url_type='blog'):
    """Scrape single-photo, tagged posts from a Tumblr blog into a database.

    Posts are requested ``limit`` at a time, starting at ``start_offset``.
    A post is stored only when it has exactly one photo and at least one tag.
    For each stored post the tags, the photo URL with its note count, and the
    tag/photo links are written through ``ts_model`` and committed.

    The target is only checked between pages, so slightly more than
    ``num_images`` photos may be processed.

    Args:
        base_url: Blog URL passed to the Tumblr API as ``blog_url``.
        db_name: Database name handed to ``ts_model.touch_db``.
        num_images: Number of qualifying photos to collect before stopping.
        start_offset: Post offset of the first page requested.
        limit: Number of posts requested per API call.
        url_type: Only ``'blog'`` is supported.

    Raises:
        ValueError: If ``url_type`` is not ``'blog'``.
    """
    # Previously an unsupported url_type surfaced as a NameError on `posts`.
    if url_type != 'blog':
        raise ValueError("unsupported url_type: %r" % (url_type,))

    # NOTE(review): API credentials are hard-coded here; consider loading them
    # from configuration or the environment instead of source control.
    t = tumblpy.Tumblpy(app_key = 'V55FKUe1lMSdx0UyGSFknmO8DoSaeNzT9oByUwOE1Hvp7diQJ7',
                        app_secret = 'TD9eTgRhoo8ceu0cjcF0nROWAAMkst1uAkSx5XuSOjnYxrGq50',
                        callback_url = 'whatever.com/notimportant_now')

    # NOTE(review): the returned tokens are never used. The call is kept only
    # because it was part of the original flow -- confirm it can be dropped.
    t.get_authentication_tokens()

    print("Connecting to %s" % db_name)
    conn = ts_model.touch_db(db_name)
    try:
        c = conn.cursor()
        print("Scraping %s" % base_url)
        n = 0      # qualifying photos seen so far
        page = 0   # index of the next page to request
        while n < num_images:
            offset = page * limit + start_offset
            print("Get posts %i to %i" % (offset, offset + limit))
            posts = t.get('posts', blog_url=base_url,
                          params={'limit': limit, 'offset': offset})
            page += 1

            # An empty page means we ran past the end of the blog; without
            # this check the loop would request empty pages forever.
            if not posts['posts']:
                break

            for p in posts['posts']:
                # Some posts don't have photos.
                if 'photos' not in p:
                    continue
                # Posts with more than one image are ignored for now.
                if len(p['photos']) != 1:
                    continue
                # Some posts don't have tags.
                if not p['tags']:
                    continue

                n += 1
                note_count = p['note_count']
                # A single tag entry may contain several newline-separated
                # tags; split them and normalise case/whitespace.
                tags = [y.strip().lower() for x in p['tags']
                        for y in x.split('\n')]
                url = p['photos'][0]['original_size']['url']
                # Label the log line with the blog being scraped rather than
                # sys.argv[1], which is absent when used as a library call.
                print("Found %s %i: %s %s" % (base_url, n, url, "#" + " #".join(tags)))
                # If this is slow, switch to a batch execute instead.
                ts_model.add_tags(c, tags)
                ts_model.add_photo(c, url, note_count)
                ts_model.link_tags_photo(c, tags, url)
                conn.commit()
    finally:
        # Release the connection even when an API or DB call fails.
        conn.close()
예제 #2
0
def tumblr_scraper(base_url,db_name,num_images,start_offset=0,limit=20,url_type='blog',tag=None,timeout=1000):
    """Scrape single-photo, tagged Tumblr posts into a database.

    Two modes are supported:

    * ``url_type='blog'``: page through ``base_url`` by offset, ``limit``
      posts at a time starting at ``start_offset``.
    * ``url_type='tag'``: page backwards in time through posts tagged ``tag``,
      using a ``before`` timestamp that starts at ``ts_model.min_time(conn)``.

    A post is stored only when it has exactly one photo and at least one tag.
    Scraping continues until ``ts_model.photo_count(conn)`` reaches
    ``num_images`` or, in blog mode, until the blog returns an empty page.

    Args:
        base_url: Blog URL passed to the Tumblr API as ``blog_url`` (blog mode).
        db_name: Database name handed to ``ts_model.touch_db``.
        num_images: Target number of photos in the database.
        start_offset: Post offset of the first page requested (blog mode).
        limit: Number of posts requested per API call. In tag mode it is also
            the amount subtracted from the timestamp cursor when a page did
            not move it.
        url_type: ``'blog'`` or ``'tag'``.
        tag: Tag to search for (tag mode).
        timeout: Currently unused; kept for interface compatibility.

    Raises:
        ValueError: If ``url_type`` is neither ``'blog'`` nor ``'tag'``.
    """
    # Previously an unsupported url_type surfaced as a NameError on `posts`.
    if url_type not in ('blog', 'tag'):
        raise ValueError("unsupported url_type: %r" % (url_type,))
    tag_mode = url_type == 'tag'

    # NOTE(review): API credentials are hard-coded here; consider loading them
    # from configuration or the environment instead of source control.
    t = tumblpy.Tumblpy(app_key = 'V55FKUe1lMSdx0UyGSFknmO8DoSaeNzT9oByUwOE1Hvp7diQJ7',
                        app_secret = 'TD9eTgRhoo8ceu0cjcF0nROWAAMkst1uAkSx5XuSOjnYxrGq50',
                        callback_url = 'whatever.com/notimportant_now')

    # NOTE(review): the returned tokens are never used. The call is kept only
    # because it was part of the original flow -- confirm it can be dropped.
    t.get_authentication_tokens()

    print("Connecting to %s" % db_name)
    conn = ts_model.touch_db(db_name)
    try:
        c = conn.cursor()
        if tag_mode:
            print("Scraping %s" % tag)
            # Cursor is a timestamp: only posts before it are requested.
            i = ts_model.min_time(conn)
        else:
            print("Scraping %s" % base_url)
            # Cursor is a page index.
            i = 0

        existing = ts_model.photo_count(conn)
        if existing > 0:
            print("Database has %i entries" % existing)

        while ts_model.photo_count(conn) < num_images:
            if tag_mode:
                params = {'limit': limit, 'tag': tag, 'before': i}
                posts = t.get(None, blog_url=None, tag=True, params=params)
            else:
                posts = t.get('posts', blog_url=base_url, extra_endpoints='photo',
                              params={'limit': limit, 'offset': i * limit + start_offset})
                posts = posts['posts']
                i = i + 1
                # An empty page means we ran past the end of the blog; without
                # this check the loop would request empty pages forever.
                if not posts:
                    break

            oldi = i
            for p in posts:
                # Some posts don't have photos.
                if 'photos' not in p:
                    continue
                # Posts with more than one image are ignored for now.
                if len(p['photos']) != 1:
                    continue
                # Some posts don't have tags.
                if not p['tags']:
                    continue

                # Photo count before the insert, compared afterwards to tell
                # whether this post actually added a new row.
                n = ts_model.photo_count(conn)
                # Move the timestamp cursor only in tag mode. Gating this on
                # `tag` alone would overwrite the page index of a blog scrape.
                if tag_mode:
                    i = p['timestamp']
                # A single tag entry may contain several newline-separated
                # tags; split them and normalise case/whitespace.
                tags = [y.strip().lower() for x in p['tags']
                        for y in x.split('\n')]
                url = p['photos'][0]['original_size']['url']
                # -1 marks posts for which the API reported no note count.
                note_count = p.get('note_count', -1)
                # If this is slow, switch to a batch execute instead.
                ts_model.add_tags(c, tags)
                ts_model.add_photo(c, url, note_count, p['timestamp'])
                ts_model.link_tags_photo(c, tags, url)

                if n != ts_model.photo_count(conn) and tag:
                    print("@ %i found %s %i (notes=%i): %s %s" % (i, tag, n, note_count, url,
                                                                   "#" + " #".join(tags)))
                conn.commit()

            # Step the timestamp back if this page did not move it. This must
            # not run in blog mode, where `i` is a page index: it used to push
            # the index (and so the offset) negative after the first page.
            if tag_mode and oldi == i:
                i -= limit
    finally:
        # Release the connection even when an API or DB call fails.
        conn.close()