Exemplo n.º 1
0
def tumblr_scraper(base_url,db_name,num_images,start_offset=0,limit=20,url_type='blog'):
    """Scrape single-photo, tagged posts from a Tumblr blog into a database.

    Pages through the blog's posts ``limit`` at a time, starting at
    ``start_offset``. Every post that has exactly one photo and at least one
    tag is stored through ``ts_model`` (``add_tags``, ``add_photo``,
    ``link_tags_photo``) and committed immediately. Scraping stops once at
    least ``num_images`` qualifying posts were seen in this run, or when the
    API returns an empty page.

    Args:
        base_url: Blog URL passed to the Tumblr client as ``blog_url``.
        db_name: Database name handed to ``ts_model.touch_db``.
        num_images: Number of qualifying posts to collect before stopping.
        start_offset: Post offset of the first requested page.
        limit: Number of posts requested per API call.
        url_type: Only ``'blog'`` is supported by this function.

    Raises:
        ValueError: If a page has to be fetched and ``url_type`` is not
            ``'blog'`` (previously this surfaced as a NameError).
    """
    #init with some key
    # NOTE(review): the API credentials are hard-coded in source; they should
    # be rotated and loaded from configuration instead.
    t = tumblpy.Tumblpy(app_key = 'V55FKUe1lMSdx0UyGSFknmO8DoSaeNzT9oByUwOE1Hvp7diQJ7',
                        app_secret = 'TD9eTgRhoo8ceu0cjcF0nROWAAMkst1uAkSx5XuSOjnYxrGq50',
                        callback_url = 'whatever.com/notimportant_now')

    # NOTE(review): the original author marked this OAuth handshake as not
    # needed. The call is kept (its result is unused) so the request sequence
    # is unchanged -- confirm and drop it.
    t.get_authentication_tokens()

    #running
    #get db connection
    print("Connecting to %s" % db_name)
    conn = ts_model.touch_db(db_name)
    try:
        c = conn.cursor()
        #scraping...
        print("Scraping %s" % base_url)
        n = 0
        i = 0
        while n < num_images:
            if url_type != 'blog':
                raise ValueError("Unsupported url_type: %r" % (url_type,))
            #get the posts
            offset = i * limit + start_offset
            print("Get posts %i to %i" % (offset, offset + limit))
            posts = t.get('posts', blog_url=base_url,
                          params={'limit': limit, 'offset': offset})
            i += 1
            if not posts['posts']:
                # An empty page means there is nothing left to fetch; without
                # this check the loop would keep requesting pages forever.
                print("No more posts available from %s" % base_url)
                break
            for p in posts['posts']:
                #some posts don't have photo
                if 'photos' not in p:
                    continue
                #some posts have more than one image, we will ignore that for now
                if len(p['photos']) != 1:
                    continue
                #some posts don't have tag
                if len(p['tags']) == 0:
                    continue
                # If we made it through that, we have a new photo
                n += 1
                note_count = p['note_count']
                # A single tag entry may contain several newline-separated
                # tags; split them and normalise case/whitespace.
                tags = [y.strip().lower() for x in p['tags']
                                          for y in x.split('\n')]
                url = p['photos'][0]['original_size']['url']
                # Report the blog being scraped rather than sys.argv[1], which
                # does not exist when this function is called as a library.
                print("Found %s %i: %s %s" % (base_url, n, url, "#" + " #".join(tags)))
                #if this is slow, switch to batch execute instead
                ts_model.add_tags(c, tags)
                ts_model.add_photo(c, url, note_count)
                ts_model.link_tags_photo(c, tags, url)
                conn.commit()
    finally:
        # Always release the connection, even if the API or DB layer raised.
        conn.close()
Exemplo n.º 2
0
def download(db_name, directory, start_id=0):
    """Download the photos listed in the database into *directory*.

    Iterates over the rows returned by ``ts_model.get_photos(c, start_id)``
    and fetches the URL found at index 3 of each row. URLs ending in
    ``.gif`` are skipped, files that already exist in *directory* are not
    fetched again, and a freshly downloaded file is deleted when ``imghdr``
    does not identify it as PNG or JPEG.

    Args:
        db_name: Database name handed to ``ts_model.touch_db``.
        directory: Target directory; created if it does not exist.
        start_id: Passed through to ``ts_model.get_photos``
            (presumably the first photo id to process -- confirm).
    """
    #get db connection
    print("Connecting to %s" % db_name)
    print(start_id)
    conn = ts_model.touch_db(db_name)
    try:
        c = conn.cursor()
        if not os.path.exists(directory):
            os.makedirs(directory)
        for photo in ts_model.get_photos(c, start_id):
            url = photo[3]
            #no gif (case-insensitive, so ".GIF" is not downloaded and then deleted)
            if url.split('.')[-1].lower() == 'gif':
                continue
            filename = url.split('/')[-1]
            outpath = os.path.join(directory, filename)
            #already downloaded
            if os.path.exists(outpath):
                continue
            print('Downloading %s' % url)
            urlretrieve(url, outpath)
            print('Save as %s' % filename)
            #check file content: the extension alone is not trusted
            image_type = imghdr.what(outpath)
            if image_type not in ('png', 'jpeg'):
                print('File %s has been removed' % filename)
                os.remove(outpath)
    finally:
        # The connection was previously never closed.
        conn.close()
Exemplo n.º 3
0
def tumblr_scraper(base_url,db_name,num_images,start_offset=0,limit=20,url_type='blog',tag=None,timeout=1000):
    """Scrape single-photo, tagged posts from Tumblr into a database.

    Two modes are supported:

    * ``url_type='blog'``: pages through the photo posts of ``base_url`` by
      offset, ``limit`` posts at a time, starting at ``start_offset``.
    * ``url_type='tag'``: walks backwards through posts carrying ``tag``,
      requesting posts before a timestamp that starts at
      ``ts_model.min_time(conn)`` and follows the timestamps of the posts
      that were stored.

    Every post with exactly one photo and at least one tag is stored through
    ``ts_model`` and committed. The loop runs until
    ``ts_model.photo_count(conn)`` reaches ``num_images``; in blog mode it
    also stops on an empty page.

    Args:
        base_url: Blog URL (blog mode).
        db_name: Database name handed to ``ts_model.touch_db``.
        num_images: Target value for ``ts_model.photo_count(conn)``.
        start_offset: Post offset of the first page (blog mode).
        limit: Number of posts requested per API call.
        url_type: ``'blog'`` or ``'tag'``.
        tag: Tag to search for (tag mode).
        timeout: Currently unused; kept for interface compatibility.

    Raises:
        ValueError: If a page has to be fetched and ``url_type`` is neither
            ``'blog'`` nor ``'tag'`` (previously this surfaced as a
            NameError).
    """
    #init with some key
    # NOTE(review): the API credentials are hard-coded in source; they should
    # be rotated and loaded from configuration instead.
    t = tumblpy.Tumblpy(app_key = 'V55FKUe1lMSdx0UyGSFknmO8DoSaeNzT9oByUwOE1Hvp7diQJ7',
                        app_secret = 'TD9eTgRhoo8ceu0cjcF0nROWAAMkst1uAkSx5XuSOjnYxrGq50',
                        callback_url = 'whatever.com/notimportant_now')

    # NOTE(review): the original author marked this OAuth handshake as not
    # needed. The call is kept (its result is unused) so the request sequence
    # is unchanged -- confirm and drop it.
    t.get_authentication_tokens()

    #running
    #get db connection
    print("Connecting to %s" % db_name)
    conn = ts_model.touch_db(db_name)
    try:
        c = conn.cursor()
        tag_mode = url_type == 'tag'
        #scraping...
        if url_type == 'blog':
            print("Scraping %s" % base_url)
        else:
            print("Scraping %s" % tag)
        # In tag mode ``i`` is a timestamp cursor; in blog mode a page index.
        if tag_mode:
            i = ts_model.min_time(conn)
        else:
            i = 0
        existing = ts_model.photo_count(conn)
        if existing > 0:
            print("Database has %i entries" % existing)
        while ts_model.photo_count(conn) < num_images:
            #get the posts
            if url_type == 'blog':
                response = t.get('posts', blog_url=base_url, extra_endpoints='photo',
                                 params={'limit': limit, 'offset': i * limit + start_offset})
                posts = response['posts']
                i = i + 1
                if not posts:
                    # An empty page means there is nothing left to fetch;
                    # without this check the loop would never terminate.
                    print("No more posts available from %s" % base_url)
                    break
            elif tag_mode:
                params = {'limit': limit, 'tag': tag, 'before': i}
                posts = t.get(None, blog_url=None, tag=True, params=params)
            else:
                raise ValueError("Unsupported url_type: %r" % (url_type,))
            oldi = i
            for p in posts:
                #some posts don't have photo
                if 'photos' not in p:
                    continue
                #some posts have more than one image, we will ignore that for now
                if len(p['photos']) != 1:
                    continue
                #some posts don't have tag
                if len(p['tags']) == 0:
                    continue
                # Count before inserting so a change afterwards tells us the
                # photo was actually new to the database.
                n = ts_model.photo_count(conn)
                # Only tag mode uses ``i`` as a timestamp cursor. Keying this
                # on ``tag`` would clobber the blog page index whenever a tag
                # was also supplied in blog mode.
                if tag_mode:
                    i = p['timestamp']
                # A single tag entry may contain several newline-separated
                # tags; split them and normalise case/whitespace.
                tags = [y.strip().lower() for x in p['tags']
                                          for y in x.split('\n')]
                url = p['photos'][0]['original_size']['url']
                # Not every post carries a note count; -1 marks "unknown".
                note_count = p.get('note_count', -1)
                # if this is slow, switch to batch execute instead
                ts_model.add_tags(c, tags)
                ts_model.add_photo(c, url, note_count, p['timestamp'])
                ts_model.link_tags_photo(c, tags, url)

                if n != ts_model.photo_count(conn) and tag:
                    print("@ %i found %s %i (notes=%i): %s %s" % (i, tag, n, note_count, url,
                                                                   "#" + " #".join(tags)))
                conn.commit()
            # Decrement the timestamp if it didn't change. This applies to tag
            # mode only: in blog mode ``i`` is a page index that was already
            # advanced above, and decrementing it produced negative offsets.
            # NOTE(review): if the tag has no older matching posts, this loop
            # has no exit condition -- confirm the desired stop criterion.
            if tag_mode and oldi == i:
                i -= limit
    finally:
        # Always release the connection, even if the API or DB layer raised.
        conn.close()