Example #1
# Imports needed by this snippet (gconfig is the project's local config module)
import os
import sys
import urllib
import datetime
import simplejson
from xml.dom.minidom import parseString, Document
from flickrapi import FlickrAPI
import gconfig

class FlickrQuery(object):
	def __init__(self,flickrapikey,flickrSecret):
		self.api_key = flickrapikey
		self.fapi = FlickrAPI(flickrapikey, flickrSecret)
	def searchbyid(self,eventid):
		photolist = []
		query = " prefix  lode: <http://linkedevents.org/ontology/> \n\
		prefix	dc: <http://purl.org/dc/elements/1.1/> \n\
		prefix	ma: <http://www.w3.org/ns/ma-ont#> \n\
		SELECT ?event ?eventTitle ?URI \n\
		WHERE { \n\
		?event dc:title ?eventTitle. \n\
		?photo lode:illustrate ?event. \n\
		?photo ma:locator ?URI. \n\
		FILTER (?event = <http://data.linkedevents.org/event/eventURI>). \n\
		}  \n\
		"
		query = query.replace('eventURI',eventid)
		searchbase = 'http://eventmedia.eurecom.fr/sparql'
		params = urllib.urlencode({"format": "application/sparql-results+json", "query": query})
		f = urllib.urlopen(searchbase + '?' + params)
		results = simplejson.load(f)
		try:
			results = results['results']['bindings']
		except (KeyError, TypeError):
			return []
		for result in results:
			url = result['URI']['value']
			photolist.append(url)
		return photolist

	def searchbytitle(self,title,time,eventid):
		photolist = []
		t1 = time
		t2 = t1 + datetime.timedelta(days = 5)
		starttime = t1 + datetime.timedelta(hours = - (t1.hour))

		bReturn = 1
		idx = 1
		while (bReturn ==1):
			try:			
				rsp = self.fapi.photos_search(api_key=self.api_key,
										ispublic="1",
										media="photos",
										per_page="250", 
										page=str(idx),
										min_taken_date = str(starttime),
										max_taken_date = str(t2),									
										text = title.encode('utf-8'),
										extras = 'date_upload, date_taken, owner_name, geo, tags, machine_tags, url_m'
									   )
				idx = idx +1
				self.fapi.testFailure(rsp)
				total_images = rsp.photos[0]['total']
				null_test = int(total_images)  # raises (caught below) if 'total' is missing or empty
				
			except:
				null_test = 0
				print sys.exc_info()[0]
				print sys.exc_info()[1]
				print 'Exception encountered while querying title for images\n'
				print type(title), type(title.encode('utf-8'))
			if null_test == 0:
				break
			# keep paging while the reported total exceeds what has been fetched so far
			if null_test >= 250*(idx-1):
				bReturn = 1
			else:
				bReturn = 0
				
			tmpdir = os.path.join(gconfig.metadatadir,'querybytitle')
			if not os.path.exists(tmpdir):
				os.makedirs(tmpdir)
			metadata = os.path.join(tmpdir,'%s_%d.xml' % (eventid,idx-1))
			
			data =  parseString(rsp.xml)
			if not os.path.exists(metadata):
				f = open(metadata,'w')		   
				f.write(data.toprettyxml(encoding='UTF-8'))
				f.close() 
			q = data.getElementsByTagName('photo')
			for p in q:
				url = p.getAttribute('url_m')
				if url.find('.jpg')>0:
					photolist.append(url)
		return photolist
	
	def searchbygeo(self,lat,lng,time,eventid):
		photolist = []
		t1 = time
		t2 = t1 + datetime.timedelta(days = 3)
		starttime = t1 + datetime.timedelta(hours = - (t1.hour))

		bReturn = 1
		idx = 1
		while (bReturn ==1):
			try:			
				rsp = self.fapi.photos_search(api_key=self.api_key,
										ispublic="1",
										media="photos",
										per_page="250", 
										page=str(idx),
										min_taken_date = str(starttime),
										max_taken_date = str(t2),									
										lat  = str(lat),
										lon = str(lng),
										radius = str('0.7'),
										accuracy = '12',
										extras = 'date_upload, date_taken, owner_name, geo, tags, machine_tags, url_m'
									   )
				idx = idx +1
				self.fapi.testFailure(rsp)
				total_images = rsp.photos[0]['total']
				null_test = int(total_images)
				
			except:
				null_test = 0
				print sys.exc_info()[0]
				print sys.exc_info()[1]
				print 'Exception encountered while querying for images\n'
			if null_test == 0:
				break
			if null_test >=250*(idx-1):
				bReturn = 1
			else:
				bReturn = 0
			tmpdir = os.path.join(gconfig.metadatadir,'querybygeo')
			if not os.path.exists(tmpdir):
				os.makedirs(tmpdir)
			metadata = os.path.join(tmpdir,'%s_%d.xml' % (eventid,idx-1))
			data =  parseString(rsp.xml)
			if not os.path.exists(metadata):
				f = open(metadata,'w')		   
				f.write(data.toprettyxml(encoding='UTF-8'))
				f.close() 
			q = data.getElementsByTagName('photo')
			for p in q:
				url = p.getAttribute('url_m')
				if url.find('.jpg')>0:
					photolist.append(url)
		return photolist
	
	def searchbygeoRadius(self,lat,lng,r,stime,etime, eventid):
		photolist = []
		bReturn = 1
		idx = 1
		while (bReturn ==1):
			try:			
				rsp = self.fapi.photos_search(api_key=self.api_key,
										ispublic="1",
										media="photos",
										per_page="250", 
										page=str(idx),
										min_taken_date = str(stime),
										max_taken_date = str(etime),									
										lat  = str(lat),
										lon = str(lng),
										radius = str(r),
										accuracy = '12',
										extras = 'date_upload, date_taken, owner_name, geo, tags, machine_tags, url_m'
									   )
				idx = idx +1
				self.fapi.testFailure(rsp)
				total_images = rsp.photos[0]['total']
				null_test = int(total_images)
				
			except:
				null_test = 0
				print sys.exc_info()[0]
				print sys.exc_info()[1]
				print 'Exception encountered while querying for images\n'
			if null_test == 0:
				break
			if null_test >=250*(idx-1):
				bReturn = 1
			else:
				bReturn = 0
			tmpdir = os.path.join(gconfig.metadatadir,'querybygeo')
			if not os.path.exists(tmpdir):
				os.makedirs(tmpdir)
			metadata = os.path.join(tmpdir,'%s_%d.xml' % (eventid,idx-1))
			data =  parseString(rsp.xml)
			if not os.path.exists(metadata):
				f = open(metadata,'w')		   
				f.write(data.toprettyxml(encoding='UTF-8'))
				f.close() 
			q = data.getElementsByTagName('photo')
			for p in q:
				url = p.getAttribute('url_m')
				if url.find('.jpg')>0:
					photolist.append(url)
		return photolist
	
	def outputlist(self, urls, id, fname):
		fw = open(fname, 'w')
		for url in urls:
			name = url.split('/')[-1]
			fw.write('%s\\%s\n' % (id, name))
		fw.close()

	def geturlbyid(self, ids, urls):
		photos = {}
		results = []
		for p in urls:
			t = p.split('/')[-1]
			t = t.replace('.jpg', '')
			photos[t] = p

		for idx in ids:
			if idx in photos:
				results.append(photos[idx])
		return results
	
	def OutputList(self, listname, lst):
		html = ''
		html += '\n<table align="center" border="1" cellspacing="1" cellpadding="3" width=800><H2>query by %s</H2><tr>' % listname
		num = 0
		for img_file in lst:
			html += '\n<td align="center" valign=top width=30><IMG SRC="%s" width=160 border=1 /></td>' % img_file
			num = num + 1
			if (num % 8) == 0:
				html += '</tr><tr>'  # close the row and start a new one every 8 images
		html += '</tr></table>'
		return html
	
	def OutputJson(self, lst):
		tmp = [{'photo': img_file} for img_file in lst]
		mydict = {}
		mydict['photos'] = tmp
		mydict['number'] = str(len(tmp))
		return mydict
	
	def OutputHtml(self, id, idlist, titlelist, geolist, refinelist):
		f = open(gconfig.outputdir + '/' + '%s.html' % id, 'w')
		head = '<html><head><title> Media illustration </title></head>'
		f.write('%s\n' % head)
		head = '<body BGCOLOR="#FFFFFF"><center><H1>enriching for Event %s</H1><HR WIDTH="50%%"/>' % id
		f.write('%s\n' % head)
		f.write(self.OutputList("machine tag", idlist))
		f.write(self.OutputList("Geo tag", geolist))
		f.write(self.OutputList("title + pruning + refine", refinelist))
		f.write('</center></body></html>\n')
		f.close()

	def OutputXML(self,id,idlist,titlelist,geolist,refinelist):
		fname = gconfig.outputdir + '/' + '%s.xml' % id
		setid = set(idlist)
		setgeo = set(geolist) - setid
		settitle = set(titlelist) - setid
		setrefinelist = set(refinelist) - setid
		sets = [setid,setgeo,settitle,setrefinelist]
		tmpinfo = ["query by ID","query by Geo - ID", "query by Title - ID"," Pruning and Refine"]
		doc = Document()
		query = doc.createElement("query")
		query.setAttribute("id",id)
		doc.appendChild(query)
		results = doc.createElement("PhotoSets")
		for tmpset,info in zip(sets,tmpinfo):
			photoset = doc.createElement("photoset")
			photoset.setAttribute('query',info)
			photoset.setAttribute('photoNum',str(len(tmpset)))
			for photo in tmpset:
				ph = doc.createElement("photo")
				ph.setAttribute('url', photo)
				photoset.appendChild(ph)
			results.appendChild(photoset)
		query.appendChild(results)
		f = open(fname, "w")
		f.write(doc.toprettyxml(encoding='UTF-8'))
		f.close()
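
A minimal driver for the FlickrQuery class above might look like the following sketch; the API key, secret, event id, coordinates, and title are placeholders, and gconfig.metadatadir / gconfig.outputdir must point at writable directories.

if __name__ == '__main__':
	# Hypothetical values -- substitute real Flickr credentials and event data
	fq = FlickrQuery('YOUR_API_KEY', 'YOUR_SECRET')
	eventid = '1234'
	event_time = datetime.datetime(2011, 6, 1)
	byid = fq.searchbyid(eventid)
	bytitle = fq.searchbytitle(u'some event title', event_time, eventid)
	bygeo = fq.searchbygeo(48.86, 2.35, event_time, eventid)
	# the refined list is approximated here by the title results
	fq.OutputHtml(eventid, byid, bytitle, bygeo, bytitle)
	fq.OutputXML(eventid, byid, bytitle, bygeo, bytitle)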
Example #2
     lower_bound = mintime + 900 #lower bound of the upper time limit. must be at least 15 minutes or zero results
     upper_bound = mintime + timeskip * 20 #upper bound of the upper time limit
     maxtime     = .95 * lower_bound + .05 * upper_bound
 
     print '\nBinary search on time range upper bound' 
     print 'Lower bound is ' + str(datetime.fromtimestamp(lower_bound))
     print 'Upper bound is ' + str(datetime.fromtimestamp(upper_bound))
 
     keep_going = 6 #search stops after a fixed number of iterations
     while( keep_going > 0 and maxtime < endtime):
     
         try:
             rsp = fapi.photos_search(api_key=flickrAPIKey,
                                     ispublic="1",
                                     media="photos",
                                     per_page="250", 
                                     page="1",
                                     has_geo = "1", #bbox="-180, -90, 180, 90",
                                     text=query_string,
                                     accuracy="6", #6 is region level.  most things seem 10 or better.
                                     min_upload_date=str(mintime),
                                     max_upload_date=str(maxtime))
                                     ##min_taken_date=str(datetime.fromtimestamp(mintime)),
                                     ##max_taken_date=str(datetime.fromtimestamp(maxtime)))
             #we want to catch these failures somehow and keep going.
             time.sleep(1)
             fapi.testFailure(rsp)
             total_images = rsp.photos[0]['total']
             null_test = int(total_images) #want to make sure this won't crash later on for some reason
             null_test = float(total_images)
     
             print '\nnumimgs: ' + total_images
             print 'mintime: ' + str(mintime) + ' maxtime: ' + str(maxtime) + ' timeskip:  ' + str(maxtime - mintime)
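
The fragment above is the core of the adaptive time-window search used throughout these examples: the scripts narrow the upload-time window until it holds roughly the desired 250 photos, since the full scripts only page through at most 16 pages of 250 results per window. Below is a distilled, self-contained sketch of that idea; count_results is a hypothetical stand-in for the 'total' field of a flickr.photos.search response, not part of the original code.

def find_window(count_results, mintime, endtime, timeskip, desired=250, max_iters=6):
    # Binary-search maxtime so that [mintime, maxtime] holds about `desired` results
    lower = mintime + 900                     # window must be at least 15 minutes wide
    upper = min(mintime + timeskip * 20, endtime)
    maxtime = 0.95 * lower + 0.05 * upper
    iters = max_iters
    while iters > 0 and maxtime < endtime:
        total = count_results(mintime, maxtime)
        if total > desired:                   # too many photos: pull maxtime down
            upper = maxtime
            maxtime = (lower + maxtime) / 2.0
        elif total < desired:                 # too few photos: push maxtime up
            lower = maxtime
            maxtime = (upper + maxtime) / 2.0
        if total > 0:
            iters -= 1                        # converge within a fixed number of steps
        else:
            upper = upper + timeskip          # empty window: widen the upper bound
    return maxtime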
Example #3
        counter = -1
        while( pagenum <= num_visit_pages ):
        #for pagenum in range(1, num_visit_pages + 1):  #page one is searched twice

            print('  page number ' + str(pagenum))

            try:
                print("PAGE")
                print(pagenum)
                # WARNING THIS QUERY HAS TO MATCH THE SEARCH QUERY!!!!
                rsp = fapi.photos_search(api_key=flickrAPIKey,
                                        ispublic="1",
                                        media="photos",
                                        per_page="250",
                                        page=str(pagenum),
                                        has_geo = "0",
                                        text=query_string,
                                        #extras = "tags, original_format, license, geo, date_taken, date_upload, o_dims, views",
                                        #accuracy="6", #6 is region level.
                                        min_upload_date=str(1121832000),#mintime),
                                        max_upload_date=str(1192165200))#maxtime))

                #rsp = fapi.photos_search(api_key=flickrAPIKey,
                #                        ispublic="1",
                #                        media="photos",
                #                        per_page="250",
                #                        page='0', #str(pagenum),
                #                        sort="interestingness-desc",
                #                        has_geo = "0", #bbox="-180, -90, 180, 90",
                #                        text=query_string,
                #                        #accuracy="6", #6 is region level.  most things seem 10 or better.
Example #4
def run_flickr_query_general(query_args, max_photos = 1000, startDate = "1/1/2010", endDate = "31/12/2011"):

    socket.setdefaulttimeout(30)  #30 second time out on sockets before they throw
    #an exception.  I've been having trouble with urllib.urlopen hanging in the 
    #flickr API.  This will show up as exceptions.IOError.

    #the time out needs to be pretty long, it seems, because the flickr servers can be slow
    #to respond to our big searches.

    ###########################################################################
    # Modify this section to reflect your data and specific search 
    ###########################################################################
    # flickr auth information:
    # change these to your flickr api keys and secret

    # make a new FlickrAPI instance
    fapi = FlickrAPI(flickrAPIKey, flickrSecret)

    #print '\n\nquery arguments\n'
    #print query_args
    total_images_queried = 0


    # number of seconds to skip per query  
    timeskip = 8 * 604800  #eight weeks (604800 seconds = one week)

    starttime = convertDate(startDate)
    #mintime = convertDate(startDate) 
    endtime = convertDate(endDate)

    
#    maxtime = startime+20*timeskip
    maxtime = endtime
    mintime = endtime-10*timeskip
    timeskip = min(timeskip, endtime-mintime)

    print 'Start time: ' + str(datetime.fromtimestamp(starttime))
    print 'End time: ' + str(datetime.fromtimestamp(endtime))

    #this is the desired number of photos in each block
    desired_photos = min(250, max_photos)
  
    total_image_num = 0

    results = {}
    print 'starttime: ' + str(starttime)
    print 'mintime: ' + str(mintime)
#    while (maxtime < endtime):
    while (starttime < mintime):

        #new approach - adjust mintime until we get the desired number of images
        #within a block, searching backwards from endtime. We need to keep upper
        #and lower bounds: here the upper bound (maxtime) is well defined, but
        #the lower bound is not.

#        lower_bound = mintime + 900 #lower bound OF the upper time limit. must be at least 15 minutes or zero results
#        upper_bound = mintime + timeskip * 20 #upper bound of the upper time limit
#        upper_bound = min(upper_bound, endtime)
#        maxtime     = .95 * lower_bound + .05 * upper_bound

        lower_bound = mintime - 20 * timeskip #lower bound of the lower time limit
        upper_bound = maxtime #upper bound of the lower time limit
        lower_bound = max(lower_bound, starttime)
        mintime     = 0.05 * lower_bound + 0.95 * upper_bound

#        print '\nBinary search on time range upper bound' 
#        print 'Lower bound is ' + str(datetime.fromtimestamp(lower_bound))
#        print 'Upper bound is ' + str(datetime.fromtimestamp(upper_bound))

        if total_image_num > max_photos:
            print 'Number of photos %d > %d limit.' % (total_image_num, max_photos)
            break

        keep_going = 6 #search stops after a fixed number of iterations
        while( keep_going > 0 and starttime < mintime):
        #while( keep_going > 0 and maxtime < endtime):
        
            try:
#                print 'Calling api'
                rsp = fapi.photos_search(api_key=flickrAPIKey,
                                        ispublic="1",
                                        media="photos",
                                        per_page="250", 
                                        page="1",
                                        #has_geo = "1", #bbox="-180, -90, 180, 90",
                                        #text=query_string,
                                        #accuracy="6", #6 is region level.  most things seem 10 or better.
                                        min_upload_date=str(mintime),
                                        max_upload_date=str(maxtime),
                                        **query_args)
                                        ##min_taken_date=str(datetime.fromtimestamp(mintime)),
                                        ##max_taken_date=str(datetime.fromtimestamp(maxtime)))
                #we want to catch these failures somehow and keep going.
                os_time.sleep(1)
                fapi.testFailure(rsp)
                print rsp
                total_images = rsp.photos[0]['total']
                if total_images == '':
                    total_images = '0'
#                print total_images
#                print rsp.photos[0]
                null_test = int(total_images) #want to make sure this won't crash later on for some reason
                null_test = float(total_images)
        
#                print 'numimgs: ' + total_images
#                print 'mintime: ' + str(mintime) + ' maxtime: ' + str(maxtime) + ' timeskip:  ' + str(maxtime - mintime)
            
                if( int(total_images) > desired_photos ):
#                    print 'too many photos in block, increasing mintime'
                    lower_bound = mintime
                    mintime = (upper_bound + mintime) / 2 #midpoint between current value and lower bound.
#                    print 'too many photos in block, reducing maxtime'
#                    upper_bound = maxtime
#                    maxtime = (lower_bound + mintime) / 2 #midpoint between current value and lower bound.
                
                if( int(total_images) < desired_photos):
#                    print 'too few photos in block, reducing mintime'
                    upper_bound = mintime
                    mintime = (lower_bound + mintime) / 2
#                    print 'too few photos in block, increasing maxtime'
#                    lower_bound = maxtime
#                    maxtime = (upper_bound + maxtime) / 2
                
#                print 'Lower bound is ' + str(datetime.fromtimestamp(lower_bound))
 #               print 'Upper bound is ' + str(datetime.fromtimestamp(upper_bound))
            
                if( int(total_images) > 0): #only if we're not in a degenerate case
                    keep_going = keep_going - 1
                else:
                    upper_bound = upper_bound + timeskip
            
            except KeyboardInterrupt:
                print('Keyboard exception while querying for images, exiting\n')
                raise
#            except:
#                print sys.exc_info()[0]
                #print type(inst)     # the exception instance
                #print inst.args      # arguments stored in .args
                #print inst           # __str__ allows args to printed directly
#                print ('Exception encountered while querying for images\n')

        #end of while binary search    
#        print 'finished binary search'

        print 'mintime: ' + str(datetime.fromtimestamp(mintime)) + ' maxtime: ' + str(datetime.fromtimestamp(maxtime)) + ' numimgs: ' + total_images

        i = getattr(rsp,'photos',None)
        if i:
                
            s = 'numimgs: ' + total_images
            print s
            
            current_image_num = 1
            
            num = int(rsp.photos[0]['pages'])
            s =  'total pages: ' + str(num)
            print s
            
            #only visit 16 pages max, to try and avoid the dreaded duplicate bug
            #16 pages = 4000 images, should be duplicate safe.  Most interesting pictures will be taken.
            
            num_visit_pages = min(16,num)
            
            s = 'visiting only ' + str(num_visit_pages) + ' pages ( up to ' + str(num_visit_pages * 250) + ' images)'
            print s
            
            total_images_queried = total_images_queried + min((num_visit_pages * 250), int(total_images))

            #print 'stopping before page ' + str(int(math.ceil(num/3) + 1)) + '\n'
        
            pagenum = 1
            while( pagenum <= num_visit_pages ):
            #for pagenum in range(1, num_visit_pages + 1):  #page one is searched twice
                print '  page number ' + str(pagenum)
                try:
                    rsp = fapi.photos_search(api_key=flickrAPIKey,
                                        ispublic="1",
                                        media="photos",
                                        per_page="250", 
                                        page=str(pagenum),
                                        sort="interestingness-desc",
                                        #has_geo = "1", #bbox="-180, -90, 180, 90",
                                        #text=query_string,
                                        #accuracy="6", #6 is region level.  most things seem 10 or better.
                                        extras = "tags, original_format, license, geo, date_taken, date_upload, o_dims, views",
                                        min_upload_date=str(mintime),
                                        max_upload_date=str(maxtime),
                                        **query_args)
                                        ##min_taken_date=str(datetime.fromtimestamp(mintime)),
                                        ##max_taken_date=str(datetime.fromtimestamp(maxtime)))
                    os_time.sleep(1)
                    fapi.testFailure(rsp)
                except KeyboardInterrupt:
                    print('Keyboard exception while querying for images, exiting\n')
                    raise
#                except:
#                    print sys.exc_info()[0]
#                    #print type(inst)     # the exception instance
#                    #print inst.args      # arguments stored in .args
#                    #print inst           # __str__ allows args to printed directly
#                    print ('Exception encountered while querying for images\n')
                else:

                    # and print them
                    k = getattr(rsp,'photos',None)
                    if k:
                        m = getattr(rsp.photos[0],'photo',None)
                        if m:
                            for b in rsp.photos[0].photo:
                                if b is not None:
                                    photo_id = b['id']
                                    photo_data = { }
                                    photo_data['id'] = b['id']
                                    photo_data['secret'] = b['secret']
                                    photo_data['server'] = b['server']
                                    photo_data['farm'] = b['farm']

                                    photo_data['owner'] = b['owner']
                                    photo_data['title'] = b['title']
                                    photo_data['originalsecret'] = b['originalsecret']
                                    photo_data['originalformat'] = b['originalformat']
                                    photo_data['o_height'] = b['o_height']
                                    photo_data['o_width'] = b['o_width']
                                    photo_data['datetaken'] = b['datetaken'].encode("ascii","replace")
                                    photo_data['dateupload'] = b['dateupload'].encode("ascii","replace")
                                    photo_data['tags'] = b['tags'].encode("ascii","replace")
                                    photo_data['license'] = b['license'].encode("ascii","replace")
                                    photo_data['latitude'] = b['latitude'].encode("ascii","replace")
                                    photo_data['longitude'] = b['longitude'].encode("ascii","replace")
                                    photo_data['accuracy'] = b['accuracy'].encode("ascii","replace")
                                    photo_data['views'] = b['views']
                                    photo_data['interestingness'] = (current_image_num, total_images)

                                    results[photo_id] = photo_data

                                    current_image_num = current_image_num + 1
                                    total_image_num = total_image_num + 1
                    pagenum = pagenum + 1  #in the try's else block, so it won't increment after a failure

            #this block is indented such that it will only run if there are no exceptions
            #in the original query.  That means if there are exceptions, mintime won't be incremented
            #and it will try again
#            timeskip = maxtime - mintime #used for initializing next binary search
#            mintime  = maxtime

        timeskip = maxtime - mintime #used for initializing next binary search
        maxtime  = mintime


    return results
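
A call to run_flickr_query_general might look like the following sketch; flickrAPIKey and flickrSecret are assumed to be defined at module level, and the query_args values shown are just placeholder flickr.photos.search parameters.

# Hypothetical invocation -- 'tags' and 'tag_mode' are forwarded via **query_args
query_args = {'tags': 'notredame', 'tag_mode': 'all'}
photos = run_flickr_query_general(query_args, max_photos=500,
                                  startDate="1/1/2010", endDate="31/12/2011")
print 'retrieved %d photos' % len(photos)
for photo_id, data in photos.items()[:3]:
    print photo_id, data['title']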
Example #5
        counter = -1
        while (pagenum <= num_visit_pages):
            #for pagenum in range(1, num_visit_pages + 1):  #page one is searched twice

            print '  page number ' + str(pagenum)

            try:
                print("PAGE")
                print(pagenum)
                # WARNING THIS QUERY HAS TO MATCH THE SEARCH QUERY!!!!
                rsp = fapi.photos_search(
                    api_key=flickrAPIKey,
                    ispublic="1",
                    media="photos",
                    per_page="250",
                    page=str(pagenum),
                    has_geo="0",
                    text=query_string,
                    #extras = "tags, original_format, license, geo, date_taken, date_upload, o_dims, views",
                    #accuracy="6", #6 is region level.
                    min_upload_date=str(1121832000),  #mintime),
                    max_upload_date=str(1192165200))  #maxtime))

                #rsp = fapi.photos_search(api_key=flickrAPIKey,
                #                   ispublic="1",
                #                  media="photos",
                #                 per_page="250",
                #                page='0', #str(pagenum),
                #               sort="interestingness-desc",
                #              has_geo = "0", #bbox="-180, -90, 180, 90",
                #             text=query_string,
                #            #accuracy="6", #6 is region level.  most things seem 10 or better.
Example #6
def run_flickr_query(query_string, max_photos = 1000, startDate = "1/1/2010", endDate = "31/12/2011"):

    socket.setdefaulttimeout(30)  #30 second time out on sockets before they throw
    #an exception.  I've been having trouble with urllib.urlopen hanging in the 
    #flickr API.  This will show up as exceptions.IOError.

    #the time out needs to be pretty long, it seems, because the flickr servers can be slow
    #to respond to our big searches.

    ###########################################################################
    # Modify this section to reflect your data and specific search 
    ###########################################################################
    # flickr auth information:
    # change these to your flickr api keys and secret

    # make a new FlickrAPI instance
    fapi = FlickrAPI(flickrAPIKey, flickrSecret)

    print '\n\nquery_string is ' + query_string
    total_images_queried = 0


    # number of seconds to skip per query  
    #timeskip = 62899200 #two years
    timeskip = 604800  #one week
    #timeskip = 172800  #two days
    #timeskip = 86400 #one day
    #timeskip = 3600 #one hour
    #timeskip = 2257 #for resuming previous query

    #mintime = 1121832000 #from im2gps
    #mintime = 1167407788 # resume crash england
    #mintime = 1177828976 #resume crash japan
    #mintime = 1187753798 #resume crash greece
    #mintime = 1171416400 #resume crash WashingtonDC
    mintime = 1287878400 # 10/24/2010
    maxtime = mintime+timeskip
    #endtime = 1192165200  #10/12/2007, at the end of im2gps queries
    endtime = 1351100325 # 10/24/2012

    #this is the desired number of photos in each block
    desired_photos = 250
    print "hola"

    print datetime.fromtimestamp(mintime)
    print datetime.fromtimestamp(endtime)

    total_image_num = 0

    while (maxtime < endtime):

        #new approach - adjust maxtime until we get the desired number of images 
        #within a block. We'll need to keep upper bounds and lower
        #lower bound is well defined (mintime), but upper bound is not. We can't 
        #search all the way from endtime.

        lower_bound = mintime + 900 #lower bound OF the upper time limit. must be at least 15 minutes or zero results
        upper_bound = mintime + timeskip * 20 #upper bound of the upper time limit
        maxtime     = .95 * lower_bound + .05 * upper_bound

        print '\nBinary search on time range upper bound' 
        print 'Lower bound is ' + str(datetime.fromtimestamp(lower_bound))
        print 'Upper bound is ' + str(datetime.fromtimestamp(upper_bound))

        if total_image_num > max_photos:
            print 'Number of photos %d > %d limit.' % (total_image_num, max_photos)
            break

        keep_going = 6 #search stops after a fixed number of iterations
        while( keep_going > 0 and maxtime < endtime):
        
            try:
                rsp = fapi.photos_search(api_key=flickrAPIKey,
                                        ispublic="1",
                                        media="photos",
                                        per_page="250", 
                                        page="1",
                                        #has_geo = "1", #bbox="-180, -90, 180, 90",
                                        text=query_string,
                                        #accuracy="6", #6 is region level.  most things seem 10 or better.
                                        min_upload_date=str(mintime),
                                        max_upload_date=str(maxtime))
                                        ##min_taken_date=str(datetime.fromtimestamp(mintime)),
                                        ##max_taken_date=str(datetime.fromtimestamp(maxtime)))
                #we want to catch these failures somehow and keep going.
                time.sleep(1)
                fapi.testFailure(rsp)
                total_images = rsp.photos[0]['total']
                null_test = int(total_images) #want to make sure this won't crash later on for some reason
                null_test = float(total_images)
        
                print '\nnumimgs: ' + total_images
                print 'mintime: ' + str(mintime) + ' maxtime: ' + str(maxtime) + ' timeskip:  ' + str(maxtime - mintime)
            
                if( int(total_images) > desired_photos ):
                    print 'too many photos in block, reducing maxtime'
                    upper_bound = maxtime
                    maxtime = (lower_bound + maxtime) / 2 #midpoint between current value and lower bound.
                
                if( int(total_images) < desired_photos):
                    print 'too few photos in block, increasing maxtime'
                    lower_bound = maxtime
                    maxtime = (upper_bound + maxtime) / 2
                
                print 'Lower bound is ' + str(datetime.fromtimestamp(lower_bound))
                print 'Upper bound is ' + str(datetime.fromtimestamp(upper_bound))
            
                if( int(total_images) > 0): #only if we're not in a degenerate case
                    keep_going = keep_going - 1
                else:
                    upper_bound = upper_bound + timeskip
            
            except KeyboardInterrupt:
                print('Keyboard exception while querying for images, exiting\n')
                raise
            except:
                print sys.exc_info()[0]
                #print type(inst)     # the exception instance
                #print inst.args      # arguments stored in .args
                #print inst           # __str__ allows args to printed directly
                print ('Exception encountered while querying for images\n')

        #end of while binary search    
        print 'finished binary search'
        
        s = '\nmintime: ' + str(mintime) + ' maxtime: ' + str(maxtime)
        print s
        out_file.write(s + '\n') 

        i = getattr(rsp,'photos',None)
        if i:
                
            s = 'numimgs: ' + total_images
            print s
            out_file.write(s + '\n')
            
            current_image_num = 1
            
            num = int(rsp.photos[0]['pages'])
            s =  'total pages: ' + str(num)
            print s
            out_file.write(s + '\n')
            
            #only visit 16 pages max, to try and avoid the dreaded duplicate bug
            #16 pages = 4000 images, should be duplicate safe.  Most interesting pictures will be taken.
            
            num_visit_pages = min(16,num)
            
            s = 'visiting only ' + str(num_visit_pages) + ' pages ( up to ' + str(num_visit_pages * 250) + ' images)'
            print s
            out_file.write(s + '\n')
            
            total_images_queried = total_images_queried + min((num_visit_pages * 250), int(total_images))

            #print 'stopping before page ' + str(int(math.ceil(num/3) + 1)) + '\n'
        
            pagenum = 1
            while( pagenum <= num_visit_pages ):
            #for pagenum in range(1, num_visit_pages + 1):  #page one is searched twice
                print '  page number ' + str(pagenum)
                try:
                    rsp = fapi.photos_search(api_key=flickrAPIKey,
                                        ispublic="1",
                                        media="photos",
                                        per_page="250", 
                                        page=str(pagenum),
                                        sort="interestingness-desc",
                                        #has_geo = "1", #bbox="-180, -90, 180, 90",
                                        text=query_string,
                                        #accuracy="6", #6 is region level.  most things seem 10 or better.
                                        extras = "tags, original_format, license, geo, date_taken, date_upload, o_dims, views",
                                        min_upload_date=str(mintime),
                                        max_upload_date=str(maxtime))
                                        ##min_taken_date=str(datetime.fromtimestamp(mintime)),
                                        ##max_taken_date=str(datetime.fromtimestamp(maxtime)))
                    time.sleep(1)
                    fapi.testFailure(rsp)
                except KeyboardInterrupt:
                    print('Keyboard exception while querying for images, exiting\n')
                    raise
                except:
                    print sys.exc_info()[0]
                    #print type(inst)     # the exception instance
                    #print inst.args      # arguments stored in .args
                    #print inst           # __str__ allows args to printed directly
                    print ('Exception encountered while querying for images\n')
                else:

                    # and print them
                    k = getattr(rsp,'photos',None)
                    if k:
                        m = getattr(rsp.photos[0],'photo',None)
                        if m:
                            for b in rsp.photos[0].photo:
                                if b is not None:
                                    out_file.write('photo: ' + b['id'] + ' ' + b['secret'] + ' ' + b['server'] + ' ' + b['farm'] + '\n')
                                    out_file.write('owner: ' + b['owner'] + '\n') 
                                    out_file.write('title: ' + b['title'].encode("ascii","replace") + '\n')
                                    
                                    out_file.write('originalsecret: ' + b['originalsecret'] + '\n')
                                    out_file.write('originalformat: ' + b['originalformat'] + '\n')
                                    out_file.write('o_height: ' + b['o_height'] + '\n')
                                    out_file.write('o_width: ' + b['o_width'] + '\n')
                                    out_file.write('datetaken: ' + b['datetaken'].encode("ascii","replace") + '\n')
                                    out_file.write('dateupload: ' + b['dateupload'].encode("ascii","replace") + '\n')
                                    
                                    out_file.write('tags: ' + b['tags'].encode("ascii","replace") + '\n')
                                    
                                    out_file.write('license: ' + b['license'].encode("ascii","replace") + '\n')
                                    out_file.write('latitude: '  + b['latitude'].encode("ascii","replace") + '\n')
                                    out_file.write('longitude: ' + b['longitude'].encode("ascii","replace") + '\n')
                                    out_file.write('accuracy: '  + b['accuracy'].encode("ascii","replace") + '\n')
                                    
                                    out_file.write('views: ' + b['views'] + '\n')
                                    out_file.write('interestingness: ' + str(current_image_num) + ' out of ' + str(total_images) + '\n');
                                    out_file.write('\n')
                                    current_image_num = current_image_num + 1
                                    total_image_num = total_image_num + 1
                    pagenum = pagenum + 1  #in the try's else block, so it won't increment after a failure

            #this block is indented such that it will only run if there are no exceptions
            #in the original query.  That means if there are exceptions, mintime won't be incremented
            #and it will try again
            timeskip = maxtime - mintime #used for initializing next binary search
            mintime  = maxtime

    out_file.write('Total images queried: ' + str(total_images_queried) + '\n')
    out_file.close()
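
Note that run_flickr_query reports through the module-level out_file handle instead of returning results, so the file must be opened before the call (the function closes it when done). A sketch under that assumption; the filename and query string are placeholders, and the hard-coded mintime/endtime inside the function take precedence over startDate/endDate:

# Hypothetical driver -- output_filename and out_file are module-level globals
output_filename = 'query_results.txt'
out_file = open(output_filename, 'w')
run_flickr_query('notre dame paris', max_photos=2000)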
Example #7
out_file = open(output_filename,'w')

    fapi = FlickrAPI(flickrAPIKey, flickrSecret)

    for i in range(1, 5):

        page_nu = str(i)

        try:
            rsp = fapi.photos_search(api_key  = flickrAPIKey,
                                     ispublic = "1",
                                     media    = "photos",
                                     per_page = "250", #seems like it is max
                                     page     = page_nu,
                                     text     = query_string)
            time.sleep(1)
            fapi.testFailure(rsp)
            total_images = rsp.photos[0]['total']

            for b in rsp.photos[0].photo:
                if b is not None:
                    out_file.write('photo: ' + b['id'] + ' ' + b['secret'] + ' ' + b['server'] + '\n')
                    out_file.write('owner: ' + b['owner'] + '\n')
                    out_file.write('title: ' + b['title'].encode("ascii","replace") + '\n')
                    out_file.write('tags: ' + b['tags'].encode("ascii","replace") + '\n')


                    out_file.write('\n')
Example #8
    fapi = FlickrAPI(flickrAPIKey, flickrSecret)

    for i in range(1, 5):

        page_nu = str(i)

        try:
            rsp = fapi.photos_search(
                api_key=flickrAPIKey,
                ispublic="1",
                media="photos",
                per_page="250",  #seems like it is max
                page=page_nu,
                text=query_string)
            time.sleep(1)
            fapi.testFailure(rsp)
            total_images = rsp.photos[0]['total']

            for b in rsp.photos[0].photo:
                if b is not None:
                    out_file.write('photo: ' + b['id'] + ' ' + b['secret'] +
                                   ' ' + b['server'] + '\n')
                    out_file.write('owner: ' + b['owner'] + '\n')
                    out_file.write('title: ' +
                                   b['title'].encode("ascii", "replace") +
                                   '\n')