/
igramSubscription.py
234 lines (178 loc) · 8.86 KB
/
igramSubscription.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
import os
import json
import urllib2
#import jmsCode # JMS STOMP connection wrapper - needs stomp.py
import datetime
#///////////////////////////////////////////////////////////////////////////////////////////////
#
# Set of functions to handle the update payload from an instagram subscription update POST.
#
# The main() seems a bit convoluted, but it handles the possibility of multiple updates in a
# single POST. And then it handles each media item (photo) returned from the GET call to the
# relevant search endpoint.
#
# It also handles the recording of the next URL, so that each call only gets the most recent
# content that has not been retrieved before. It does that by retrieving either a 'min_id' (in the
# case of the geography) or a 'next_url' (in the case of a tag) and storing this for the next time.
#
# The next URL (from geog and from tag) is stored in a text file named according to the object_id
# for the subscription in the /config directory. The code attempts to open this for every update
# and read the next url. If it can't, it just proceeds in getting all that is available.
#
# Media metadata is either put out over JMS (not tested yet) or dumped straight to a file as JSON.
#
# If on dotcloud, check /var/log/supervisor/uwsgi.log for any print outs/errors.
# Also, note that if deploying on dotcloud you will need a custom build to ensure nginx can
# accept big enough POST payloads.
#
#
#///////////////////////////////////////////////////////////////////////////////////////////////
def getNextUrl(p, object_id):
    ''' Return the stored "next" url for a subscription object, or None.

    The url is looked for in a file named str(object_id) that lives in the
    same directory as p.configFile (handleMediaPagination in this module
    writes such files).

    p         : parameters object; only p.configFile is read here.
    object_id : id of the subscription object; used as the file name.

    Returns the file content with surrounding whitespace stripped, or None
    when the file is missing, unreadable or empty, so the caller falls back
    to building the url from the config.
    '''
    path = os.path.join(os.path.dirname(p.configFile), str(object_id))

    # Test the one path directly rather than listing the whole directory;
    # this also ignores a sub-directory that happens to share the name.
    if not os.path.isfile(path):
        return None

    try:
        # 'with' guarantees the handle is closed even if read() raises
        with open(path, 'r') as f:
            url = f.read().strip()
    except (IOError, OSError):
        # Best effort: no stored url means "fetch whatever is available"
        return None

    # An empty file is treated the same as a missing one
    return url or None
#------------------------------------------------------------------------------------------------
def formatMetadata(mediaMeta):
    ''' Add twitter-style duplicate fields to each media item.

    Retrieves those fields that would usefully be stored in the same format
    as twitter do it, so downstream processors can (hopefully) handle the
    data irrespective of source. The original fields are kept; the items are
    modified in place and also returned.

    mediaMeta : parsed response dict holding the media items under 'data'.
                None / empty (e.g. after a failed fetch) is accepted.

    Fields added to every item:
      text        - caption text, or None when there is no caption
      created_at  - 'created_time' (epoch seconds) rendered in UTC as
                    '%a %b %d %H:%M:%S +0000 %Y'
      entities    - dict with 'urls', 'hashtags', 'user_mentions'; each tag
                    becomes {'text': tag, 'indices': []} under 'hashtags'
      geo         - point with coordinates [lat, lon]
      coordinates - point with coordinates [lon, lat]
                    (both hold None values when there is no usable location)

    Returns the list of items ([] when mediaMeta is empty or has no 'data').
    Raises KeyError if an item lacks 'created_time' or 'tags'.
    '''
    if not mediaMeta:
        return []

    epoch = datetime.datetime(1970, 1, 1)
    eventsOut = []

    for data in mediaMeta.get('data') or []:

        # Assign caption text to the 'text' field (caption may be absent or null)
        try:
            data['text'] = data['caption']['text']
        except (KeyError, TypeError):
            data['text'] = None

        # Assign the created_time to created_at. The string is labelled
        # '+0000', so the timestamp must be rendered in UTC, not local time.
        created = epoch + datetime.timedelta(seconds=float(data['created_time']))
        data['created_at'] = created.strftime('%a %b %d %H:%M:%S +0000 %Y')

        # Deal with entities/tags - put each of the tags into the hashtag structure
        entities = {"urls": [], "hashtags": [], "user_mentions": []}
        for tag in data['tags']:
            try:
                text = str(tag)
            except UnicodeError:
                # Python 2: str() cannot encode a non-ASCII unicode tag;
                # keep the tag as-is instead of silently dropping it.
                text = tag
            entities['hashtags'].append({'text': text, 'indices': []})
        data['entities'] = entities

        # Deal with geolocation information (location may be absent or null)
        try:
            lat = float(data['location']['latitude'])
            lon = float(data['location']['longitude'])
        except (KeyError, TypeError, ValueError):
            lat, lon = None, None

        # Note the switcheroo of the lat/lon between these 2 sets
        data["geo"] = {"type": "Point", "coordinates": [lat, lon]}
        data["coordinates"] = {"type": "Point", "coordinates": [lon, lat]}

        eventsOut.append(data)

    return eventsOut
#------------------------------------------------------------------------------------------------
def getMediaUpdates(url):
    ''' Fetch url and parse the response body as JSON.

    url : the endpoint url to GET.

    Returns the parsed JSON, or None (after printing a message) if the
    request or the parsing fails for any reason.
    '''
    try:
        response = urllib2.urlopen(url)
        try:
            mediaMeta = json.loads(response.read())
        finally:
            # Always release the connection, even when the body is not valid JSON
            response.close()
    except Exception:
        # Deliberately broad (network, HTTP and parse errors are all treated as
        # "no data"), but unlike a bare except it lets KeyboardInterrupt and
        # SystemExit through.
        mediaMeta = None
        print("Failed to open this url: \n %s" % url)

    return mediaMeta
#------------------------------------------------------------------------------------------------
def handleMediaPagination(p, url, object_id, mediaMeta):
    ''' Work out the url to use for the next update and store it on disk.

    p         : parameters object; only p.configFile is read (its directory
                is where the url file is written).
    url       : the url that was used to fetch mediaMeta.
    object_id : subscription object id; used as the output file name.
    mediaMeta : parsed response dict, or None if the fetch failed.

    If the 'pagination' entry has a 'next_url' it is stored as-is. Otherwise,
    if it has a 'next_min_id', the stored url is the current url cut at its
    first '&' with '&min_id=<next_min_id>' appended. If neither is present
    nothing is written. Failures are printed, not raised. Returns None.
    '''
    nextUrl = None

    # See if there is a pagination key in the media metadata
    if mediaMeta and 'pagination' in mediaMeta:
        pagination = mediaMeta['pagination'] or {}

        # If it has a next_url, keep that for the next time this gets updated
        if pagination.get('next_url') is not None:
            nextUrl = pagination['next_url']

        # Geography subscriptions just have a next_min_id
        elif pagination.get('next_min_id') is not None:
            # Strip back to the base url. A url read back from a previous run
            # already ends in '&min_id=...'; cut *before* the '&' so the result
            # does not contain '&&'. The first-ever url has no '&' at all.
            amp = url.find('&')
            if amp != -1:
                url = url[:amp]
            nextUrl = "%s&min_id=%s" % (url, pagination['next_min_id'])
    else:
        print("Failed too retrieve either mediaMeta or the pagination key.")

    # Where we've been successful getting the next url, dump it out to a file for next time
    if nextUrl:
        outName = os.path.join(os.path.dirname(p.configFile), str(object_id))
        try:
            # 'with' closes the file even if the write fails part-way
            with open(outName, 'w') as fOut:
                fOut.write(nextUrl)
        except (IOError, OSError, UnicodeError):
            print("Failed to write out next URL for object_id : %s \n %s" % (object_id, nextUrl))

    return
#------------------------------------------------------------------------------------------------
def buildUrl(p, obj, objectId):
    ''' Build the media url for a subscription object from the config templates.

    This gets used when no previously stored "next" url exists for the object.

    p        : parameters object; reads p.geoUrl, p.tagUrl and p.client.
    obj      : subscription object type, 'geography' or 'tag'.
    objectId : id (or tag name) substituted into the template.

    Returns the template with '<geo-id>' / '<tag>' and '<client-id>' replaced.
    Raises ValueError for any other obj value (previously this surfaced as an
    UnboundLocalError on the unassigned url).
    '''
    if obj == 'geography':
        # Swap out the geography id
        url = p.geoUrl.replace('<geo-id>', str(objectId))
    elif obj == 'tag':
        # Swap out the tag
        url = p.tagUrl.replace('<tag>', str(objectId))
    else:
        raise ValueError("Unsupported subscription object type: %r" % (obj,))

    # Sub out the client id for authorisation
    return url.replace('<client-id>', str(p.client))
#------------------------------------------------------------------------------------------------
def main(p, response):
    ''' Handle a subscription update POST: fetch the media and dump it to jms/file.

    p        : parameters object. Reads p.jmsBase, p.fileBase, p.configFile
               and, when p.jmsBase is True, p.jmsHost, p.jmsPort, p.jmsDest.
    response : JSON text of the update message - a list of dicts, each with
               an 'object' and an 'object_id'.

    For every update: use the stored "next" url if there is one, otherwise
    build the url from the config; fetch the media; store the following
    "next" url; then write each photo as '<photo id>.json' next to the config
    file (p.fileBase) and/or send it over JMS (p.jmsBase).

    An update whose fetch failed or whose response has no 'data' is skipped
    rather than aborting the remaining updates. The JMS connection is closed
    even if processing raises.
    '''
    # '== True' is kept on purpose: only a value equal to True enables an
    # output, exactly as before (a truthy string from a config would not).
    useJms = p.jmsBase == True
    useFile = p.fileBase == True

    # Make the JMS connection via STOMP and the jmsCode class
    jms = None
    if useJms:
        import jmsCode
        jms = jmsCode.jmsHandler(p.jmsHost, p.jmsPort, verbose=True)
        jms.connect()

    try:
        # If the config says save it out to file, files go next to the config
        if useFile:
            outDir = os.path.dirname(p.configFile)

        # Accepts a list of dictionaries - the update message
        updates = json.loads(response)

        for upd in updates:

            # Does the next URL already exist for this object?
            url = getNextUrl(p, upd['object_id'])

            # If it hasn't been written to a file, build it from the config
            if url is None:
                url = buildUrl(p, upd['object'], upd['object_id'])

            # Get the media that has changed since the last time
            mediaMeta = getMediaUpdates(url)

            # Find the pagination info and save out the next url for this subscription
            handleMediaPagination(p, url, upd['object_id'], mediaMeta)

            # Nothing usable came back (failed fetch or an error payload):
            # move on to the next update instead of crashing the whole batch.
            if not mediaMeta or 'data' not in mediaMeta:
                continue

            # Format the content - make it like a tweet
            data = formatMetadata(mediaMeta)

            # Loop each photo referenced by this update and save it out and/or message it
            for photo in data:

                # Dump the media metadata out to a string
                jPhoto = json.dumps(photo, ensure_ascii=True)

                # Write the json for this photo out to file
                if useFile:
                    with open(os.path.join(outDir, str(photo['id']) + '.json'), 'w') as f:
                        f.write(jPhoto)

                # Put the metadata onto the JMS
                if useJms:
                    jms.sendData(p.jmsDest, jPhoto, photo['id'])
    finally:
        # Close the jms connection, also when an update blew up part-way
        if jms is not None:
            jms.disConnect()