/
seed.py
364 lines (257 loc) · 12.9 KB
/
seed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
from sqlalchemy import func
from server import app
from datetime import datetime
from os import environ
from model import (connect_db, db, Book, Genre, Author, User, UserBook, BookGenre, BookAuthor)
from sqlalchemy.orm.exc import NoResultFound
from goodreads_oauth import initialize_goodreads_session
import xmltodict, requests
def load_book_info():
"""Load information to book, author and book_authors tables in db."""
for i in range(1, 6):
print "Loading books.."
books = book_api_request('http://www.feedbooks.com/books/top.atom?page=' + str(i))
for book in books:
title, release_year, summary, download_url, book_cover = get_book_metadata(book)
book_added = add_book_metadata(title, release_year, summary, download_url, book_cover)
if book_added != None:
get_author(book, book_added)
get_genre(book, book_added)
def book_api_request(api_url):
"""Make API request, get a response in text format, parse it using xmltodict."""
response = requests.get(api_url)
# Turn response.content (string) into dictionary
metadata = xmltodict.parse(response.text)
# List of 'entry' objects from 'feed'dictionary. Each 'entry' is a book (dict).
books = metadata['feed']['entry']
return books
def get_book_metadata(book):
""" Gets book metadata from API."""
title = book['title']
release_year = book['dcterms:issued']
summary = book['summary'].encode("utf-8")
download_url = book['link'][1]['@href']
book_cover = book['link'][4]['@href']
return [title, release_year, summary, download_url, book_cover]
def add_book_metadata(title, release_year, summary, download_url, book_cover):
"""Creates Book object and adds metadata to db."""
if title.encode('utf-8') not in ("The Public Domain: Enclosing the Commons of the Mind", "Password Incorrect"):
# Creates an instance of the Book class
book_add = Book(title=title,
release_year=release_year,
url=download_url,
book_cover=book_cover,
mini_summary=summary)
# Add/commit book so it gets added to the table.
db.session.add(book_add)
db.session.commit()
return book_add
def get_genre(book, book_add):
""" Gets genre info from book entry."""
#TODO: Refactor and add try/except instead of if not category in book.
if not 'category' in book:
# If there is no category key, continue without adding genre info.
pass
elif isinstance(book['category'], dict):
genre = book['category']['@label']
genre_added = add_genre(genre)
link_genre_book(genre_added, book_add)
else:
book_genres = book['category'] # List of dictionaries
for genre in book_genres:
# Grabs genre from dict
book_genre = genre['@label']
genre_added = add_genre(book_genre)
link_genre_book(genre_added, book_add)
def add_genre(genre_to_add):
""" Creates Genre object, check if it already exists in db & adds it."""
# Creates an instance of the Author class
add_genre = Genre(genre=genre_to_add)
try:
# Check if genre exists & do not add them if so.
check_genre = Genre.query.filter(Genre.genre == genre_to_add).one()
except NoResultFound:
# If we get this error, then genre doesn't exist yet. Add/commit to db
db.session.add(add_genre)
db.session.commit()
check_genre = Genre.query.filter(Genre.genre == genre_to_add).one()
return check_genre
def link_genre_book(genre_added, book_add):
"""Creates a BookGenre object and adds it to db."""
# Creates an instance of the BookGenre class
book_genre_link = BookGenre(book_id=book_add.book_id,
genre_id=genre_added.genre_id)
# Add/commit instance to BookGenre association table.
db.session.add(book_genre_link)
db.session.commit()
def get_author(book, book_add):
""" Gets author info from book."""
if isinstance(book['author'], dict):
author_name = book['author']['name'].encode('utf-8')
author_added = add_author(author_name)
link_author_book(author_added, book_add)
else:
# List of dictionaries
book_authors = book['author']
for author in book_authors:
# Grabs author from dict obj
author_name = author['name'].encode('utf-8')
author_added = add_author(author_name)
link_author_book(author_added, book_add)
def add_author(author_name):
""" Creates Author object, check if it already exists in db & adds it."""
# Creates an instance of the Author class
author = Author(name=author_name)
try:
# Check if author exists & do not add them if so.
check_author = Author.query.filter(Author.name == author_name).one()
except NoResultFound:
# If we get this error, then author doesn't exist yet. Add/commit to db
db.session.add(author)
db.session.commit()
check_author = Author.query.filter(Author.name == author_name).one()
return check_author
def link_author_book(author_added, book_add):
"""Creates a BookAuthor object and adds it to db."""
# Creates an instance of the BookAuthor class
book_author_link = BookAuthor(book_id=book_add.book_id,
author_id=author_added.author_id)
# Add/commit instance to BookAuthor association table.
db.session.add(book_author_link)
db.session.commit()
def get_goodreads_book_id():
"""Query db for book titles, search Goodreads and get their goodreads_id."""
#TODO: Add more comments to the goodreads_id functions.
app_key = environ["KEY"]
all_books = db.session.query(Book).order_by(Book.book_id).all()
for book in all_books:
print "\n~~~~~~~~~~~~~~~~~~~~~~Searching goodreads for book id", book.book_id, ", ", book.title, 'author: ', book.author[0].name.encode('ascii', 'ignore')
payload = {'key': app_key, 'q': book.title}
response = requests.get("https://www.goodreads.com/search/index.xml", params=payload)
search_result = xmltodict.parse(response.text)
all_matches = search_result['GoodreadsResponse']['search']['results']['work']
print 'found', len(all_matches), 'matches'
matchfound = False
for match in all_matches:
print '\tlooking at gooodreads author ', match['best_book']['author']['name'].encode('ascii', 'ignore')
goodreads_lastname = match['best_book']['author']['name'].encode('ascii', 'ignore').split()[-1]
database_lastname = book.author[0].name.encode('ascii', 'ignore').split()[-1]
if goodreads_lastname == database_lastname:
if match['ratings_count']['#text'] > 1000:
goodreads_id = match['best_book']['id']['#text']
add_goodreads_book_id(book.title, goodreads_id)
print "found a suitable match, moving on"
matchfound = True
else:
print '************ ERROR!!!!! found a match but there are only', match['ratings_count']['#text'], 'ratings'
break
# if we got here and no match, :-(
if not matchfound:
print '***************** ERROR!!!!! no match found! boo hoo.'
def load_goodreads_userinfo():
app_key = environ['KEY']
for i in range(1, 153):
print "Loading users.."
get_goodreads_userinfo('https://www.goodreads.com/group/members/40148.xml?sort=last_online&key=' + app_key + '&page=' + str(i))
def get_goodreads_userinfo(api_url):
""" Get user id and name from goodreads db."""
response = requests.get(api_url)
data = xmltodict.parse(response.text) # Turn response.text (string) into dictionary
users = data['GoodreadsResponse']['group_users']['group_user']
for user in users:
goodreads_id = user['user']['id']['#text']
user_name = user['user']['first_name'][:63].encode('latin-1', 'ignore')[:63]
add_userinfo(goodreads_id, user_name)
def add_userinfo(goodreads_id, user_name):
""" Add user info to db."""
add_user = User(name=user_name,
goodreads_id=goodreads_id)
# Add/commit book so it gets added to the table.
db.session.add(add_user)
db.session.commit()
def add_goodreads_book_id(book_title, add_goodreads_id):
""" Add goodreads book_id to books table."""
search_book = Book.query.filter(Book.title == book_title).first()
search_book.goodreads_id = add_goodreads_id
db.session.commit()
def load_user_ratings():
"""Query db for goodreads info on book/users. Locate users w/ ratings for those books & load db."""
all_books = db.session.query(Book.book_id, Book.goodreads_id).all() # list of tuples
all_users = db.session.query(User.user_id, User.goodreads_id).all()
convert_books = dict(all_books) # convert list of tuples to dictionary pairs
convert_users = dict(all_users)
oauth_session = initialize_goodreads_session()
# loop through dictionary, where user = (user_id, goodreads_id). See all_books query for reference
for user in convert_users.iteritems():
# num to be used to go through each page of user reviews in payload for request
for num in range(1, 6):
print "Going through page" + str(num) + "..."
get_user_ratings(oauth_session, user, num, convert_books)
def get_user_ratings(oauth_session, user, num, convert_books):
# unpacking tuple (user_id from db, gr_uid is goodreads_id for that user from db)
user_id, gr_uid = user
payload = {'id': gr_uid, 'page': num, 'per_page': 200, 'v': 2}
response = oauth_session.get('https://www.goodreads.com/review/list.xml', params=payload)
try:
data = xmltodict.parse(response.text)
try:
# List of book objects as dictionaries
books = data['GoodreadsResponse']['reviews']['review']
for book in books:
print "Looping through each book on their page..."
# get goodreads id and rating number (0-5) for book from response
id = int(book['book']['id']['#text'])
rating = int(book['rating'])
# checks if the user has read the book, 0 = not read
if rating > 0:
if id in convert_books.itervalues(): # check if book exists in our db of books
# find key (book_id) using value (goodreads_id) from dict.
book_id = convert_books.keys()[convert_books.values().index(id)]
# turn unicode date into string, split on spaces
# Note that grabbing the date from response turns the time to 00:00:00.
# ex: u'Tue Oct 25 11:23:19 -0700 2016' --> 'Sun Nov 06 00:00:00 -0700 2016'
date = str(book['date_updated']).split()
# reformat date to 'Sun Nov 06 2016' removing 00:00:00 time
date = date[0] + " " + date[1] + " " + date[2] + " " + date[5]
# turn date string to a datetime object
updated_at = datetime.strptime(date, "%a %b %d %Y")
book_read = True
# calls function to create UserBook instance, add/commit to db.
add_rating(user_id, book_id, book_read, rating, updated_at)
except KeyError:
pass
except xml.parsers.expat.ExpatError:
pass
def add_rating(user_id, book_id, book_read, rating, updated_at):
""" Add rating to UserBook table. Links book_id and user_id together."""
# create a UserBook instance to add to db.
add_rating = UserBook(user_id=user_id,
book_id=book_id,
book_read=book_read,
rating=rating,
updated_at=updated_at)
db.session.add(add_rating)
db.session.commit()
def fix_autoinc_value_user_id():
"""Set value for the next user_id after seeding database"""
# Gets the max value in the user_id column from the users table
result = db.session.query(func.max(User.user_id)).one()
max_id = int(result[0])
# Sets the value for the next user_id to be max_id + 1
query = "SELECT setval('users_user_id_seq', :new_id)"
db.session.execute(query, {'new_id': max_id + 1})
db.session.commit()
if __name__ == '__main__':
# Call connect_db function & pass in Flask app
connect_db(app)
# Create tables
db.create_all()
# Call function to load books to db.
# load_book_info()
#Call function to get/load goodreads_id to title db.
# get_goodreads_book_id()
# get_user_reviews()
# extract users from goodreads book and add to db.
# load_goodreads_userinfo()
# Get user ratings for public domain books from specific users and load them to db
load_user_ratings()