def get_author_info(author_id):
    """Fetch an author from Goodreads and split birth/death dates into parts.

    Args:
        author_id: Goodreads author id passed to ``goodreads.get_object``.

    Returns:
        dict with ``author_id``, ``name`` (whitespace-normalized), ``link``,
        ``gender``, ``born_at``, ``died_at`` and six ``birth_*`` / ``death_*``
        keys (year, month, day). The date parts are strings produced by
        ``strftime`` or ``None`` when the corresponding date is missing.

    Raises:
        ValueError: if a present date is not in ``%Y/%m/%d`` format.
    """
    author_obj = goodreads.get_object('author', author_id)
    author_info = dict(author_id=author_id)

    # 'name' and 'link' are read further down, so they have to be copied too;
    # leaving them out made every call fail with KeyError.
    dict_fields = ['name', 'link', 'gender', 'born_at', 'died_at']
    for field in dict_fields:
        author_info[field] = author_obj[field]

    # often there's multiple spaces between first and last name
    author_info['name'] = re.sub(r'\s+', ' ', author_info['name']).strip()

    date_parts = (('year', '%Y'), ('month', '%m'), ('day', '%d'))
    date_sources = (('birth', 'born_at'), ('death', 'died_at'))

    # Every date part defaults to None so the keys always exist.
    for prefix, _ in date_sources:
        for part, _ in date_parts:
            author_info[prefix + '_' + part] = None

    if author_obj['born_at'] is None:
        # Update birth info from the web pages.
        # NOTE(review): parse_html_page is defined elsewhere; presumably it
        # returns a '%Y/%m/%d' string or None -- confirm.
        author_info['born_at'] = parse_html_page(author_info['link'],
                                                 author_info['born_at'])
        # show possible birth date from Wikipedia (return value is not used)
        search_wiki(author_info['name'])

    for prefix, source in date_sources:
        raw_date = author_info[source]
        if raw_date is None:
            continue
        # Parse once, then format each component.
        parsed = datetime.strptime(raw_date, '%Y/%m/%d')
        for part, fmt in date_parts:
            author_info[prefix + '_' + part] = parsed.strftime(fmt)

    return author_info
def update_author_info(author_id):
    """Fetch the basic display fields of a Goodreads author.

    Args:
        author_id: Goodreads author id passed to ``goodreads.get_object``.

    Returns:
        dict with ``author_id``, ``name``, ``image_url`` and ``link``.
    """
    author_obj = goodreads.get_object('author', author_id)
    author_info = dict(author_id=author_id)
    # Progress output: at this point the dict only holds the author id.
    print(author_info)
    dict_fields = ['name', 'image_url', 'link']
    for field in dict_fields:
        author_info[field] = author_obj[field]
    # The collected dict was previously dropped (implicit None return).
    return author_info
def get_book_info(book_id, genre):
    """Collect publication and rating data for one Goodreads book.

    Args:
        book_id: Goodreads book id; compared as-is with the work's
            ``best_book_id`` text, so it should be the same type (the caller
            below passes the string read from the CSV).
        genre: genre already known for the book; when it is ``''`` a genre is
            inferred from the book's popular shelves.

    Returns:
        dict with ``book_id``, the copied book fields, the work-level rating
        and original-publication fields (only when this book is the work's
        best book and the field carries text) and, when it could be inferred,
        ``genre`` ('Fiction' or 'Nonfiction').
    """
    book_obj = goodreads.get_object('book', book_id)
    book_info = dict(book_id=book_id)
    dict_fields = ['title', 'image_url', 'publisher', 'num_pages', 'link',
                   'isbn', 'isbn13', 'publication_year', 'publication_month',
                   'publication_day']
    for field in dict_fields:
        book_info[field] = book_obj[field]

    work = book_obj['work']
    # check if the work is same as the books
    if work['best_book_id']['#text'] == book_id:
        work_dict_fields = ['ratings_sum', 'ratings_count',
                            'original_publication_year',
                            'original_publication_month',
                            'original_publication_day']
        for field in work_dict_fields:
            if '#text' in work[field]:
                book_info[field] = work[field]['#text']

    # update in 2019 -- the HTML-page fallback for missing fields is not
    # working anymore; missing information has to be checked manually. The
    # former `if hasEmpty ...:` guard had no statements left in its body, so
    # it is removed together with the disabled parse_html_page call.

    # no genre info
    if genre == '':
        shelves = [shelf['@name']
                   for shelf in book_obj['popular_shelves']['shelf']]
        if ('non-fiction' in shelves) and ('fiction' in shelves):
            # Both shelves present: whichever is listed first wins.
            if shelves.index('non-fiction') < shelves.index('fiction'):
                book_info['genre'] = 'Nonfiction'
            else:
                book_info['genre'] = 'Fiction'
        elif ('nonfiction' in shelves) or ('non-fiction' in shelves):
            book_info['genre'] = 'Nonfiction'
        elif ('novels' in shelves) or ('fiction' in shelves):
            book_info['genre'] = 'Fiction'
    return book_info


# load data from the dataset collected manually
start_year = 2019
end_year = 2019
with open('csv/goodreads-ids.csv', newline='', encoding='latin-1') as f:
    data = []
    for row in csv.DictReader(f):
        print (row['year'], row['book_title'], row['book_id'])
        # Inclusive range check; the previous form (year <= start_year and
        # year >= end_year) matched nothing unless start_year >= end_year.
        if start_year <= int(row['year']) <= end_year:
            print ('--get book info')
            # The CSV row itself is extended with the fetched fields.
            datum_base = row
            datum = get_book_info(row['book_id'], row['genre'])
            datum_base.update(datum)
            data.append(datum_base)
    print (data)
    goodreads.save_as_csv('book-info', data)
def update_author_info(author_id):
    """Build an author record from Goodreads plus manually curated fields.

    ``name``, ``image_url`` and ``link`` come from the Goodreads author
    object; gender and the birth/death date parts come from the
    ``author_manual`` lookup, keyed by ``author_id``.

    Returns:
        dict holding ``author_id`` followed by the fields listed above.
    """
    api_fields = ('name', 'image_url', 'link')
    # COMMENT the manual part for the first run; it relies on the manually
    # added birth and death dates being available in author_manual.
    manual_fields = ('gender', 'birth_year', 'birth_month', 'birth_day',
                     'death_year', 'death_month', 'death_day')

    author_obj = goodreads.get_object('author', author_id)
    author_info = {'author_id': author_id}
    for key in api_fields:
        author_info[key] = author_obj[key]
    for key in manual_fields:
        author_info[key] = author_manual[author_id][key]
    return author_info
def update_author_info(author_id):
    """Build an author record from the Goodreads author object.

    Copies ``name``, ``image_url`` and ``link`` next to ``author_id`` and
    returns the resulting dict. The manual gender/birth/death override is
    disabled in this version (see the commented lines at the end).
    """
    author_obj = goodreads.get_object('author', author_id)
    author_info = {'author_id': author_id}
    # Progress output: only the id is in the dict at this point.
    print(author_info)
    for key in ('name', 'image_url', 'link'):
        author_info[key] = author_obj[key]

    # First run: keep the following three lines commented. Enable them once
    # birth and death dates have been added manually to author_manual.
    # update_fields = ['gender', 'birth_year', 'birth_month', 'birth_day', 'death_year', 'death_month', 'death_day']
    # for field in update_fields:
    #     author_info[field] = author_manual[author_id][field]
    return author_info
def get_book_info(book_id, genre):
    """Collect publication, rating and genre-shelf data for a Goodreads book.

    Args:
        book_id: Goodreads book id; compared as-is with the work's
            ``best_book_id`` text.
        genre: not used by this version; kept so existing callers keep
            working.

    Returns:
        dict with ``book_id``, the copied book fields, work-level rating
        fields (only when this book is the work's best book and the field
        carries text), ``genres`` (list of matching shelf names) and whatever
        ``parse_html_page`` contributes when the HTML fallback runs.
    """
    book_obj = goodreads.get_object('book', book_id)
    book_info = dict(book_id=book_id)
    dict_fields = [
        'title', 'image_url', 'publisher', 'num_pages', 'link', 'isbn',
        'isbn13', 'publication_year'
    ]

    # check if missing information
    has_empty = False
    for field in dict_fields:
        if book_obj[field] is None:
            has_empty = True
        book_info[field] = book_obj[field]

    work = book_obj['work']
    # check if the work is same as the books
    if work['best_book_id']['#text'] == book_id:
        work_dict_fields = [
            'ratings_sum', 'ratings_count', 'original_publication_year'
        ]
        for field in work_dict_fields:
            if '#text' in work[field]:
                book_info[field] = work[field]['#text']

    # Collect, in shelf order, the first shelves whose name is one of the
    # known genre `keywords` (defined elsewhere in the file). The limit has
    # always been 10, although the old comment and variable name said 5.
    max_genres = 10
    genres = []
    for shelf in book_obj['popular_shelves']['shelf']:
        if shelf['@name'] in keywords:
            genres.append(shelf['@name'])
            if len(genres) == max_genres:
                break
    book_info['genres'] = genres

    # image_url may be None; substitute '' so the placeholder test below
    # cannot raise TypeError.
    image_url = book_info['image_url'] or ''
    if (has_empty and book_info['link'] != '') or 'nophoto/book/111' in image_url:
        # parse missing data from html page
        print('---parse html')
        data = parse_html_page(book_info['link'],
                               book_info['publication_year'],
                               book_info['publisher'],
                               book_info['num_pages'],
                               book_info['image_url'])
        book_info.update(data)

    return book_info
def get_book_info(book_id, genre):
    """Gather publication and rating details for a single Goodreads book.

    Args:
        book_id: Goodreads book id; compared as-is with the work's
            ``best_book_id`` text.
        genre: known genre of the book. An empty string triggers inference
            from the book's popular shelves.

    Returns:
        dict with ``book_id``, the copied book fields, work-level fields (only
        when this book is the work's best book and the field carries text)
        and ``genre`` when one could be inferred.
    """
    book_fields = (
        'title', 'image_url', 'publisher', 'num_pages', 'link', 'isbn',
        'isbn13', 'publication_year', 'publication_month', 'publication_day',
    )
    work_fields = (
        'ratings_sum', 'ratings_count', 'original_publication_year',
        'original_publication_month', 'original_publication_day',
    )

    book_obj = goodreads.get_object('book', book_id)
    book_info = {'book_id': book_id}

    # Copy the plain book fields and remember whether any was missing.
    # The flag is only consumed by the disabled HTML fallback noted below.
    has_empty = False
    for key in book_fields:
        value = book_obj[key]
        book_info[key] = value
        if value is None:
            has_empty = True

    # Work-level data is only taken when this edition is the work's best book.
    work = book_obj['work']
    if work['best_book_id']['#text'] == book_id:
        for key in work_fields:
            entry = work[key]
            if '#text' in entry:
                book_info[key] = entry['#text']

    # update in 2019 -- the HTML-page fallback (parse_html_page) for missing
    # fields is not working anymore; check missing information manually.

    if genre != '':
        return book_info

    # No genre supplied: infer it from the popular shelf names.
    shelves = [shelf['@name'] for shelf in book_obj['popular_shelves']['shelf']]
    if 'non-fiction' in shelves and 'fiction' in shelves:
        # Both present: the shelf listed first decides.
        nonfiction_first = shelves.index('non-fiction') < shelves.index('fiction')
        book_info['genre'] = 'Nonfiction' if nonfiction_first else 'Fiction'
    elif 'nonfiction' in shelves or 'non-fiction' in shelves:
        book_info['genre'] = 'Nonfiction'
    elif 'novels' in shelves or 'fiction' in shelves:
        book_info['genre'] = 'Fiction'
    return book_info
def get_author_info(author_id):
    """Fetch an author's record from Goodreads.

    Args:
        author_id: Goodreads author id passed to ``goodreads.get_object``.

    Returns:
        dict with ``author_id``, ``name`` (whitespace-normalized), ``gender``,
        ``born_at``, ``died_at``, ``link``, ``image_url`` and the six
        ``birth_*`` / ``death_*`` keys (year, month, day), which this version
        always leaves as ``None``.
    """
    author_obj = goodreads.get_object('author', author_id)
    author_info = dict(author_id=author_id)
    dict_fields = ['name', 'gender', 'born_at', 'died_at', 'link', 'image_url']
    for field in dict_fields:
        author_info[field] = author_obj[field]

    # often there's multiple spaces between first and last name
    # (raw string: '\s' in a plain literal is an invalid escape sequence)
    author_info['name'] = re.sub(r'\s+', ' ', author_info['name']).strip()

    # Date parts stay None here and are filled in manually afterwards.
    # 2019 update - wikipedia page doesn't work :( so the former lookup of
    # missing birth dates (parse_html_page / search_wiki) and the splitting
    # of born_at / died_at ('%Y/%m/%d') into year, month, day are disabled.
    for prefix in ('birth', 'death'):
        for part in ('year', 'month', 'day'):
            author_info[prefix + '_' + part] = None

    return author_info