Example #1
import csv

import goodreads  # assumed local helper module wrapping the Goodreads API (not shown here)


def get_book_info(book_id, genre):
    book_obj = goodreads.get_object('book', book_id)
    book_info = dict(book_id=book_id)
    dict_fields = ['title', 'image_url', 'publisher',
                   'num_pages', 'link', 'isbn', 'isbn13',
                   'publication_year', 'publication_month', 'publication_day']
    # check whether any of the basic fields is missing
    has_empty = False
    for field in dict_fields:
        if book_obj[field] is None:
            has_empty = True
        book_info[field] = book_obj[field]
    work = book_obj['work']
    # check that this work's best book is the current book
    if work['best_book_id']['#text'] == book_id:
        work_dict_fields = ['ratings_sum', 'ratings_count',
                              'original_publication_year', 'original_publication_month', 'original_publication_day']
        for field in work_dict_fields:
            if '#text' in work[field]:
                book_info[field] = work[field]['#text']
    if (has_empty and book_info['link'] != '') or 'nophoto/book/111' in book_info['image_url']:
        # update in 2019 -- not working anymore; check the missing information manually
        # parse missing data from the html page
        # data = parse_html_page(book_info['link'],
        #         book_info['publication_year'],
        #         book_info['publication_month'],
        #         book_info['publication_day'],
        #         book_info['publisher'],
        #         book_info['num_pages'],
        #         book_info['image_url']
        #     )
        # book_info.update(data)
        pass
    # no genre info: infer it from the popular shelves
    if genre == '':
        shelves = list(map(lambda x: x['@name'], book_obj['popular_shelves']['shelf']))
        if ('non-fiction' in shelves) and ('fiction' in shelves):
            book_info['genre'] = 'Nonfiction' if shelves.index('non-fiction') < shelves.index('fiction') else 'Fiction'
        elif ('nonfiction' in shelves) or ('non-fiction' in shelves):
            book_info['genre'] = 'Nonfiction'
        elif ('novels' in shelves) or ('fiction' in shelves):
            book_info['genre'] = 'Fiction'

    return book_info

#load data from the dataset collected manually
start_year = 2019
end_year = 2019
with open('csv/goodreads-ids.csv', newline='', encoding='latin-1') as f:
    data = []
    for row in csv.DictReader(f):
        print(row['year'], row['book_title'], row['book_id'])
        if int(row['year']) <= start_year and int(row['year']) >= end_year:
            print('--get book info')
            datum_base = row
            datum = get_book_info(row['book_id'], row['genre'])
            datum_base.update(datum)
            data.append(datum_base)
    print(data)
    goodreads.save_as_csv('book-info', data)
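
All of these snippets call into a local goodreads helper module that is not shown. As a point of reference only, here is a minimal sketch of what get_object and save_as_csv might look like, assuming the (now retired) Goodreads XML API together with requests and xmltodict; the endpoint URL, the GOODREADS_KEY constant, and the csv/ output path are assumptions, not part of the original code.

import csv

import requests
import xmltodict

GOODREADS_KEY = 'your-api-key'  # assumption: key stored here or read from the environment


def get_object(kind, object_id):
    # e.g. kind='book' -> https://www.goodreads.com/book/show/<id>.xml (assumed endpoint)
    url = 'https://www.goodreads.com/{}/show/{}.xml'.format(kind, object_id)
    res = requests.get(url, params={'key': GOODREADS_KEY})
    res.raise_for_status()
    # xmltodict produces the nested dicts the examples index into ('#text', '@name', ...)
    return xmltodict.parse(res.text)['GoodreadsResponse'][kind]


def save_as_csv(name, rows):
    # write a list of dicts to csv/<name>.csv, using the union of their keys as the header
    fieldnames = []
    for row in rows:
        for key in row:
            if key not in fieldnames:
                fieldnames.append(key)
    with open('csv/{}.csv'.format(name), 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)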
Example #2
    #
    print(author_info)
    return author_info

# get author ids
author_ids = []
with open('csv/goodreads-ids.csv', newline='', encoding='latin-1') as f:
    for row in csv.DictReader(f):
        if row['author_id'] not in author_ids:
            author_ids.append(row['author_id'])
        if row['author2_id'] != '' and row['author2_id'] not in author_ids:
            author_ids.append(row['author2_id'])
# check which author ids have already been parsed
already_parsed = []
with open('csv/author-info.csv', newline='', encoding='utf-8', errors='ignore') as f:
    for row in csv.DictReader(f):
        already_parsed.append(row['author_id'])

# get newly added author info
data = []
for author_id in author_ids:
    if author_id not in already_parsed:
        print('--get author info', author_id)
        if has_manual_data:
            datum = update_author_info(author_id)
        else:
            datum = get_author_info(author_id)
        data.append(datum)

goodreads.save_as_csv('author-info', data)
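
Example #2 begins mid-function, and the loop above references get_author_info (and an update_author_info variant gated by has_manual_data) without showing either. A hedged sketch of what get_author_info might look like on top of the assumed goodreads.get_object helper; the list of author fields kept here is an assumption.

def get_author_info(author_id):
    # fetch the author record and keep a flat subset of its fields (field list is assumed)
    author_obj = goodreads.get_object('author', author_id)
    author_info = dict(author_id=author_id)
    for field in ['name', 'gender', 'hometown', 'born_at', 'died_at']:
        author_info[field] = author_obj.get(field)
    print(author_info)
    return author_info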
Example #3
    # no genre info: infer it from the popular shelves
    if genre == '':
        shelves = list(
            map(lambda x: x['@name'], book_obj['popular_shelves']['shelf']))
        if ('non-fiction' in shelves) and ('fiction' in shelves):
            book_info['genre'] = 'Nonfiction' if shelves.index(
                'non-fiction') < shelves.index('fiction') else 'Fiction'
        elif ('nonfiction' in shelves) or ('non-fiction' in shelves):
            book_info['genre'] = 'Nonfiction'
        elif ('novels' in shelves) or ('fiction' in shelves):
            book_info['genre'] = 'Fiction'

    return book_info


#load data from the dataset collected manually
start_year = 2018
end_year = 2018
with open('csv/goodreads-ids.csv', newline='', encoding='latin-1') as f:
    data = []
    for row in csv.DictReader(f):
        print(row['year'], row['book_title'], row['book_id'])
        if int(row['year']) <= start_year and int(row['year']) >= end_year:
            print('--get book info')
            datum_base = row
            datum = get_book_info(row['book_id'], row['genre'])
            datum_base.update(datum)
            data.append(datum_base)
    print(data)
    goodreads.save_as_csv('book-info', data)
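Example #4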
    if search_count > 0:
        if search_count == 1:
            r = obj['GoodreadsResponse']['search']['results']['work'][
                'best_book']
        else:
            r = find_best_match(
                obj['GoodreadsResponse']['search']['results']['work'], author,
                title, r)
    if r != '__N/A__':
        book_id = r['id']['#text']
        author_id = r['author']['id']['#text']
    else:
        print('----- no matching author or title id', author, title)
    return dict(book_id=book_id, author_id=author_id)


#load data from the dataset collected manually
start_year = 2018
end_year = 2018
with open('csv/nytimes-best-books.csv', newline='', encoding='latin-1') as f:
    data = []
    for row in csv.DictReader(f):
        print(row['year'], row['author_name'], row['book_title'])
        if int(row['year']) <= start_year and int(row['year']) >= end_year:
            datum_base = row
            datum = search_by_author_and_book(row['book_title'],
                                              row['author_name'])
            datum_base.update(datum)
            data.append(datum_base)
    goodreads.save_as_csv('goodreads-ids', data)
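
The search fragments in Examples #4, #5, and #6 pass the raw search results through a find_best_match helper that is not shown. One way it could work is to score each candidate's best_book by title and author similarity and keep the best one above a threshold; the difflib-based scoring, the 0.8 cutoff, and the fallback argument are all assumptions in this sketch.

import difflib


def find_best_match(works, author, title, fallback='__N/A__'):
    # works: the list of <work> entries from the search response, each holding a best_book dict
    best, best_score = fallback, 0.0
    for work in works:
        candidate = work['best_book']
        title_score = difflib.SequenceMatcher(
            None, title.lower(), candidate['title'].lower()).ratio()
        author_score = difflib.SequenceMatcher(
            None, author.lower(), candidate['author']['name'].lower()).ratio()
        score = (title_score + author_score) / 2
        # keep the highest-scoring candidate, but only if it clears the (assumed) threshold
        if score > best_score and score > 0.8:
            best, best_score = candidate, score
    return best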
Example #5
    if search_count > 0:
        if search_count == 1:
            r = obj['GoodreadsResponse']['search']['results']['work'][
                'best_book']
        else:
            r = find_best_match(
                obj['GoodreadsResponse']['search']['results']['work'], author,
                title, r)
    if r != '__N/A__':
        book_id = r['id']['#text']
        author_id = r['author']['id']['#text']
    else:
        print('----- no matching author or title id', author, title)
    return dict(book_id=book_id, author_id=author_id)


#load data from the dataset collected manually
start_year = 2014
end_year = 2017
with open('data/NYT_fiction.csv', newline='') as f:
    data = []
    for row in csv.DictReader(f):
        print(row['year'], row['author_name'], row['book_title'])
        if int(row['year']) >= start_year and int(row['year']) <= end_year:
            datum_base = row
            datum = search_by_author_and_book(row['book_title'],
                                              row['author_name'])
            datum_base.update(datum)
            data.append(datum_base)
    goodreads.save_as_csv('fiction_goodreads-ids', data)
Example #6
    book_id = ""
    author_id = ""
    if search_count > 0:
        if search_count == 1:
            r = obj["GoodreadsResponse"]["search"]["results"]["work"]["best_book"]
        else:
            r = find_best_match(
                obj["GoodreadsResponse"]["search"]["results"]["work"], author, title, r
            )
    if r != "__N/A__":
        book_id = r["id"]["#text"]
        author_id = r["author"]["id"]["#text"]
    else:
        print("----- no matching author or title id", author, title)
    return dict(book_id=book_id, author_id=author_id)


# load data from the dataset collected manually
start_year = 2019
end_year = 2019
with open("csv/nytimes-best-books.csv", newline="", encoding="latin-1") as f:
    data = []
    for row in csv.DictReader(f):
        print(row["year"], row["author_name"], row["book_title"])
        if int(row["year"]) <= start_year and int(row["year"]) >= end_year:
            datum_base = row
            datum = search_by_author_and_book(row["book_title"], row["author_name"])
            datum_base.update(datum)
            data.append(datum_base)
    goodreads.save_as_csv("goodreads-ids", data)