import os
import pickle
import re
import time
import unicodedata
import urllib.parse

from bs4 import element

# Shared helpers and constants (get_soup, get_clean_text, get_archived, get_orig_url,
# find_all_stripped, BookSummary, SLEEP, BAD_STARTS, ANALYSIS, the RE_* patterns, etc.)
# are defined elsewhere in these scraping scripts.


def is_paywall(link):
    soup = get_soup(link)
    for p in soup.find_all('p'):
        p_text = get_clean_text(p)
        if p_text.startswith('NOTICE: Unfortunately'):
            return True
    return False
def get_pages_titles(index_pages, books_list, title_set=None):
    book_pages = []
    titles = []
    for page_link in sorted(index_pages):  # get book pages
        soup_page = get_soup(page_link)
        book_links = soup_page.find_all('p')  # 1 p element has 1 or more book listings
        for b in book_links:
            for elem in b.find_all('a'):
                href = elem.get('href')
                basename = href.rsplit('/', 1)[-1]
                if href.endswith('.asp') and 'notes' not in basename and 'first.asp' != basename:
                    title = get_clean_text(elem)
                    title = title.replace('Downloadable/Printable Version', '')
                    title = title.replace('\xa0', '')  # assumed: strip non-breaking spaces from the anchor text
                    if not title or title == 'Quotes' or title == 'Quotations' or title.startswith('Read the'):
                        continue
                    if title_set and title not in title_set:
                        continue
                    book_pages.append(href)
                    titles.append(title)
    book_pages = get_absolute_links(book_pages, books_list)
    return book_pages, titles
def process_paragraphs(ps):
    paragraphs = []
    for p in ps:
        if not p:
            continue
        para = get_clean_text(p, strip=False).strip()
        if para == 'Notes':
            break
        if not para or any([para.startswith(x) for x in BAD_STARTS]) or p.name == 'b':
            continue
        a = p.find('a')
        if a and a.get('href'):
            continue
        if p.find('i'):
            i_text = p.find('i').get_text(strip=True)
            if len(i_text) >= .9 * len(para):
                continue
        b_text = None
        b = p.find('b')
        if b:
            b_text = b.get_text(strip=True)
            if b_text == ', ':
                b_text = None
            elif b_text == 'Om':
                # fix for http://www.pinkmonkey.com/booknotes/monkeynotes/pmSiddhartha20.asp
                para = para.replace('Om', ' Om')
                b_text = None
        if p.name == 'h4' or b_text:
            break  # reached another section's paragraphs
        para = para.replace('Õ', "'")  # replace weird apostrophe (assumed: mis-encoded right single quote)
        paragraphs.append(para)
    return paragraphs
def get_author(soup):
    # written_by = soup.find(class_='subnav__writtenby')
    # try:
    #     return written_by.find('a').text.strip()
    # except AttributeError as e:
    #     print(e, 'in get_author')
    #     return ''
    written_by = soup.find(class_='TitleHeader_authorLink') or soup.find(
        class_='TitleHeader_authorName')
    return get_clean_text(written_by)
def get_title_url_map(books_list, title_set=None):
    soup = get_soup(books_list, sleep=SLEEP)
    # book_links = soup.find('table', class_='views-table cols-2').find_all('a')
    book_links = soup.find('table', class_='cols-2').find_all('a')
    title_url_map = {}
    for link in book_links:
        title = get_clean_text(link).replace(' Study Guide', '')
        if title_set and title not in title_set:
            continue
        link = link.get('href')
        title_url_map[title] = urllib.parse.urljoin(books_list, link)
    return title_url_map
def get_title_url_map(books_list, title_set=None):
    soup = get_soup(books_list)
    columns = soup.find_all('table', width=None)[1].find_all('table')
    title_url_map = {}
    for column in columns:
        cells = column.find_all('tr')
        for cell in cells:
            p = cell.find('p')
            entries = p.find_all('a')
            for entry in entries:
                title = get_clean_text(entry)
                if title_set and title not in title_set:
                    continue
                href = entry.get('href')
                title_url_map[title] = urllib.parse.urljoin(books_list, href)
    return title_url_map
def process_paragraphs(ps):
    paragraphs = []
    for p in ps:
        if not p:
            continue
        para = get_clean_text(p)
        if para == 'Interpretation':
            break
        if not para:
            continue
        # if p.find('i') and p.find('i').get_text(strip=True):
        #     continue
        if p.find('b') and p.find('b').get_text(strip=True):
            break  # reached another section's paragraphs
        paragraphs.append(para)
    return paragraphs
def process_plot(link):
    plot_summ = []
    soup = get_soup(link, sleep=SLEEP)
    content = soup.find('div', id='content-content')
    paras = content.find_all('p')
    for p in paras:
        text = get_clean_text(p, strip=False)
        bold = p.find(['b', 'strong'])
        if bold:
            if bold.get_text() == 'Analysis':
                break
            sibs = list(bold.next_siblings)
            if sibs:
                text = str(sibs[-1])
            else:
                continue
        if p and not text.startswith('Log in'):
            plot_summ.append(text)
    return plot_summ
def get_summaries(title_url_map, out_name, use_pickled=False, archived=False,
                  update_old=False, save_every=5, sleep=0):
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()

    for title, url in title_url_map.items():  # iterate through books
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        author = ''  # TODO: figure this out
        soup = get_soup(url)
        contents = soup.find('table', id='Table56')
        if contents:
            idx = 3
        else:
            contents = soup.find('table', width='99%')
            idx = 4
        if not contents:
            print('table of contents not found on', url)
            continue
        cells = contents.find('tbody').find_all('tr', recursive=False)[idx].find_all('a')
        cells = [x for x in cells if num_in(get_clean_text(x))]
        if not cells:
            print('no chapters found for', url)
            continue
        sects = []
        for c in cells:  # iterate through sections
            text = get_clean_text(c)
            if 'Interpretation' in text:
                continue
            href = c['href']
            link_summ = urllib.parse.urljoin(url, href)
            if archived:
                if '/' not in href:
                    orig_url = urllib.parse.urljoin(get_orig_url(url), href)
                else:
                    orig_url = get_orig_url(href)
                link_summ = get_archived(orig_url, update_old)
            paras = process_chapter(link_summ)
            if not paras:
                print('no summaries found on', link_summ)
                continue
            text = standardize_section_titles(text)
            sects.append((text, paras))
        book_summ = BookSummary(title=title, author=author, genre=None,
                                plot_overview=None, source='bookwolf',
                                section_summaries=sects)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))

    print('Scraped {} books from bookwolf'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def process_next_link(link, archived, update_old):
    soup = get_soup(link)
    chapters = find_all_stripped('a', soup, RE_CHAP)
    if 'pmEthanFrome' in link:
        chapters += soup.find_all('a', text=RE_OPEN)
    elif 'pmDubliners' in link:
        h3s = soup.find_all('h3')
        for h3 in h3s:
            if h3.text.startswith('Short Story'):
                chapters = h3.find_next_sibling('p').find_all('a')
    elif 'wutherg' in link:
        if chapters[-3]['href'] != 'wutherg47.asp':
            chapters[-3]['href'] = 'wutherg47.asp'
    elif 'pmJungle' in link:
        if chapters[3]['href'] != 'pmJungle20.asp':
            chapters[3]['href'] = 'pmJungle20.asp'
        if chapters[9]['href'] != 'pmJungle31.asp':
            chapters[9]['href'] = 'pmJungle31.asp'
    if not chapters:
        return None

    section_summs = []
    url_title_map = {}
    seen_urls = set()
    for c in chapters:
        href = c.get('href')
        title = get_clean_text(c)
        title = title if 'pmBabbitt' not in link else ''
        url = urllib.parse.urljoin(link, href)
        orig_url = url
        if 'dpbolvw' in url:
            continue
        dead_links1 = set(['pmVanity'])
        dead_links2 = set(['pmPrincePauper', 'pmIdiot', 'pmFatherSon', 'pmGreenwood', 'pmOfHuman'])
        dead_links3 = set(['pmDeerSlayer', 'pmTypee'])
        is_dead1 = any(x in orig_url for x in dead_links1)
        is_dead2 = any(x in orig_url for x in dead_links2)
        is_dead3 = any(x in orig_url for x in dead_links3)
        if is_dead1 or is_dead2 or is_dead3:
            # http://www.pinkmonkey.com:80/booknotes/monkeynotes/pmIdiot16.asp and up pages are dead
            # likewise for other strings
            page_no = int(re.findall(r'\d+', orig_url)[-1])
            if is_dead1 and page_no >= 17:
                continue
            elif is_dead2 and page_no >= 16:
                continue
            elif is_dead3 and page_no >= 13:
                continue
        if orig_url in seen_urls:
            continue
        if archived:
            orig_url = urllib.parse.urljoin(get_orig_url(link), c.get('href'))
            url = get_archived(orig_url, update_old)
        url_title_map[url] = title
        seen_urls.add(orig_url)

    for url, title in url_title_map.items():
        summs = process_story(url, title)
        for summ in summs:
            # print(' ', summ[0])
            if summ[1]:  # not empty text
                section_summs.append(summ)

    # manual fixes
    extra_sections = []
    if 'pmWinesburg' in link:
        extra_sections = ["pmWinesburg20.asp", "pmWinesburg21.asp", "pmWinesburg22.asp"]
    elif 'pmDubliners' in link:
        extra_sections = ["pmDubliners12.asp", "pmDubliners16.asp"]  # pmDubliners57.asp has no "Summary" heading, so skip
    if extra_sections:
        if archived:
            links_addtl = [
                get_archived(urllib.parse.urljoin(get_orig_url(link), href), update_old)
                for href in extra_sections
            ]
        else:
            links_addtl = [urllib.parse.urljoin(link, x) for x in extra_sections]
        sect_summs_addtl = [process_story(x) for x in links_addtl]
        sect_summs_addtl = [x[0] for x in sect_summs_addtl]
        section_summs.extend(sect_summs_addtl)
    return section_summs
def process_story(link, title=None, get_next=True, find_continued=False):
    """ returns tuples of (title, summary list) format """
    soup = get_soup(link)
    chapters = []
    if find_continued:
        lines = find_all_stripped(['p', 'h4'], soup, RE_SUMM_CONTINUED)
        if not lines:
            return []
    ### specific edge cases
    elif 'WhiteFang' in link:
        lines = find_all_stripped(['p', 'h4'], soup, RE_CHAP) + \
            find_all_stripped(['p', 'h4'], soup, RE_SUMM)
    elif 'Ulysses' in link:
        lines = find_all_stripped('p', soup, RE_SUMM_3)
    elif 'pmKidnapped16' in link:
        find_all_stripped(['p', 'h4'], soup, RE_SUMM)[0].extract()
        lines = find_all_stripped(['p', 'h4'], soup, RE_CHAP)
    ###
    else:
        lines = find_all_stripped(['p', 'h4'], soup, RE_SUMM) or \
            find_all_stripped(['p', 'h4'], soup, RE_SUMM_2) or \
            find_all_stripped(['p', 'h4'], soup, RE_CHAP)
        lines = [
            x for x in lines
            if (x.find('b') and x.find('b').get_text(strip=True)) or x.name == 'h4'
        ]  # line should be bold
        if not lines or 'barrons/house' in link:
            lines.extend(find_all_stripped(['p', 'h4'], soup, RE_NUMDOT))
    if not lines:
        print(' cannot find section titles on', link)
        return []
    if 'pmFrankenstein10' in link:
        lines = lines[1:]
    frank_cond = 'pmFrankenstein' in link and not any(
        get_clean_text(lines[0]).startswith(x) for x in ('Summary', 'LETTER'))
    if 'barrons/heartdk' in link or frank_cond:
        lines = [lines[0].find_next('p')]

    for line in lines:
        if len(lines) > 1 or not title:
            title_ = line if not re.match(RE_SUMM, get_clean_text(line)) else line.find_previous('p')
            title_ = get_clean_text(title_)
        else:
            title_ = title
        if 'pmIdiot' in link or 'pmSecretSharer' in link:
            ps = line.find_all_next(['p', 'b'])
        elif 'wutherg' in link or 'Ulysses' in link:
            ps = []
            indiv_strs = []
            for sib in line.next_siblings:
                if sib.name == 'p':
                    if indiv_strs:
                        p = element.Tag(name='p')
                        p.string = ' '.join(indiv_strs)
                        ps.append(p)
                        indiv_strs = []
                    ps.append(sib)
                elif isinstance(sib, element.NavigableString) and \
                        not (sib.endswith("Barron's Booknotes\n") or sib.startswith("MonkeyNotes")):
                    indiv_strs.append(sib)
            if indiv_strs:
                p = element.Tag(name='p')
                p.string = ' '.join(indiv_strs)
                ps.append(p)
        else:
            ps = line.find_all_next(['p', 'h4'])
        paragraphs = process_paragraphs(ps)
        chapters.append((title_, paragraphs))

    if 'junglex' in link:  # this should be moved to manual_fix_individual()
        assert chapters[3][0] == 'CHAPTER 17'
        assert chapters[7][0] == 'CHAPTER 18'
        clean_scene = lambda x: re.sub(r'SCENE \d', '', x, 1)
        chapter17 = [
            *chapters[3][1],
            clean_scene(chapters[4][0]), *chapters[4][1],
            clean_scene(chapters[5][0]), *chapters[5][1],
            clean_scene(chapters[6][0])
        ]
        del chapters[6]
        del chapters[5]
        del chapters[4]
        chapters[3] = (chapters[3][0], chapter17)

    if get_next and chapters:  # check next page if summary is continued there
        next_elem = soup.find('a', text=RE_NEXT)
        if next_elem:
            next_link = urllib.parse.urljoin(link, next_elem['href'])
            chapters2 = process_story(next_link, get_next=get_next, find_continued=True)
            if chapters2 and len(chapters2) == 1:
                title1, paragraphs1 = chapters.pop(-1)
                title2, paragraphs2 = chapters2[0]
                chapters.append((title1, paragraphs1 + paragraphs2))
    return chapters
def get_summaries(title_url_map, out_name, use_pickled=False, archived=False,
                  update_old=False, save_every=5, sleep=0):
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()

    for title, url in title_url_map.items():
        title = title.replace("DeerSlayer", 'Deerslayer', 1)
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        author = ''  # TODO: figure this out
        archived_local = archived
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        soup = get_soup(url, sleep=SLEEP)
        table = soup.find('div', id='block-booknavigation-3') or soup.find('div', id='block-block-4')

        # process plot summary
        plot_summ = None
        plot_cell = table.find('a', href=RE_PLOT_LINK)
        if plot_cell:
            plot_title = plot_cell.get_text()
            href = plot_cell['href']
            if archived:
                plot_link = get_orig_url(href)
                plot_link = get_archived(plot_link, update_old)
                if 'archive.org' not in plot_link:
                    # failed to retrieve archived version
                    # archived versions of 'the-mayor-of-casterbridge' seem to be corrupted
                    time.sleep(5.0)
                    archived_local = False
            else:
                plot_link = urllib.parse.urljoin(url, href)
            if 'Chapter' not in plot_title:
                plot_summ = process_plot(plot_link)
            if not plot_summ:
                print(' no plot summary found', plot_link)

        # process section summaries
        cells = table.find_all('a', href=RE_SUMM_LINK)
        if title == "The Brothers Karamazov":
            cells = sort_cells(cells)
        section_summs = []
        if not cells:
            print(' no section links found for', url)
            continue
        seen_sects = set()
        for c in cells:
            section_title = get_clean_text(c)
            section_title_chap = section_title.rsplit(':', 1)[-1]
            if section_title_chap in seen_sects:
                print(' seen {} already, skipped'.format(section_title_chap))
                continue
            if re.match(RE_PLOT, section_title):
                continue
            if archived and archived_local:
                link_summ = get_orig_url(c['href'])
                link_summ = get_archived(link_summ, update_old)
            else:
                link_summ = urllib.parse.urljoin(url, c['href'])
            try:
                page_summs = process_story(link_summ)
            except AttributeError:
                # page failed to load, try again
                print(' retrying after 5 seconds...')
                time.sleep(5.0)
                page_summs = process_story(link_summ)
            if page_summs:
                section_summs.extend(page_summs)
            seen_sects.add(section_title_chap)
        if not section_summs:
            print(' could not find summaries for {}'.format(title))
            continue
        book_summ = BookSummary(title=title, author=author, genre=None,
                                plot_overview=plot_summ, source='novelguide',
                                section_summaries=section_summs)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))

    print('Scraped {} books from novelguide'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def process_story(link, title=None):
    link = link.replace('http://www.novelguide.com', 'https://www.novelguide.com', 1)
    chapters = []
    soup = get_soup(link, sleep=SLEEP)
    if 'mansfield-park/' in link or 'jude-the-obscure' in link:
        content = soup.find('div', class_='content clear-block')
        paras = content.find_all(['p', 'strong', 'div'])[2:]
    else:
        content = soup.find('div', id='content-content')
        paras = content.find_all('p')
    if link.endswith('the-adventures-of-tom-sawyer/novel-summary'):
        initial = paras[1].children.__next__()
        initial.insert_before(paras[0])
    sect_summ = []
    title = get_title(soup)
    break_found = False
    write = True
    if 'ivan-fyodorovich' in link:
        # this page from The Brothers Karamazov is different from the others
        texts = [p.text for p in paras]
        summs = colon_section(texts, title)
        summs[9] = (summs[9][0], summs[9][1][:-7])
        chapters.extend(summs)
    else:
        for p in paras:
            text = get_clean_text(p, strip=False).strip()
            if not text or text.startswith('Log in'):
                continue
            br = p.find_all('br')
            if any(x in link for x in NONBOLD_WITH_SECTIONS):
                texts = list(p.stripped_strings)
                chapters.extend(other_section(texts, title))
            elif any(x in link for x in set([
                    'ulysses', 'siddhartha', 'awakening', 'brothers-karamazov',
                    'tess-of', 'the-ambass', 'jekyll', 'heart-of-darkness', 'winesburg'])):
                texts = list(p.stripped_strings)
                chapters.extend(other_section(texts, title, always_write=True))
            elif any(x in link for x in set(['monte-cristo'])):
                texts = list(p.stripped_strings)
                chapters.extend(colon_section(texts, title))
            elif (len(br) > 3 or re.match(RE_CHAP_OPEN, p.get_text()) or
                    any(x in link for x in BREAK_TITLES)) and \
                    'fathers-and-sons' not in link and 'hound' not in link:
                break_found = True
                chapters.extend(process_chapters(p, title))
                title = list(p.stripped_strings)[0]
            else:  # for sections where the text is in multiple <p> tags
                if text == 'advertisement' and 'the-awakening' not in link:
                    break
                elif text == 'advertisement':
                    continue
                bold = p if p.name == 'strong' else p.find(['b', 'strong'])
                if bold:
                    write = True
                    bold_text = bold.get_text(strip=True)
                    is_summ = re.match(RE_PLOT, bold_text)
                    if any(bold_text.startswith(x) for x in ANALYSIS):
                        write = False
                        if sect_summ:
                            chapters.append((title, sect_summ))
                            sect_summ = []
                        continue
                    elif not is_summ:
                        if sect_summ:
                            chapters.append((title, sect_summ))
                        title = bold_text if not is_summ else title
                        sect_summ = []
                    sibs = list(bold.next_siblings)
                    if write and sibs:
                        sibs = [x.strip() for x in sibs if isinstance(x, str)]
                        text = ' '.join(sibs).strip()
                        sect_summ.append(text)
                elif text == 'Analysis':
                    write = False
                    continue
                else:
                    if write:
                        sect_summ.append(text)
    if not break_found and sect_summ:
        chapters.append((title, sect_summ))
    for i, chapter in enumerate(chapters):
        norm = [unicodedata.normalize("NFKD", p).strip() for p in chapter[1]]
        norm = [x for x in norm if x]
        chapters[i] = (chapters[i][0], norm)
    return chapters
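
# Usage sketch (not part of the original scripts): a minimal driver, assuming the index
# URL and output path below, showing how the helpers are typically wired together --
# build the title -> URL map from a source's index page, then scrape each book's
# summaries and pickle the results. BOOKS_LIST_URL and OUT_PICKLE are hypothetical
# placeholders.
if __name__ == '__main__':
    BOOKS_LIST_URL = 'https://www.novelguide.com/novelguides'  # hypothetical index page
    OUT_PICKLE = 'novelguide_summaries.pk'                      # hypothetical output path

    title_url_map = get_title_url_map(BOOKS_LIST_URL)
    book_summaries = get_summaries(
        title_url_map,
        OUT_PICKLE,
        use_pickled=True,   # resume from a partial pickle if one exists
        archived=False,     # scrape live pages rather than archive.org snapshots
        save_every=5,       # checkpoint the pickle every 5 books
    )
    print('collected summaries for {} books'.format(len(book_summaries)))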