def get_search_ids(search_url):
    """Yield every interactive-story id found across all pages of a search.

    Walks each paginated search-results URL (from all_search_urls) and
    yields the ids scraped from every page, in page order.
    """
    urls = list(all_search_urls(search_url))
    for idx, url in enumerate(urls, start=1):
        print('# Gathering ids from page {}/{}'.format(idx, len(urls)))
        # Original looped `for id in ids: yield id` — `id` shadows the
        # builtin; `yield from` is the idiomatic equivalent.
        yield from get_search_page_interactive_ids(get_page(url))
def all_search_urls(search_url):
    """Generate one search URL per results page listed in the pager dropdown."""
    first_page = get_page(search_url)
    dropdown = first_page.xpath(search_pages_dropdown_xp)[0]
    for option in dropdown.value_options:
        # The site's 'page' query param is 1-based while 'resort_page'
        # is 0-based, so the latter is the option value minus one.
        page_url = re.sub(r'&page=(\d+)', '&page=' + option, search_url)
        page_url = re.sub(r'&resort_page=(\d+)',
                          '&resort_page=' + str(int(option) - 1), page_url)
        yield page_url
def get_recent_chapters(story_id):
    """Return a dict mapping chapter descent link -> posted datetime,
    scraped from the story's recent-chapters page."""
    url = ("https://www.writing.com/main/interactive-story/item_id/"
           + story_id + "/action/recent_chapters")
    page = get_page(url)
    # Keep re-requesting while the site serves its "heavy volume" refusal.
    while hasServerRefusal(page):
        page = get_page(url)
    # Everything up to and including the last '/' of the first link is a
    # common URL prefix; slice it off to keep only the descent suffix.
    cutoff = page.xpath(recent_link_xp)[0].rfind("/") + 1
    chapters = {}
    for element in page.xpath(recent_elements_xp):
        # the descent
        descent = element.xpath(recent_link_xp)[0][cutoff:]
        posted = parse_short_date_time(" ".join(element.xpath(recent_date_xp)))
        chapters[descent] = posted
    return chapters
def get_outline(story_id):
    """Gets a list of all possible chapters for scraping"""
    url = ("https://www.writing.com/main/interact/item_id/"
           + story_id + "/action/outline")
    page = get_page(url)
    # Retry until the "heavy server volume" page goes away.
    while hasServerRefusal(page):
        page = get_page(url)
    links = page.xpath(outline_chapters_xpath)
    # Pull the first href and find its last '/' to cut off the preceding
    # URL prefix; the remainder is the chapter descent.
    cutoff = links[0].attrib['href'].rfind("/") + 1
    return [a_element.attrib['href'][cutoff:] for a_element in links]
def get_story_info(story_id):
    """Scrape a story's info page and build a StoryInfo record.

    Returns:
        StoryInfo on success.
        -1 if the item is private (sentinel kept for backward compatibility).
        False if the item was deleted / not found.
    Raises:
        Whatever the xpath/parse calls raise on an unexpected page layout.
    """
    url = "https://www.writing.com/main/interact/item_id/" + story_id
    page = get_page(url)
    while hasServerRefusal(page):
        page = get_page(url)
    text = page.text_content().lower()
    # Private item: can't access, can not scrape.
    if 'is a private item.' in text:
        return -1
    # Deleted item: can't access, can not scrape.
    if "wasn't found within writing.com" in text:
        return False
    # The original wrapped this in `try/except Exception as e: raise e`,
    # which is a no-op; scraping errors simply propagate either way.
    return StoryInfo(
        id=int(page.xpath(story_id_xp)[0]),
        pretty_title=page.xpath(story_title_xp)[0],
        author_id=page.xpath(story_author_link_xp)[0]
        [len("https://www.Writing.Com/main/portfolio/view/"):],
        author_name=page.xpath(story_author_name_xp)[0],
        description=html.tostring(page.xpath(story_description_xp)[0],
                                  encoding="unicode",
                                  with_tail=False),
        brief_description=page.xpath(story_brief_description_xp)[0],
        created=parse_date_time(page.xpath(story_created_date)[0]
                                + page.xpath(story_created_date)[1]),
        modified=parse_date_time(page.xpath(story_modified_date)[0]
                                 + page.xpath(story_modified_date)[1]),
        image_url=page.xpath(story_image_url_xp)[0],
        last_full_update=None)
def get_chapter(url):
    """Scrape a single interactive chapter page into a ChapterInfo.

    Fixes vs. original: `len(author_name) is 1` compared an int with `is`
    (identity) — that only "works" via CPython small-int caching; use `==`.
    """
    chapter = get_page(url)
    # Did we hit a "heavy server volume" error?
    assertNotServerRefusal(chapter)

    data = ChapterInfo()

    # The first chapter page ('0' or '1' suffix) uses a slightly different
    # layout, so the xpaths carry a format slot selecting the variant.
    suffix = url[url.rfind('/') + 1:]
    xpathformat = '1' if suffix in ('0', '1') else '0'

    def xpath(path):
        return chapter.xpath(path.format(xpathformat))

    # Title
    data.title = encodingBruteForce(xpath(chapter_title_xp)[0])

    # Chapter content
    # Example chapter https://www.writing.com/main/interact/item_id/1924673-Acquiring-Powers-2/map/114331111
    # Hack to convert: take the latin-1 string, convert it back to bytes,
    # and decode it as unicode.
    chapter_text = html.tostring(xpath(chapter_content_xp)[0],
                                 encoding="unicode")
    data.content = encodingBruteForce(chapter_text)

    # Gives 3 results if member isn't deleted, else 1:
    #   [0] interactive's creator
    #   [1] the chapter's author
    #   [2] the chapter's author again, in the copyright link
    author_name = xpath(chapter_member_name_xp)
    if len(author_name) == 1:
        data.is_author_past = True
    else:
        data.author_name = author_name[1].text_content()
        # Grab the title attribute, split by new line, and cut off the
        # leading 'Username: ' (10 characters).
        # NOTE(review): this statement was garbled/redacted in the source;
        # reconstructed from its own comment — confirm against a live page.
        data.author_id = author_name[1].attrib['title'].split("\n")[0][10:]

    # Choices
    for choice in xpath(chapter_choices_xp):
        data.choices.append(encodingBruteForce(choice.text_content()))

    return data
def get_story_info(url, award_banner=False):
    """Scrape a story's landing page into a StoryInfo.

    Pages with an award banner shift the xpaths, which is selected by the
    format slot; on a scraping failure without the banner flag set we retry
    once with it enabled.
    """
    story = get_page(url)
    data = StoryInfo()
    assertNotServerRefusal(story)

    # '1' selects the award-banner xpath variant, '0' the plain one.
    formatter = "1" if award_banner else "0"

    def xpath(path):
        return story.xpath(path.format(formatter))

    try:
        # pretty title
        data.pretty_title = xpath(story_title_xp)[0]
        # author id: the trailing path segment of the portfolio link
        authorlink = xpath(story_author_id_xp)[0].attrib['href']
        data.author_id = authorlink[authorlink.rfind('/') + 1:]
        data.author_name = xpath(story_author_name_xp)[0]
        # description
        data.description = html.tostring(xpath(story_description_xp)[0],
                                         encoding="unicode",
                                         with_tail=False)
        # brief description
        data.brief_description = xpath(story_brief_description_xp)[0]
        # image url
        data.image_url = xpath(
            '//meta[@property="og:image"]')[0].attrib["content"]
    except Exception as e:
        if award_banner:
            # Already retried with the banner layout; give up.
            raise e
        # Assume the award-banner layout and retry once.
        return get_story_info(url, True)
    return data
def get_chapter_list(story_id):
    """Return (descents, names) for every chapter in the story's outline.

    descents: per-chapter URL suffixes (the part after the last '/').
    names: matching chapter titles, run through encodingBruteForce.
    """
    url = ("http://www.writing.com/main/interact/item_id/" + story_id
           + "/action/outline")
    while True:
        outline = get_page(url)
        # Did we hit a "heavy server volume" error?
        # (Original tested `hasServerRefusal(...) is False` — an identity
        # comparison with a bool is fragile; `not` expresses the intent.)
        if not hasServerRefusal(outline):
            break
    descents = []
    names = []
    outline_links = outline.xpath(outline_chapters_xpath)
    # Links default to https now, but I'm too lazy to change it everywhere,
    # so pull the first URL and find the last / to cut off the prefix.
    url_cutoff = outline_links[0].attrib['href'].rfind("/") + 1
    for a_element in outline_links:
        descents.append(a_element.attrib['href'][url_cutoff:])
        names.append(encodingBruteForce(a_element.text_content()))
    return descents, names
def get_chapter(url):
    """Scrape one chapter page into a Chapter record.

    If the content xpath matches nothing, the account's premium "inline
    chapters" mode is assumed to be on; it is disabled via an account
    endpoint and the page re-fetched. Raises ServerRefusal on the site's
    "heavy server volume" page. On any parse error the page is dumped to
    scrapingerror.html and the exception re-raised.
    """
    page = get_page(url)
    if hasServerRefusal(page):
        raise ServerRefusal('Heavy Server Volume')
    # Error: premium mode to inline chapters is on. Disable it and try again.
    if len(page.xpath(chapter_content_xp)) == 0:
        get_page(
            "https://www.writing.com/main/my_account?action=set_q_i2&ajax=setDynaOffOn&val=-1"
        )
        page = get_page(url)
        if hasServerRefusal(page):
            raise ServerRefusal('Heavy Server Volume')
    try:
        # Choices: None when the chapter has no options (a leaf / end page).
        choices = [c.text_content() for c in page.xpath(chapter_choices_xp)]
        if not choices:
            choices = None

        # Author id: trailing segment of the portfolio link, when present.
        if len(page.xpath(chapter_author_link_xp)) != 0:
            author_id = page.xpath(chapter_author_link_xp)[0][
                len("https://www.Writing.Com/main/portfolio/view/"):]
        else:
            author_id = None

        # Author name: primary xpath first, then a fallback that strips a
        # 3-char prefix; an empty fallback string normalizes to None.
        if len(page.xpath(chapter_author_name_xp)) != 0:
            author_name = page.xpath(chapter_author_name_xp)[0]
        elif len(page.xpath(chapter_author_name_xp_2)) != 0:
            author_name = page.xpath(chapter_author_name_xp_2)[0][3:].strip()
            if author_name == "":
                author_name = None
        else:
            author_name = None

        if len(page.xpath(chapter_title_xp)) != 0:
            title = page.xpath(chapter_title_xp)[0]
        else:
            title = None

        chapter = Chapter(
            title=title,
            id=int(page.xpath(chapter_id_xp)[0]),
            content=html.tostring(page.xpath(chapter_content_xp)[0],
                                  encoding="unicode"),
            author_id=author_id,
            author_name=author_name,
            choices=choices,
            created=parse_date(page.xpath(chapter_created_date_xp)[0]))
    except Exception as e:
        print("Scraping error at " + url)
        with open('scrapingerror.html', 'w', encoding='utf-8') as o:
            # BUG FIX: lxml's html.tostring returns bytes by default, which
            # can't be written to a text-mode file — the TypeError would
            # mask the real scraping error. Request a unicode string.
            o.write(html.tostring(page, encoding="unicode"))
        raise e
    return chapter