def get_search_ids(search_url):
    """Yield every interactive-story id found across all pages of a search.

    Args:
        search_url: URL of the first search-results page.

    Yields:
        Story ids scraped from each paginated results page.
    """
    urls = list(all_search_urls(search_url))
    for idx, page_url in enumerate(urls):
        print('# Gathering ids from page {}/{}'.format(idx + 1, len(urls)))
        # FIX: `yield from` replaces a loop whose variable shadowed the
        # builtin `id`.
        yield from get_search_page_interactive_ids(get_page(page_url))
def all_search_urls(search_url):
    """Yield one search URL per results page, derived from the page dropdown.

    The first page is fetched to read the pagination <select>; each of its
    option values is substituted into the original URL's query string.
    """
    first_page = get_page(search_url)
    pager = first_page.xpath(search_pages_dropdown_xp)[0]
    for option in pager.value_options:
        page_url = re.sub(r'&page=(\d+)', '&page=' + option, search_url)
        # resort_page is zero-based while page is one-based.
        page_url = re.sub(r'&resort_page=(\d+)',
                          '&resort_page=' + str(int(option) - 1), page_url)
        yield page_url
# 예제 #3 (Example #3 — separator artifact from the pasted source; the
# stray "0" below it was a vote-count artifact, folded into this comment)
def get_recent_chapters(story_id):
    """Map each chapter's descent link to its posted date.

    Fetches the story's recent-chapters page and returns a dict of
    {descent string: parsed datetime}.
    """
    url = "https://www.writing.com/main/interactive-story/item_id/" + story_id + "/action/recent_chapters"
    page = get_page(url)

    # Keep re-requesting while the site serves its refusal page.
    while hasServerRefusal(page):
        page = get_page(url)

    # Everything up to and including the last '/' of the first link is the
    # common URL prefix; strip it to leave only the descent string.
    prefix_len = page.xpath(recent_link_xp)[0].rfind("/") + 1

    dates_by_descent = {}
    for entry in page.xpath(recent_elements_xp):
        # the descent
        descent = entry.xpath(recent_link_xp)[0][prefix_len:]
        posted = parse_short_date_time(" ".join(entry.xpath(recent_date_xp)))
        dates_by_descent[descent] = posted

    return dates_by_descent
# 예제 #4 (Example #4 — separator artifact from the pasted source; the
# stray "0" below it was a vote-count artifact, folded into this comment)
def get_outline(story_id):
    """Gets a list of all possible chapters for scraping"""
    url = "https://www.writing.com/main/interact/item_id/" + story_id + "/action/outline"
    page = get_page(url)

    # Retry until the server stops refusing the request.
    while hasServerRefusal(page):
        page = get_page(url)

    links = page.xpath(outline_chapters_xpath)

    # Pull the URL and find the last / to cut off the preceding URL prefix,
    # leaving just the descent string for each chapter.
    cutoff = links[0].attrib['href'].rfind("/") + 1

    return [anchor.attrib['href'][cutoff:] for anchor in links]
# 예제 #5 (Example #5 — separator artifact from the pasted source; the
# stray "0" below it was a vote-count artifact, folded into this comment)
def get_story_info(story_id):
    """Scrape a story's landing page into a StoryInfo record.

    Args:
        story_id: Numeric story id as a string.

    Returns:
        StoryInfo on success; -1 if the item is private; False if the item
        was deleted. (The mixed sentinel types are kept for caller
        compatibility -- callers elsewhere may test for these exact values.)
    """
    url = "https://www.writing.com/main/interact/item_id/" + story_id
    page = get_page(url)

    # Retry while the site serves its "heavy server volume" refusal page.
    while hasServerRefusal(page):
        page = get_page(url)

    # Hoisted: the original called text_content().lower() once per check.
    body_text = page.text_content().lower()

    # Private item: can't access, can not scrape.
    if body_text.find('is a private item.') >= 0:
        return -1

    # Deleted item: can't access, can not scrape.
    if body_text.find("wasn't found within writing.com") >= 0:
        return False

    # FIX: removed a pointless `try/except Exception as e: raise e` wrapper
    # that only re-raised; exceptions now propagate directly.
    return StoryInfo(
        id=int(page.xpath(story_id_xp)[0]),
        pretty_title=page.xpath(story_title_xp)[0],
        # Strip the portfolio URL prefix to leave just the author id.
        author_id=page.xpath(story_author_link_xp)[0]
        [len("https://www.Writing.Com/main/portfolio/view/"):],
        author_name=page.xpath(story_author_name_xp)[0],
        description=html.tostring(page.xpath(story_description_xp)[0],
                                  encoding="unicode",
                                  with_tail=False),
        brief_description=page.xpath(story_brief_description_xp)[0],
        # Created/modified each span two xpath results (date + time parts).
        created=parse_date_time(
            page.xpath(story_created_date)[0] +
            page.xpath(story_created_date)[1]),
        modified=parse_date_time(
            page.xpath(story_modified_date)[0] +
            page.xpath(story_modified_date)[1]),
        image_url=page.xpath(story_image_url_xp)[0],
        last_full_update=None)
def get_chapter(url):
    """Scrape a single chapter page into a ChapterInfo.

    Args:
        url: Full chapter URL; the trailing path segment ('0' or '1' for
            the intro chapter) selects which xpath layout variant to use.

    Returns:
        ChapterInfo with title, content, author name (or is_author_past
        flag), and the list of choices.
    """
    chapter = get_page(url)

    # Did we hit a "heavy server volume" error?
    assertNotServerRefusal(chapter)

    data = ChapterInfo()

    # Intro chapters ('0'/'1') use a different page layout, so the xpath
    # templates are formatted with '1' for them and '0' otherwise.
    suff = url[url.rfind('/') + 1:]
    xpathformat = '1' if suff in ('0', '1') else '0'

    def xpath(path):
        return chapter.xpath(path.format(xpathformat))

    # Title
    data.title = encodingBruteForce(xpath(chapter_title_xp)[0])

    # Chapter content
    # Example chapter https://www.writing.com/main/interact/item_id/1924673-Acquiring-Powers-2/map/114331111
    # Hack to convert: take the latin-1 string, convert it back to bytes,
    # and decode it as unicode.
    chapterText = html.tostring(xpath(chapter_content_xp)[0],
                                encoding="unicode")
    data.content = encodingBruteForce(chapterText)

    # Gives 3 results if member isn't deleted, else 1:
    #   1st is the interactive's creator,
    #   2nd is the chapter's author,
    #   3rd is the chapter's author again, in the copyright link.
    author_name = xpath(chapter_member_name_xp)

    # FIX: original used `len(author_name) is 1` -- identity comparison
    # with an int literal; value equality (==) is the correct test.
    if len(author_name) == 1:
        data.is_author_past = True
    else:
        data.author_name = author_name[1].text_content()
        # NOTE(review): a comment here was garbled in the source ("Grab the
        # title, split by new line, and cut off 'Username: '..."); the
        # original may have post-processed the name from the element's
        # 'title' attribute instead -- verify against live pages.

    # Choices
    for choice in xpath(chapter_choices_xp):
        data.choices.append(encodingBruteForce(choice.text_content()))
    return data
def get_story_info(url, award_banner=False):
    """Scrape a story landing page into a StoryInfo.

    Args:
        url: Story page URL.
        award_banner: When True, format the xpaths for the award-banner
            page layout. On a scrape failure with the default layout the
            function retries itself once with award_banner=True.

    Returns:
        Populated StoryInfo.
    """
    story = get_page(url)
    data = StoryInfo()

    assertNotServerRefusal(story)

    # The xpath templates differ when the page shows an award banner.
    formatter = "1" if award_banner else "0"

    def xpath(path):
        return story.xpath(path.format(formatter))

    try:
        # pretty title
        data.pretty_title = xpath(story_title_xp)[0]

        # author id: everything after the last '/' of the author link
        authorlink = xpath(story_author_id_xp)[0].attrib['href']
        data.author_id = authorlink[authorlink.rfind('/') + 1:]

        data.author_name = xpath(story_author_name_xp)[0]

        # description (serialized HTML, tail text excluded)
        data.description = html.tostring(xpath(story_description_xp)[0],
                                         encoding="unicode",
                                         with_tail=False)

        # brief description
        data.brief_description = xpath(story_brief_description_xp)[0]

        # image url from the OpenGraph meta tag
        data.image_url = xpath(
            '//meta[@property="og:image"]')[0].attrib["content"]

    except Exception:
        # A miss with the normal layout may mean the page actually has an
        # award banner -- retry once with the alternate xpaths.
        if award_banner:
            # FIX: bare `raise` (was `raise e`) preserves the original
            # traceback cleanly.
            raise
        return get_story_info(url, True)

    return data
def get_chapter_list(story_id):
    """Return (descents, names) for every chapter in the story's outline.

    Args:
        story_id: Numeric story id as a string.

    Returns:
        Tuple of (list of descent strings, list of chapter display names).
    """
    url = "http://www.writing.com/main/interact/item_id/" + story_id + "/action/outline"
    while True:
        outline = get_page(url)

        # Did we hit a "heavy server volume" error?
        # FIX: was `hasServerRefusal(outline) is False` -- an identity
        # check against a bool; `not ...` is the correct, robust test.
        if not hasServerRefusal(outline):
            break

    descents = []
    names = []
    outline_links = outline.xpath(outline_chapters_xpath)

    # Links default to https now; rather than hard-code the prefix length,
    # pull the URL and find the last '/' to cut off everything before it.
    url_cutoff = outline_links[0].attrib['href'].rfind("/") + 1

    for a_element in outline_links:
        descents.append(a_element.attrib['href'][url_cutoff:])
        names.append(encodingBruteForce(a_element.text_content()))

    return descents, names
# 예제 #9 (Example #9 — separator artifact from the pasted source; the
# stray "0" below it was a vote-count artifact, folded into this comment)
def get_chapter(url):
    """Scrape a chapter page into a Chapter record.

    Args:
        url: Full chapter URL.

    Returns:
        Chapter with title, id, serialized content, author info, choices
        (None when the chapter has no choice links), and created date.

    Raises:
        ServerRefusal: When the site returns its "heavy server volume"
            page even after disabling premium inline-chapter mode.
    """
    page = get_page(url)

    if hasServerRefusal(page):
        raise ServerRefusal('Heavy Server Volume')

    # Error: premium mode for inline chapters is on. Disable it via the
    # account-settings endpoint and try again.
    if len(page.xpath(chapter_content_xp)) == 0:
        get_page(
            "https://www.writing.com/main/my_account?action=set_q_i2&ajax=setDynaOffOn&val=-1"
        )
        page = get_page(url)
        if hasServerRefusal(page):
            raise ServerRefusal('Heavy Server Volume')

    try:
        # Choices: None (not []) when the chapter ends the story.
        choice_elements = page.xpath(chapter_choices_xp)
        choices = [choice.text_content() for choice in choice_elements]
        if not choice_elements:
            choices = None

        # Author id: strip the portfolio URL prefix; None for deleted members.
        if len(page.xpath(chapter_author_link_xp)) != 0:
            author_id = page.xpath(chapter_author_link_xp)[0][
                len("https://www.Writing.Com/main/portfolio/view/"):]
        else:
            author_id = None

        # Author name: primary xpath first, then a fallback layout whose
        # match carries a 3-character prefix to strip.
        if len(page.xpath(chapter_author_name_xp)) != 0:
            author_name = page.xpath(chapter_author_name_xp)[0]
        elif len(page.xpath(chapter_author_name_xp_2)) != 0:
            author_name = page.xpath(chapter_author_name_xp_2)[0][3:].strip()
            if author_name == "":
                author_name = None
        else:
            author_name = None

        if len(page.xpath(chapter_title_xp)) != 0:
            title = page.xpath(chapter_title_xp)[0]
        else:
            title = None

        chapter = Chapter(
            title=title,
            id=int(page.xpath(chapter_id_xp)[0]),
            content=html.tostring(page.xpath(chapter_content_xp)[0],
                                  encoding="unicode"),
            author_id=author_id,
            author_name=author_name,
            choices=choices,
            created=parse_date(page.xpath(chapter_created_date_xp)[0]))
    except Exception as e:
        print("Scraping error at " + url)
        with open('scrapingerror.html', 'w', encoding='utf-8') as o:
            # BUG FIX: html.tostring() returns bytes by default, which
            # raised TypeError on this text-mode file and masked the real
            # error; encoding="unicode" yields a writable str.
            o.write(html.tostring(page, encoding="unicode"))
        raise e