Example #1
def get_urls_from_page(url, configuration=None, normalize=False):

    if not configuration:
        configuration = Configuration(["test1.com"], "EPUB", lightweight=True)

    data = None
    adapter = None
    try:
        adapter = adapters.getAdapter(configuration, url, anyurl=True)

        # Special handling to log into archiveofourown.org, if possible.
        # Unlike most sites, which show the links to 'adult' stories but
        # protect them, AO3 doesn't show them at all unless logged in.
        # Only works with saved user/pass--not going to prompt for the list.
        if 'archiveofourown.org' in url:
            if adapter.getConfig("username"):
                if adapter.getConfig("is_adult"):
                    if '?' in url:
                        addurl = "&view_adult=true"
                    else:
                        addurl = "?view_adult=true"
                else:
                    addurl = ""
                # fetch once just to get an authenticity_token.
                data = adapter._fetchUrl(url + addurl)
                # log the session in.
                adapter.performLogin(url, data)
                # the list page itself is fetched below with the logged-in session.

        if 'fimfiction.net' in url and adapter.getConfig("is_adult"):
            data = adapter._fetchUrl(url)
            adapter.set_adult_cookie()

        if 'tthfanfic.org' in url and adapter.getConfig("is_adult"):
            ## A simple fetch works in testing, but the actual pages use a
            ## POST with a 'ctkn' value, so we do too.
            # adapter._fetchUrl("https://www.tthfanfic.org/setmaxrating.php?sitemaxrating=5")
            adapter.setSiteMaxRating(url)

        # this way it uses User-Agent or other special settings.
        data = adapter._fetchUrl(url, usecache=False)
    except UnknownSite:
        # no adapter with anyurl=True, must be a random site.
        opener = u2.build_opener(u2.HTTPCookieProcessor(), GZipProcessor())
        data = opener.open(url).read()

    # kludge because I don't see it on enough sites to be worth generalizing yet.
    restrictsearch = None
    if 'scarvesandcoffee.net' in url:
        restrictsearch = ('div', {'id': 'mainpage'})

    return get_urls_from_html(data, url, configuration, normalize,
                              restrictsearch)
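A minimal usage sketch for the function above, assuming it returns an iterable of story URLs; the listing URL and the loop around the result are hypothetical, only get_urls_from_page() itself comes from the example:

# Hypothetical caller: crawl a list page and print each story URL found.
list_url = "https://archiveofourown.org/users/someauthor/works"
for story_url in get_urls_from_page(list_url, normalize=True):
    print(story_url)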
Example #2
    def __init__(self, sections, fileform, lightweight=False):
        site = sections[-1]  # the last section is the site DN.
        ConfigParser.SafeConfigParser.__init__(self)

        self.lightweight = lightweight
        self.use_pagecache = False  # default to false for old adapters.

        self.linenos = dict()  # key by section or section,key -> lineno

        ## the [injected] section has even lower priority than [defaults]
        self.sectionslist = ['defaults', 'injected']

        ## add other sections (not including site DN) after defaults,
        ## but before site-specific.
        for section in sections[:-1]:
            self.addConfigSection(section)

        if site.startswith("www."):
            sitewith = site
            sitewithout = site.replace("www.", "")
        else:
            sitewith = "www." + site
            sitewithout = site

        self.addConfigSection(sitewith)
        self.addConfigSection(sitewithout)

        if fileform:
            self.addConfigSection(fileform)
            ## add other sections:fileform (not including site DN)
            ## after fileform, but before site-specific:fileform.
            for section in sections[:-1]:
                self.addConfigSection(section + ":" + fileform)
            self.addConfigSection(sitewith + ":" + fileform)
            self.addConfigSection(sitewithout + ":" + fileform)
        self.addConfigSection("overrides")

        self.listTypeEntries = get_valid_list_entries()

        self.validEntries = get_valid_entries()

        self.url_config_set = False

        self.override_sleep = None
        self.cookiejar = self.get_empty_cookiejar()
        self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),
                                      GZipProcessor())

        self.pagecache = self.get_empty_pagecache()
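To make the layering in __init__ easier to follow, here is a small standalone sketch that mirrors only the section-name buildup above. It assumes addConfigSection() simply records each name in the order given; the section list and fileform in the example call are made up for illustration:

# Illustration only: reproduces the order in which section names are
# registered by Configuration.__init__ above.
def config_section_names(sections, fileform):
    site = sections[-1]  # last entry is the site DN.
    names = ["defaults", "injected"]
    names.extend(sections[:-1])
    if site.startswith("www."):
        sitewith, sitewithout = site, site.replace("www.", "")
    else:
        sitewith, sitewithout = "www." + site, site
    names.extend([sitewith, sitewithout])
    if fileform:
        names.append(fileform)
        names.extend(s + ":" + fileform for s in sections[:-1])
        names.extend([sitewith + ":" + fileform, sitewithout + ":" + fileform])
    names.append("overrides")
    return names

# Hypothetical call:
# config_section_names(["base_efiction", "scarvesandcoffee.net"], "epub")
# -> ['defaults', 'injected', 'base_efiction', 'www.scarvesandcoffee.net',
#     'scarvesandcoffee.net', 'epub', 'base_efiction:epub',
#     'www.scarvesandcoffee.net:epub', 'scarvesandcoffee.net:epub', 'overrides']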
Example #3
def get_urls_from_page(url, configuration=None, normalize=False):

    if not configuration:
        configuration = Configuration("test1.com", "EPUB")

    data = None
    adapter = None
    try:
        adapter = adapters.getAdapter(configuration, url, anyurl=True)

        # Special handling to log into archiveofourown.org, if possible.
        # Unlike most sites, which show the links to 'adult' stories but
        # protect them, AO3 doesn't show them at all unless logged in.
        # Only works with saved user/pass--not going to prompt for the list.
        if 'archiveofourown.org' in url:
            if adapter.getConfig("username"):
                if adapter.getConfig("is_adult"):
                    if '?' in url:
                        addurl = "&view_adult=true"
                    else:
                        addurl = "?view_adult=true"
                else:
                    addurl = ""
                # fetch once just to get an authenticity_token.
                data = adapter._fetchUrl(url + addurl)
                # log the session in.
                adapter.performLogin(url, data)
                # the list page itself is fetched below with the logged-in session.

        # this way it uses User-Agent or other special settings.  Only AO3
        # is doing login.
        data = adapter._fetchUrl(url, usecache=False)
    except UnknownSite:
        # no adapter with anyurl=True, must be a random site.
        opener = u2.build_opener(u2.HTTPCookieProcessor(), GZipProcessor())
        data = opener.open(url).read()

    # kludge because I don't see it on enough sites to be worth generalizing yet.
    restrictsearch = None
    if 'scarvesandcoffee.net' in url:
        restrictsearch = ('div', {'id': 'mainpage'})

    return get_urls_from_html(data, url, configuration, normalize,
                              restrictsearch)
Example #4
    def set_cookiejar(self, cj):
        self.cookiejar = cj
        saveheaders = self.opener.addheaders
        self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),
                                      GZipProcessor())
        self.opener.addheaders = saveheaders
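One way set_cookiejar() above could be used is to persist a login session between runs. The cookie filename, the configuration instance, and the surrounding calls are assumptions for illustration; only set_cookiejar() itself comes from the example:

import cookielib

jar = cookielib.LWPCookieJar("cookies.lwp")  # hypothetical cookie file
try:
    jar.load(ignore_discard=True, ignore_expires=True)
except IOError:
    pass  # first run: no saved cookies yet.
configuration.set_cookiejar(jar)  # rebuilds the opener, keeps addheaders.
# ... fetch pages through configuration.opener ...
jar.save(ignore_discard=True, ignore_expires=True)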
Example #5
    def __init__(self):
        self.cookiejar = self.get_empty_cookiejar()
        self.opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(self.cookiejar), GZipProcessor())
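GZipProcessor is used in every snippet but never defined here. The sketch below shows what such a urllib2 handler typically looks like (request gzip, transparently decompress gzipped responses); it is an assumption about its behavior, not the project's actual implementation:

import gzip
import urllib2
from StringIO import StringIO
from urllib import addinfourl

class GZipProcessor(urllib2.BaseHandler):
    # Sketch: advertise gzip support on outgoing requests and decompress
    # gzip-encoded response bodies before handing them back to the caller.
    def http_request(self, req):
        req.add_header("Accept-Encoding", "gzip")
        return req
    https_request = http_request

    def http_response(self, req, resp):
        if resp.headers.get("Content-Encoding") == "gzip":
            body = gzip.GzipFile(fileobj=StringIO(resp.read())).read()
            new_resp = addinfourl(StringIO(body), resp.headers,
                                  resp.geturl(), resp.code)
            new_resp.msg = resp.msg
            return new_resp
        return resp
    https_response = http_response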