def get_urls_from_page(url, configuration=None, normalize=False):
    """Fetch the page at *url* and extract URLs from it.

    The page is fetched through a site adapter when one is available so
    that the adapter's own fetch settings apply; otherwise a plain
    cookie-aware opener is used.  The fetched data is handed to
    get_urls_from_html() and its result is returned unchanged.

    Args:
        url: Address of the page to fetch.
        configuration: Configuration to use.  When falsy, a lightweight
            Configuration for "test1.com" / "EPUB" is created.
        normalize: Passed through to get_urls_from_html().

    Returns:
        Whatever get_urls_from_html() returns for the fetched page.

    Raises:
        Any exception other than UnknownSite raised while fetching is
        propagated to the caller.
    """
    if not configuration:
        configuration = Configuration(["test1.com"], "EPUB", lightweight=True)

    try:
        adapter = adapters.getAdapter(configuration, url, anyurl=True)

        # Special stuff to log into archiveofourown.org, if possible.
        # Unlike most sites that show the links to 'adult' stories but
        # protect them, AO3 doesn't even show them if not logged in.
        # Only works with saved user/pass--not going to prompt for list.
        if 'archiveofourown.org' in url and adapter.getConfig("username"):
            if adapter.getConfig("is_adult"):
                # Append to an existing query string, or start one.
                addurl = ("&" if '?' in url else "?") + "view_adult=true"
            else:
                addurl = ""
            # Just to get an authenticity_token.
            data = adapter._fetchUrl(url + addurl)
            # Log in the session; the list page itself is fetched below
            # with the logged-in session.
            adapter.performLogin(url, data)

        if 'fimfiction.net' in url and adapter.getConfig("is_adult"):
            # The result of this fetch is discarded; the page is fetched
            # again below after the adult cookie has been set.
            adapter._fetchUrl(url)
            adapter.set_adult_cookie()

        if 'tthfanfic.org' in url and adapter.getConfig("is_adult"):
            ## Simple fetch works in testing, but actual pages use a
            ## POST and have a 'ctkn' value, so we do too.
            adapter.setSiteMaxRating(url)

        # This way it uses User-Agent or other special settings.
        data = adapter._fetchUrl(url, usecache=False)
    except UnknownSite:
        # No adapter with anyurl=True, must be a random site.
        opener = u2.build_opener(u2.HTTPCookieProcessor(), GZipProcessor())
        response = opener.open(url)
        try:
            data = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

    # Kludge because I don't see it on enough sites to be worth
    # generalizing yet.
    restrictsearch = None
    if 'scarvesandcoffee.net' in url:
        restrictsearch = ('div', {'id': 'mainpage'})

    return get_urls_from_html(data, url, configuration, normalize, restrictsearch)
def __init__(self, sections, fileform, lightweight=False):
    """Set up the section search list and the HTTP/cache state.

    Args:
        sections: Sequence of section names.  The LAST entry is taken as
            the site domain name; any earlier entries are registered as
            additional sections.  Must contain at least one entry.
        fileform: Output format name.  When truthy, the format itself and
            a "<section>:<fileform>" variant of each section are
            registered as well.
        lightweight: Stored as ``self.lightweight``.
    """
    site = sections[-1]  # last section is the site DN.
    other_sections = sections[:-1]
    ConfigParser.SafeConfigParser.__init__(self)

    self.lightweight = lightweight
    self.use_pagecache = False  # default to false for old adapters.

    self.linenos = {}  # key by section or section,key -> lineno

    ## [injected] section has even less priority than [defaults]
    self.sectionslist = ['defaults', 'injected']

    ## add other sections (not including site DN) after defaults,
    ## but before site-specific.
    for section in other_sections:
        self.addConfigSection(section)

    # Register the site both with and without a leading "www.".
    www_prefix = "www."
    if site.startswith(www_prefix):
        sitewith = site
        # Strip only the leading prefix; str.replace() would also remove
        # any later "www." occurrences inside the name.
        sitewithout = site[len(www_prefix):]
    else:
        sitewith = www_prefix + site
        sitewithout = site

    self.addConfigSection(sitewith)
    self.addConfigSection(sitewithout)

    if fileform:
        self.addConfigSection(fileform)
        ## add other sections:fileform (not including site DN)
        ## after fileform, but before site-specific:fileform.
        for section in other_sections:
            self.addConfigSection(section + ":" + fileform)
        self.addConfigSection(sitewith + ":" + fileform)
        self.addConfigSection(sitewithout + ":" + fileform)
    self.addConfigSection("overrides")

    self.listTypeEntries = get_valid_list_entries()
    self.validEntries = get_valid_entries()

    self.url_config_set = False
    self.override_sleep = None

    # Opener shares this instance's cookie jar.
    self.cookiejar = self.get_empty_cookiejar()
    self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar), GZipProcessor())

    self.pagecache = self.get_empty_pagecache()
def get_urls_from_page(url, configuration=None, normalize=False):
    """Fetch the page at *url* and extract URLs from it.

    The page is fetched through a site adapter when one is available so
    that the adapter's own fetch settings apply; otherwise a plain
    cookie-aware opener is used.  The fetched data is handed to
    get_urls_from_html() and its result is returned unchanged.

    Args:
        url: Address of the page to fetch.
        configuration: Configuration to use.  When falsy, a Configuration
            for "test1.com" / "EPUB" is created.
        normalize: Passed through to get_urls_from_html().

    Returns:
        Whatever get_urls_from_html() returns for the fetched page.

    Raises:
        Any exception other than UnknownSite raised while fetching is
        propagated to the caller.
    """
    if not configuration:
        # NOTE(review): this passes a bare site string.  The
        # __init__(self, sections, fileform, lightweight=False) defined
        # elsewhere in this file reads sections[-1]; if that is
        # Configuration's constructor, a one-element list is expected
        # here instead -- confirm which signature applies.
        configuration = Configuration("test1.com", "EPUB")

    try:
        adapter = adapters.getAdapter(configuration, url, anyurl=True)

        # Special stuff to log into archiveofourown.org, if possible.
        # Unlike most sites that show the links to 'adult' stories but
        # protect them, AO3 doesn't even show them if not logged in.
        # Only works with saved user/pass--not going to prompt for list.
        if 'archiveofourown.org' in url and adapter.getConfig("username"):
            if adapter.getConfig("is_adult"):
                # Append to an existing query string, or start one.
                addurl = ("&" if '?' in url else "?") + "view_adult=true"
            else:
                addurl = ""
            # Just to get an authenticity_token.
            data = adapter._fetchUrl(url + addurl)
            # Log in the session; the list page itself is fetched below
            # with the logged-in session.
            adapter.performLogin(url, data)

        # This way it uses User-Agent or other special settings.  Only AO3
        # is doing login.
        data = adapter._fetchUrl(url, usecache=False)
    except UnknownSite:
        # No adapter with anyurl=True, must be a random site.
        opener = u2.build_opener(u2.HTTPCookieProcessor(), GZipProcessor())
        response = opener.open(url)
        try:
            data = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

    # Kludge because I don't see it on enough sites to be worth
    # generalizing yet.
    restrictsearch = None
    if 'scarvesandcoffee.net' in url:
        restrictsearch = ('div', {'id': 'mainpage'})

    return get_urls_from_html(data, url, configuration, normalize, restrictsearch)
def set_cookiejar(self, cj):
    """Replace the cookie jar and rebuild the opener around it.

    The ``addheaders`` list of the previous opener is carried over to the
    newly built one.

    Args:
        cj: Cookie jar object to store as ``self.cookiejar`` and to hand
            to the opener's HTTPCookieProcessor.
    """
    self.cookiejar = cj
    # Remember the current headers before the opener is replaced.
    previous_headers = self.opener.addheaders
    cookie_handler = u2.HTTPCookieProcessor(self.cookiejar)
    self.opener = u2.build_opener(cookie_handler, GZipProcessor())
    self.opener.addheaders = previous_headers
def __init__(self):
    """Create an empty cookie jar and an opener that uses it."""
    self.cookiejar = self.get_empty_cookiejar()
    # The opener is built with a cookie processor bound to this
    # instance's jar plus a GZipProcessor handler.
    cookie_handler = urllib2.HTTPCookieProcessor(self.cookiejar)
    self.opener = urllib2.build_opener(cookie_handler, GZipProcessor())