def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id comes from the ``sid`` query value.  The first path
    segment decides which section of the site applies, and with it the
    category and the date format.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    # URL validation guarantees the query is only sid=1234.
    query_parts = self.parsedUrl.query.split("=")
    self.story.setMetadata("storyId", query_parts[1])

    path_parts = self.parsedUrl.path.split("/")
    if path_parts[1] != "wiktt":
        self.story.addToList("category", "Originals")
        self.section = "/efiction/"
        self.dateformat = "%b %d, %Y"
    else:
        self.story.addToList("category", "Harry Potter")
        self.section = "/wiktt/efiction/"
        self.dateformat = "%m/%d/%Y"

    # Normalized story URL.
    normalized = "".join([
        "http://",
        self.getSiteDomain(),
        self.section,
        "viewstory.php?sid=",
        self.story.getMetadata("storyId"),
    ])
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata("siteabbrev", "msq")
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the ``sid`` query value and the section is the first
    path segment; the section also selects the Stargate category.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    # URL validation guarantees the query is only sid=1234.
    sid = self.parsedUrl.query.split('=')[1]
    self.story.setMetadata('storyId', sid)

    self.section = self.parsedUrl.path.split('/')[1]

    # Normalized story URL.
    pieces = [
        'http://',
        self.getSiteDomain(),
        '/',
        self.section,
        '/viewstory.php?sid=',
        self.story.getMetadata('storyId'),
    ]
    self._setURL(''.join(pieces))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'sghp')

    # The site itself does not label its stories with a category, so it
    # is done here.  extracategories can't be used because the section
    # may be Atlantis or SG-1.
    if 'atlantis' not in self.section:
        category = "Stargate: SG-1"
    else:
        category = "Stargate: Atlantis"
    self.story.addToList("category", category)

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%Y.%m.%d"
def __init__(self, config, url):
    """Initialise the adapter and normalize the story URL.

    The story id is the third path component with any ``-ch-NN`` chapter
    suffix removed.  The URL handed to ``_setURL`` has its ``?page=...``
    suffix stripped.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["utf8", "Windows-1252"]

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'litero')

    # Normalize to first chapter.  Not sure if they ever have more than
    # 2 digits.
    storyId = self.parsedUrl.path.split('/')[2]
    # Replace later chapters with first chapter but don't remove numbers
    # from the URL that disambiguate stories with the same title.
    storyId = re.sub(r"-ch-?\d\d", "", storyId)
    self.story.setMetadata('storyId', storyId)

    # Accept m(mobile) url, but use www.  The replacement has to be a raw
    # string: a plain "\1" is the control character U+0001, not a
    # back-reference to the captured host prefix.
    # NOTE(review): the pattern is anchored with ^ on a host prefix, so it
    # can only match a url that does not start with a scheme -- confirm
    # what callers pass in.
    url = re.sub(
        r"^(www|german|spanish|french|dutch|italian|romanian|portuguese|other)\.i",
        r"\1",
        url)

    # Strip ?page=...
    url = re.sub(r"\?page=.*$", "", url)

    self._setURL(url)

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = '%m/%d/%y'
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the fourth path component and the section is the
    second, e.g. ``/fanfiction/story/7528/1`` or ``/spark/story/7299/1``.
    The section also selects the date format.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    segments = self.parsedUrl.path.split('/')
    self.story.setMetadata('storyId', segments[3])

    # www.dokuga.com has two 'sections', shown in the URL as 'fanfiction'
    # and 'spark', that change how things should be handled.
    self.section = segments[1]

    # Normalized story URL.
    normalized = ''.join([
        'http://',
        self.getSiteDomain(),
        '/',
        segments[1],
        '/story/',
        self.story.getMetadata('storyId'),
    ])
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'dkg')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d %b %Y" if 'fanfiction' in self.section else "%m-%d-%y"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is read from the first non-empty of the ``id``, ``id2``
    and ``id3`` groups of the site URL pattern, and the story URL is
    normalized to the ``/fanfic/view_st.php/<id>`` form.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)
    self.story.setMetadata('siteabbrev', 'mm')

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    m = re.match(self.getSiteURLPattern(), url)
    if not m:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    # Store whichever group actually captured the id.  Each branch must
    # store the group it tested; storing 'id2' when only 'id3' matched
    # would record an empty id.
    for group_name in ('id', 'id2', 'id3'):
        captured = m.group(group_name)
        if captured:
            self.story.setMetadata('storyId', captured)
            break

    # Normalized story URL.
    self._setURL('http://' + self.getSiteDomain()
                 + '/fanfic/view_st.php/'
                 + self.story.getMetadata('storyId'))
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The ``id`` and ``cat`` groups of the site URL pattern are used to
    build the normalized ``/archive/<cat>/<id>.shtml`` URL.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)
    self.is_adult = False

    match = re.match(self.getSiteURLPattern(), url)
    if not match:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId', match.group('id'))

    # Normalized story URL.
    normalized = ''.join([
        'https://',
        self.getSiteDomain(),
        '/archive/',
        match.group('cat'),
        '/',
        self.story.getMetadata('storyId'),
        '.shtml',
    ])
    self._setURL(normalized)

    # Each adapter needs to have a unique abbreviation, which is set here.
    self.story.setMetadata('siteabbrev', 'fga')

    # Date format; the list of usable directives is at
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%m/%d/%y"

    # This site has the entire story on one page, so the page is kept
    # here to avoid using bandwidth to fetch it again for chapter text.
    self.html = ''
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the value after ``=`` in the query string and the
    zone is the first label of the host name; both go into the
    normalized URL, and the zone also prefixes the site abbreviation.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    # Story id from the query string.
    story_id = self.parsedUrl.query.split('=')[1]
    self.story.setMetadata('storyId', story_id)

    # Setting the 'Zone' for each "Site".
    host_labels = self.parsedUrl.netloc.split('.')
    self.zone = host_labels[0]

    # Normalized story URL.  (Checking self.zone against a list was
    # removed -- it was redundant with getAcceptDomains and
    # getSiteURLPattern both.)
    normalized = 'http://{0}.{1}/story.php?no={2}'.format(
        self.zone,
        self.getBaseDomain(),
        self.story.getMetadata('storyId'))
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', self.zone + 'aff')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%Y-%m-%d"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the ``sid`` query value; the story URL is normalized
    to ``/viewstory.php?sid=<id>`` on the site domain.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    # URL validation guarantees the query is only sid=1234.
    sid = self.parsedUrl.query.split('=')[1]
    self.story.setMetadata('storyId', sid)

    # Normalized story URL.
    normalized = ''.join([
        'http://',
        self.getSiteDomain(),
        '/viewstory.php?sid=',
        self.story.getMetadata('storyId'),
    ])
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'efp')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d/%m/%y"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    Path layout is ``/<section>/story/<id>/...``: the section is the
    second path component and the story id the fourth.  The section
    also determines the date format.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    path_parts = self.parsedUrl.path.split('/')
    self.story.setMetadata('storyId', path_parts[3])

    # www.dokuga.com has two 'sections', shown in the URL as
    # 'fanfiction' and 'spark', that change how things should be handled:
    #   http://www.dokuga.com/fanfiction/story/7528/1
    #   http://www.dokuga.com/spark/story/7299/1
    section_name = path_parts[1]
    self.section = section_name

    # Normalized story URL.
    base = 'http://' + self.getSiteDomain() + '/' + section_name
    self._setURL(base + '/story/' + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'dkg')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    if 'fanfiction' not in self.section:
        self.dateformat = "%m-%d-%y"
    else:
        self.dateformat = "%d %b %Y"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    Both URL styles are handled through the site URL pattern's ``id``
    group:
        https://inkbunny.net/submissionview.php?id=1342100  (old style)
        https://inkbunny.net/s/1234567                      (new style)
    The story URL is normalized to the ``/s/<id>`` form.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # If left empty, site doesn't return any message at all.
    self.username = '******'
    self.password = ''
    self.is_adult = False

    match = re.match(self.getSiteURLPattern(), url)
    if not match:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId', match.group('id'))

    # Normalized story URL: gets rid of the chapter if there, leaving
    # the chapter index URL.
    normalized = ''.join([
        "https://",
        self.getSiteDomain(),
        "/s/",
        self.story.getMetadata('storyId'),
    ])
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'ibnet')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d %b %Y %H:%M"

    # This is a 1 story/page site, so the soup is kept on the instance
    # for the chapter-text step.
    self.soup = None
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the run of digits after ``/fiction/`` in the URL
    path; anything after the id is ignored.  The story URL is normalized
    to ``/fiction/<id>`` on the site domain.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["utf8", "Windows-1252"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    # The optional tail after the id is a non-capturing group, (?:...).
    # Written as "(:/.+)" it would demand a literal ':' after the id, so
    # a path such as /fiction/1234/some-title would not match at all.
    path_match = re.match(r'/fiction/(\d+)(?:/.*)?$', self.parsedUrl.path)
    self.story.setMetadata('storyId', path_match.group(1))

    # Normalized story URL.
    self._setURL('http://' + self.getSiteDomain()
                 + '/fiction/'
                 + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'rylrdl')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    # NOTE(review): %H (24-hour) is combined with %p (AM/PM) -- confirm
    # against the dates the site actually emits.
    self.dateformat = '%d/%m/%Y %H:%M:%S %p'
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the ``sid`` query value; the story URL is normalized
    to ``/fiction/viewstory.php?sid=<id>`` on the site domain.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.is_adult = False

    # URL validation guarantees the query is only sid=1234.
    sid = self.parsedUrl.query.split('=')[1]
    self.story.setMetadata('storyId', sid)

    # Normalized story URL.  This site keeps its stories under
    # /fiction; most sites have no such prefix.
    normalized = ''.join([
        'http://',
        self.getSiteDomain(),
        '/fiction/viewstory.php?sid=',
        self.story.getMetadata('storyId'),
    ])
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'btf')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d %b %Y"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The ``id`` and ``name`` groups of the site URL pattern are used to
    build the normalized ``/blog/archive/<id>-<name>/`` URL.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    match = re.match(self.getSiteURLPattern(), url)
    if not match:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata("storyId", match.group("id"))

    # Normalized story URL.
    pieces = [
        "http://www.",
        self.getSiteDomain(),
        "/blog/archive/",
        self.story.getMetadata("storyId"),
        "-",
        match.group("name"),
        "/",
    ]
    self._setURL("".join(pieces))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata("siteabbrev", "idn")

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d %B %Y"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the ``sid`` query value and the section is the first
    path segment; the section also determines the date format.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    # URL validation guarantees the query is only sid=1234.
    sid = self.parsedUrl.query.split('=')[1]
    self.story.setMetadata('storyId', sid)

    # pommedesang.com has two 'sections', shown in the URL as 'efiction'
    # and 'sds', that change how things should be handled:
    #   http://pommedesang.com/efiction/viewstory.php?sid=1234
    #   http://pommedesang.com/sds/viewstory.php?sid=1234
    self.section = self.parsedUrl.path.split('/')[1]

    # Normalized story URL.
    section_root = 'http://' + self.getSiteDomain() + '/' + self.section
    self._setURL(section_root
                 + '/viewstory.php?sid='
                 + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'pmds')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%b %d, %Y" if 'efiction' in self.section else "%m/%d/%y"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the part of the third path component before any
    ``:``.  When that value contains ``storyInfo``, the id is taken from
    the query string instead.  The story URL is normalized to ``/s/<id>``.
    """
    BaseSiteAdapter.__init__(self, config, url)
    logger.debug("StoriesOnlineNetAdapter.__init__ - url='%s'" % url)

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    # Story id from the path, dropping anything after a ':'.
    third_component = self.parsedUrl.path.split('/')[2]
    self.story.setMetadata('storyId', third_component.split(':')[0])

    # storyInfo-style URLs carry the id in the query string instead.
    if 'storyInfo' in self.story.getMetadata('storyId'):
        query_value = self.parsedUrl.query.split('=')[1]
        self.story.setMetadata('storyId', query_value)

    # Normalized story URL.
    normalized = ''.join([
        'http://',
        self.getSiteDomain(),
        '/s/',
        self.story.getMetadata('storyId'),
    ])
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', self.getSiteAbbrev())

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%Y-%m-%d"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    ``maxChapter`` is the number captured by ``_REGEX_TRAILING_DIGIT`` in
    the URL, or 1 when that pattern does not match.  The story id is the
    ``storyId`` group of the site URL pattern.
    """
    BaseSiteAdapter.__init__(self, config, url)
    self.decode = ["utf8", "Windows-1252"]
    self.story.setMetadata('siteabbrev', 'bdsmgesch')

    # Possible chapter numbering at the end of the URL.
    trailing = _REGEX_TRAILING_DIGIT.search(url)
    self.maxChapter = 1 if trailing is None else int(trailing.group(1))

    # Story id from the site URL pattern.
    pattern_match = re.match(self.getSiteURLPattern(), url)
    self.story.setMetadata('storyId', pattern_match.group('storyId'))

    # Normalized URL.
    domain = self.getSiteDomain()
    self._setURL('http://%s/%s' % (domain, self.story.getMetadata('storyId')))

    self.dateformat = '%d. %m %Y - %H:%M'
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The ``id`` and ``tp`` groups of the site URL pattern are used to
    build the normalized ``<prefix>/<tp>/<id>/`` URL.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["utf8", "Windows-1252"]

    # The story id is taken only from the validated pattern match.
    # Indexing into the raw path before validation would raise
    # IndexError on a short path instead of InvalidStoryURL, and its
    # result would be overwritten below anyway.
    m = re.match(self.getSiteURLPattern(), url)
    if not m:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId', m.group('id'))

    # Normalized story URL.
    self._setURL(self.getURLPrefix() + '/' + m.group('tp') + '/'
                 + self.story.getMetadata('storyId') + '/')

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'fsb')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%b %d, %Y at %I:%M %p"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    Uses the ``tp`` (path type) and ``id`` groups captured by the site
    URL pattern to set the story id and the normalized story URL,
    ``<prefix>/<tp>/<id>/``.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["utf8", "Windows-1252"]

    # Validate first, then read the id from the match.  Reading a fixed
    # path index before validation could raise IndexError on a short
    # path, and the value would be replaced by the matched id anyway.
    match = re.match(self.getSiteURLPattern(), url)
    if match is None:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId', match.group('id'))

    # Normalized story URL.
    normalized = ''.join([
        self.getURLPrefix(),
        '/',
        match.group('tp'),
        '/',
        self.story.getMetadata('storyId'),
        '/',
    ])
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'fsb')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%b %d, %Y at %I:%M %p"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the ``sid`` query value.  URLs whose host contains
    ``explicit`` are normalized onto the ``explicit.`` sub-domain and
    use a different date format from the main site.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    # URL validation guarantees the query is only sid=1234.
    sid = self.parsedUrl.query.split('=')[1]
    self.story.setMetadata('storyId', sid)

    # Decide on the sub-domain from the URL as given, before the
    # normalized URL is set.
    on_explicit_host = "explicit" in self.parsedUrl.netloc
    scheme_and_prefix = 'http://explicit.' if on_explicit_host else 'http://'

    # Normalized story URL.
    self._setURL(scheme_and_prefix
                 + self.getSiteDomain()
                 + '/viewstory.php?sid='
                 + self.story.getMetadata('storyId'))
    self.dateformat = "%d/%b/%y" if on_explicit_host else "%d %b %Y"

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'pffa')
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The ``category``, ``author`` and ``id`` groups of the site URL
    pattern are used to build the normalized
    ``/<category>/<author>/<id>/`` URL.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8", "iso-8859-1"]
    self.is_adult = False

    match = re.match(self.getSiteURLPattern(), url)
    if not match:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId', match.group('id'))

    # Normalized story URL.
    pieces = [
        'https://',
        self.getSiteDomain(),
        '/',
        match.group('category'),
        '/',
        match.group('author'),
        '/',
        self.story.getMetadata('storyId'),
        '/',
    ]
    self._setURL(''.join(pieces))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'trekffnet')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%m/%d/%y"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the second entry returned by
    ``get_url_path_segments(url)``; it is substituted into
    ``STORY_URL_TEMPLATE`` to form the normalized story URL.
    """
    BaseSiteAdapter.__init__(self, config, url)

    path_segments = get_url_path_segments(url)
    identifier = path_segments[1]

    normalized_url = STORY_URL_TEMPLATE % identifier
    self._setURL(normalized_url)

    self.story.setMetadata('storyId', identifier)
    # The site domain doubles as the site abbreviation.
    self.story.setMetadata('siteabbrev', SITE_DOMAIN)
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the ``sid`` query value.  A first path segment of
    ``wiktt`` selects the Harry Potter section; anything else selects
    the Originals section.  Each section has its own URL prefix and date
    format.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    # URL validation guarantees the query is only sid=1234.
    sid = self.parsedUrl.query.split('=')[1]
    self.story.setMetadata('storyId', sid)

    is_wiktt = self.parsedUrl.path.split('/')[1] == 'wiktt'
    if is_wiktt:
        category, section, dateformat = (
            "Harry Potter", '/wiktt/efiction/', "%m/%d/%Y")
    else:
        category, section, dateformat = (
            "Originals", '/efiction/', "%b %d, %Y")
    self.story.addToList("category", category)
    self.section = section
    self.dateformat = dateformat

    # Normalized story URL.
    self._setURL('http://' + self.getSiteDomain() + self.section
                 + 'viewstory.php?sid='
                 + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'msq')
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The ``id`` group of the site URL pattern is used to build the
    normalized ``/fanfictions/index.php?act=vie&id=<id>`` URL, which is
    also stored as ``storyUrl`` without entity removal.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)
    self.setHeader()

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    match = re.match(self.getSiteURLPattern(), url)
    if not match:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId', match.group('id'))

    # Normalized story URL: gets rid of the chapter if there, leaving
    # the ch 1 URL on this site.
    normalized = ''.join([
        "http://",
        self.getSiteDomain(),
        "/fanfictions/index.php?act=vie&id=",
        self.story.getMetadata('storyId'),
    ])
    self._setURL(normalized)

    # _setURL mangles the ampersands needed in metadata['storyUrl'], so
    # the value is stored again with entity removal switched off.
    self.story.setMetadata('storyUrl', normalized, condremoveentities=False)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'bnfnet')
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The ``filestory`` and ``id`` groups of the site URL pattern are used
    to build the normalized ``/<filestory>.php?<filestory>=<id>`` URL.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""

    # Get story/file and storyId from the URL.
    match = re.match(self.getSiteURLPattern(), url)
    if not match:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId', match.group('id'))

    # Normalized story URL.
    pieces = [
        'http://',
        self.getSiteDomain(),
        '/',
        match.group('filestory'),
        '.php?',
        match.group('filestory'),
        '=',
        self.story.getMetadata('storyId'),
    ]
    self._setURL(''.join(pieces))

    self.story.setMetadata('siteabbrev', 'ressec')

    # Date format (e.g. 20 Nov 2005), see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d %b %Y"
def __init__(self, config, url):
    """Initialise the adapter from a thread or post URL.

    When the site URL pattern captures an ``anchorpost`` group, that
    post id becomes the story id and the URL is normalized to
    ``<prefix>/posts/<anchorpost>/``.  Otherwise the ``id`` group is the
    story id and the URL is normalized to ``<prefix>/<tp>/<title><id>/``,
    with an empty title when the ``title`` group did not capture.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    # Save for reader processing.
    self.reader = False
    self.post_cache = {}
    self.threadmarks_for_reader = {}

    BaseSiteAdapter.__init__(self, config, url)

    # The story id is taken only from the validated pattern match.
    # Indexing into the raw path before validation would raise
    # IndexError on a short path instead of InvalidStoryURL, and its
    # result would be overwritten below anyway.
    m = re.match(self.getSiteURLPattern(), url)
    if not m:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    anchor_post = m.group('anchorpost')
    if anchor_post:
        self.story.setMetadata('storyId', anchor_post)
        self._setURL(self.getURLPrefix() + '/posts/' + anchor_post + '/')
    else:
        self.story.setMetadata('storyId', m.group('id'))
        # Normalized story URL.
        title = m.group('title') or ""
        self._setURL(self.getURLPrefix() + '/' + m.group('tp') + '/'
                     + title + self.story.getMetadata('storyId') + '/')

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'fsb')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%b %d, %Y at %I:%M %p"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the first path segment and the zone is the host
    name with ``.fanficauthors.net`` removed; both go into the
    normalized ``http://<zone>.<base domain>/<id>/`` URL.
    """
    BaseSiteAdapter.__init__(self, config, url)
    logger.debug(
        "FanficAuthorsNetAdapter.__init__ - url='{0}'".format(url))

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["utf8", "Windows-1252", "iso-8859-1"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    # Story id from the path.
    first_segment = self.parsedUrl.path.split('/')[1]
    self.story.setMetadata('storyId', first_segment)

    # Setting the 'Zone' for each "Site".
    host = self.parsedUrl.netloc
    self.zone = host.replace('.fanficauthors.net', '')

    # Normalized story URL.
    normalized = 'http://{0}.{1}/{2}/'.format(
        self.zone,
        self.getBaseDomain(),
        self.story.getMetadata('storyId'))
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'ffa')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d %b %y"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The ``id`` group of the site URL pattern is used to build the
    normalized ``/historias/<id>`` URL.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    match = re.match(self.getSiteURLPattern(), url)
    if not match:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId', match.group('id'))

    # Normalized story URL: gets rid of the chapter if there, leaving
    # the chapter index URL.
    normalized = ''.join([
        "http://",
        self.getSiteDomain(),
        "/historias/",
        self.story.getMetadata('storyId'),
    ])
    self._setURL(normalized)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'potficscom')
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    Matches ``url`` against the site URL pattern, stores the captured
    ``id`` as the story id, and normalizes the story URL to
    ``http://<site domain>/historias/<id>``.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    url_match = re.match(self.getSiteURLPattern(), url)
    if url_match is None:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId', url_match.group('id'))

    # Normalized story URL: any chapter part is dropped, leaving the
    # chapter index URL.
    index_root = "http://" + self.getSiteDomain() + "/historias/"
    self._setURL(index_root + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'potficscom')
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The ``id`` group of the site URL pattern is used to build the
    normalized ``https://<site domain>/works/<id>`` URL.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    self.full_work_soup = None
    self.use_full_work_soup = True

    # The story id is taken only from the validated pattern match.
    # Indexing into the raw path before validation would raise
    # IndexError on a short path instead of InvalidStoryURL, and its
    # result would be overwritten below anyway.
    m = re.match(self.getSiteURLPattern(), url)
    if not m:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId', m.group('id'))

    # Normalized story URL.
    self._setURL('https://' + self.getSiteDomain() + '/works/'
                 + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'ao3')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%Y-%b-%d"
def __init__(self, config, url):
    """Initialise the adapter and normalize the story URL.

    The story id is the third path component with any ``-ch-NN`` chapter
    suffix removed.  The URL handed to ``_setURL`` has its ``?page=...``
    suffix stripped.
    """
    BaseSiteAdapter.__init__(self, config, url)
    logger.debug("LiteroticaComAdapter:__init__ - url='%s'" % url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["utf8", "Windows-1252"]

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'litero')

    # Normalize to first chapter.  Not sure if they ever have more than
    # 2 digits.
    storyId = self.parsedUrl.path.split('/')[2]
    # Replace later chapters with first chapter but don't remove numbers
    # from the URL that disambiguate stories with the same title.
    storyId = re.sub(r"-ch-?\d\d", "", storyId)
    self.story.setMetadata('storyId', storyId)

    # Accept m(mobile) url, but use www.  The replacement has to be a raw
    # string: a plain "\1" is the control character U+0001, not a
    # back-reference to the captured host prefix.
    # NOTE(review): the pattern is anchored with ^ on a host prefix, so it
    # can only match a url that does not start with a scheme -- confirm
    # what callers pass in.
    url = re.sub(
        r"^(www|german|spanish|french|dutch|italian|romanian|portuguese|other)\.i",
        r"\1",
        url)

    # Strip ?page=...
    url = re.sub(r"\?page=.*$", "", url)

    self._setURL(url)

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%m/%d/%y"
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    The story id is the run of digits after ``/fiction/`` in the URL
    path; anything after the id is ignored.  The story URL is normalized
    to ``http://<site domain>/fiction/<id>``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1; most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["utf8", "Windows-1252"]

    # If left empty, site doesn't return any message at all.
    self.username = "******"
    self.password = ""
    self.is_adult = False

    # The optional tail after the id is a non-capturing group, (?:...).
    # Written as "(:/.+)" it would demand a literal ':' after the id, so
    # a path such as /fiction/1234/some-title would not match at all.
    id_match = re.match(r'/fiction/(\d+)(?:/.*)?$', self.parsedUrl.path)
    self.story.setMetadata('storyId', id_match.group(1))

    # Normalized story URL.
    self._setURL('http://' + self.getSiteDomain() + '/fiction/'
                 + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'rylrdl')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    # NOTE(review): %H (24-hour) is combined with %p (AM/PM) -- confirm
    # against the dates the site actually emits.
    self.dateformat = '%d/%m/%Y %H:%M:%S %p'
def __init__(self, config, url):
    """Initialise the adapter from a story URL.

    Accepted URL styles, both resolved through the ``id`` group of the
    site URL pattern:
        https://inkbunny.net/submissionview.php?id=1342100  (old style)
        https://inkbunny.net/s/1234567                      (new style)
    The story URL is normalized to ``https://<site domain>/s/<id>``.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match
            ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # If left empty, site doesn't return any message at all.
    self.username = '******'
    self.password = ''
    self.is_adult = False

    url_match = re.match(self.getSiteURLPattern(), url)
    if url_match is None:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId', url_match.group('id'))

    # Normalized story URL: any chapter part is dropped, leaving the
    # chapter index URL.
    short_root = "https://" + self.getSiteDomain() + "/s/"
    self._setURL(short_root + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'ibnet')

    # Date format, see
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d %b %Y %H:%M"

    # This is a 1 story/page site, so the soup is kept on the instance
    # for the chapter-text step.
    self.soup = None
def __init__(self, config, url):
    """Set up the adapter and normalize the URL to ``/story/view/<id>``.

    The story id is taken from the ``id`` group of the pattern returned
    by ``getSiteURLPattern()``.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match the site
            URL pattern.
    """
    BaseSiteAdapter.__init__(self, config, url)

    self.username = ""
    self.password = ""
    self.is_adult = False

    # The story id comes from the validated pattern match only.  An
    # earlier unconditional ``path.split('/')[3]`` lookup was dropped: its
    # result was always overwritten below, and on short paths it raised
    # IndexError before InvalidStoryURL could be reported.
    m = re.match(self.getSiteURLPattern(), url)
    if m:
        self.story.setMetadata('storyId', m.group('id'))

        # normalized story URL.
        self._setURL('https://' + self.getSiteDomain() + '/story/view/' + self.story.getMetadata('storyId'))
    else:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'asnff')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%Y-%b-%d"
def __init__(self, config, url):
    """Set up the adapter for a ``/<section>/viewstory.php?sid=<id>`` URL.

    Stores the story id and the first path segment (``self.section``),
    normalizes the URL, and assigns the category from the section name.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.username = "******"  # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult = False

    # url validation guarantees query is only sid=1234
    query_parts = self.parsedUrl.query.split('=', )
    self.story.setMetadata('storyId', query_parts[1])

    path_parts = self.parsedUrl.path.split('/', )
    self.section = path_parts[1]

    # normalized story URL.
    self._setURL('http://' + self.getSiteDomain() + '/' + self.section
                 + '/viewstory.php?sid=' + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'sghp')

    # The site itself isn't likely to label stories with a category, so
    # we do.  Can't use extracategories, could be Atlantis or SG-1.
    category = "Stargate: Atlantis" if 'atlantis' in self.section else "Stargate: SG-1"
    self.story.addToList("category", category)

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%Y.%m.%d"
def __init__(self, config, url):
    """Set up the adapter, derive the story id and clean up the URL.

    The story id is the third ``/``-separated path segment with any
    ``-ch-NN`` / ``-chNN`` chapter suffix removed.  The URL has any
    ``?page=...`` suffix stripped before being stored.
    """
    BaseSiteAdapter.__init__(self, config, url)
    logger.debug("LiteroticaComAdapter:__init__ - url='%s'" % url)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'litero')

    # normalize to first chapter.  Not sure if they ever have more than 2 digits.
    storyId = self.parsedUrl.path.split('/', )[2]
    # replace later chapters with first chapter but don't remove numbers
    # from the URL that disambiguate stories with the same title.
    storyId = re.sub(r"-ch-?\d\d", "", storyId)
    self.story.setMetadata('storyId', storyId)

    ## accept m(mobile)url, but use www.
    # The replacement must be the raw string r"\1"; the non-raw "\1" is
    # the control character \x01, not a backreference to the subdomain.
    # NOTE(review): the pattern is anchored at the very start of `url`,
    # so it cannot match a URL that begins with a scheme ("http://...");
    # confirm whether the anchor should allow an optional scheme.
    url = re.sub(r"^(www|german|spanish|french|dutch|italian|romanian|portuguese|other)\.i",
                 r"\1",
                 url)

    ## strip ?page=...
    url = re.sub(r"\?page=.*$", "", url)

    ## set url
    self._setURL(url)

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%m/%d/%y"
def __init__(self, config, url):
    """Set up the adapter for a ``/fanfics/viewstory.php?sid=<id>`` URL.

    Stores the story id from the query string, normalizes the URL, and
    sets the site abbreviation and date format.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.username = "******"  # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult = False

    # url validation guarantees query is only sid=1234
    sid_value = self.parsedUrl.query.split("=")[1]
    self.story.setMetadata("storyId", sid_value)

    # normalized story URL.
    # XXX Most sites don't have the /fanfic part.  Replace all to remove it usually.
    normalized_url = ("http://" + self.getSiteDomain()
                      + "/fanfics/viewstory.php?sid="
                      + self.story.getMetadata("storyId"))
    self._setURL(normalized_url)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata("siteabbrev", "sjn")  # XXX

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%b %d, %Y"  # XXX
def __init__(self, config, url):
    """Set up the adapter for a ``viewstory.php?sid=<id>`` URL.

    When the URL's network location contains "explicit", the normalized
    URL uses the ``explicit.`` host prefix and a different date format.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.username = "******"  # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult = False

    # url validation guarantees query is only sid=1234
    self.story.setMetadata('storyId', self.parsedUrl.query.split('=', )[1])

    # Pick the host prefix and date format for the sub-site in use.
    if "explicit" in self.parsedUrl.netloc:
        url_prefix = 'http://explicit.'
        date_pattern = "%d/%b/%y"
    else:
        url_prefix = 'http://'
        date_pattern = "%d %b %Y"

    # normalized story URL.
    self._setURL(url_prefix + self.getSiteDomain()
                 + '/viewstory.php?sid=' + self.story.getMetadata('storyId'))
    self.dateformat = date_pattern

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'pffa')
def __init__(self, config, url):
    """Set up the adapter and normalize a thread or post URL.

    If the URL pattern's ``anchorpost`` group matched, that value becomes
    the story id and the URL is normalized to ``<prefix>/posts/<id>/``.
    Otherwise the ``id`` group is used and the URL is normalized to
    ``<prefix>/<tp>/<id>/``.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match the pattern
            returned by ``getSiteURLPattern()``.
    """
    # save for reader processing.
    self.reader = False
    self.post_cache = {}

    BaseSiteAdapter.__init__(self, config, url)

    # The story id comes from the validated pattern match only.  An
    # earlier unconditional ``path.split('/')[2]`` lookup was dropped: its
    # result was always overwritten below, and on short paths it raised
    # IndexError before InvalidStoryURL could be reported.
    m = re.match(self.getSiteURLPattern(), url)
    if m:
        if m.group('anchorpost'):
            self.story.setMetadata('storyId', m.group('anchorpost'))
            self._setURL(self.getURLPrefix() + '/posts/' + m.group('anchorpost') + '/')
        else:
            self.story.setMetadata('storyId', m.group('id'))
            # normalized story URL.
            self._setURL(self.getURLPrefix() + '/' + m.group('tp') + '/' + self.story.getMetadata('storyId') + '/')
    else:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'fsb')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%b %d, %Y at %I:%M %p"
def __init__(self, config, url):
    """Set up the adapter from a ``/<folder>/<authorId>/<storyId>.<ext>`` URL.

    Splits the URL path into folder, author id, story id and extension,
    stores them on the instance (they are used later), records the story
    and author ids as metadata, and rebuilds the normalized URL.

    Raises:
        IndexError: if the path has fewer than three segments or contains
            no '.'; URL validation is expected to prevent this.
    """
    BaseSiteAdapter.__init__(self, config, url)

    self.username = "******"  # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult = False

    # Getting the storyId from url - http://www.area52hkh.net/[Folder]/[AuthorID]/[STORYID].php
    # I'm setting these variables here, because I use them later.
    path_parts = self.parsedUrl.path.split('/', )
    self.folder = path_parts[1]
    self.authorId = path_parts[2]
    # Strip '.html' before '.htm': removing '.htm' first turns
    # 'story.html' into 'storyl' and corrupts the story id.
    self.storyId = path_parts[3].replace('.php', '').replace('.html', '').replace('.htm', '')
    self.extension = self.parsedUrl.path.split('.')[1]

    self.story.setMetadata('storyId', self.storyId)
    self.story.setMetadata('authorId', self.authorId)

    # normalized story URL.
    self._setURL('http://{0}/{1}/{2}/{3}.{4}'.format(
        self.getSiteDomain(),
        self.folder,
        self.story.getMetadata('authorId'),
        self.story.getMetadata('storyId'),
        self.extension))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'a52hkh')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%b %d, %Y"
def __init__(self, config, url):
    """Set up the adapter and normalize the URL to ``/archive/<cat>/<id>.html``.

    Raises:
        exceptions.InvalidStoryURL: if ``url`` does not match the pattern
            returned by ``getSiteURLPattern()``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    self.is_adult = False

    # Guard clause: reject anything the site pattern does not recognise.
    url_match = re.match(self.getSiteURLPattern(), url)
    if not url_match:
        raise exceptions.InvalidStoryURL(url,
                                         self.getSiteDomain(),
                                         self.getSiteExampleURLs())

    self.story.setMetadata('storyId', url_match.group('id'))

    # normalized story URL.
    normalized_url = ('http://' + self.getSiteDomain() + '/archive/'
                      + url_match.group('cat') + '/'
                      + self.story.getMetadata('storyId') + '.html')
    self._setURL(normalized_url)

    # Each adapter needs to have a unique abbreviation, which is set here.
    self.story.setMetadata('siteabbrev', 'bfa')

    # The date format will vary from site to site.  The page below lists
    # the directives that can be used to build the correct format:
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%m/%d/%y"

    # This site has the entire story on one page, so the fetched page is
    # kept here and the chapter-text step need not download it again.
    self.html = ''
def __init__(self, config, url):
    """Set up the adapter from a ``/<folder>/<authorId>/<storyId>.<ext>`` URL.

    Splits the URL path into folder, author id, story id and extension,
    stores them on the instance (they are used later), records the story
    and author ids as metadata, and rebuilds the normalized URL.

    Raises:
        IndexError: if the path has fewer than three segments or contains
            no '.'; URL validation is expected to prevent this.
    """
    BaseSiteAdapter.__init__(self, config, url)

    self.username = "******"  # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult = False

    # Getting the storyId from url - http://www.area52hkh.net/[Folder]/[AuthorID]/[STORYID].php
    # I'm setting these variables here, because I use them later.
    segments = self.parsedUrl.path.split('/', )
    self.folder = segments[1]
    self.authorId = segments[2]
    # '.html' has to be removed before '.htm'; in the opposite order
    # 'story.html' loses only '.htm' and is left as 'storyl'.
    self.storyId = segments[3].replace('.php', '').replace('.html', '').replace('.htm', '')
    self.extension = self.parsedUrl.path.split('.')[1]

    self.story.setMetadata('storyId', self.storyId)
    self.story.setMetadata('authorId', self.authorId)

    # normalized story URL.
    self._setURL('http://{0}/{1}/{2}/{3}.{4}'.format(self.getSiteDomain(),
                                                     self.folder,
                                                     self.story.getMetadata('authorId'),
                                                     self.story.getMetadata('storyId'),
                                                     self.extension))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'a52hkh')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%b %d, %Y"
def __init__(self, config, url):
    """Set up the adapter for one of the site's two eFiction sections.

    A first path segment of ``wiktt`` selects the "Harry Potter" category,
    the ``/wiktt/efiction/`` section and its date format; any other path
    selects "Originals" under ``/efiction/``.
    """
    BaseSiteAdapter.__init__(self, config, url)

    self.username = "******"  # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult = False

    # url validation guarantees query is only sid=1234
    self.story.setMetadata('storyId', self.parsedUrl.query.split('=', )[1])

    # Category, section path and date format all depend on the sub-site.
    first_segment = self.parsedUrl.path.split('/', )[1]
    if first_segment == 'wiktt':
        category = "Harry Potter"
        self.section = '/wiktt/efiction/'
        self.dateformat = "%m/%d/%Y"
    else:
        category = "Originals"
        self.section = '/efiction/'
        self.dateformat = "%b %d, %Y"
    self.story.addToList("category", category)

    # normalized story URL.
    self._setURL('http://' + self.getSiteDomain() + self.section
                 + 'viewstory.php?sid=' + self.story.getMetadata('storyId'))

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'msq')
def __init__(self, config, url):
    """Set up the adapter for a ``/viewstory.php?sid=<id>`` URL.

    Stores the story id from the query string, normalizes the URL, and
    sets the site abbreviation and date format.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["Windows-1252", "utf8"]
    self.username = "******"  # if left empty, site doesn't return any message at all.
    self.password = ""
    self.is_adult = False

    # url validation guarantees query is only sid=1234
    sid_value = self.parsedUrl.query.split('=', )[1]
    self.story.setMetadata('storyId', sid_value)

    # normalized story URL.
    normalized_url = ('http://' + self.getSiteDomain()
                      + '/viewstory.php?sid='
                      + self.story.getMetadata('storyId'))
    self._setURL(normalized_url)

    # Each adapter needs to have a unique site abbreviation.
    self.story.setMetadata('siteabbrev', 'scacf')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%m/%d/%Y"
def __init__(self, config, url):
    """Set up the adapter: abbreviation, date format, then URL normalization."""
    BaseSiteAdapter.__init__(self, config, url)

    site_abbreviation = 'tolkien'
    self.story.setMetadata('siteabbrev', site_abbreviation)

    # strptime pattern assigned for this site.
    self.dateformat = '%B %d, %Y'

    # URL handling is delegated to the adapter's own helper.
    self._normalizeURL(url)
def __init__(self, config, url):
    """Set up the test adapter from a ``?sid=<id>`` style URL.

    Stores the site abbreviation, a fixed string of awkward characters
    (``self.crazystring``) and the story id taken from the query string.
    """
    BaseSiteAdapter.__init__(self, config, url)

    self.story.setMetadata('siteabbrev', 'tst1')

    # Text containing bare ampersands, quotes, angle brackets and a
    # non-ASCII character.
    self.crazystring = u" crazy tests:[bare amp(&) quote(') amp(&) gt(>) lt(<) ATnT(AT&T) pound(£)]"

    # url validation guarantees query is only sid=1234
    query_parts = self.parsedUrl.query.split('=', )
    self.story.setMetadata('storyId', query_parts[1])

    self.username = ''
    self.is_adult = False
def __init__(self, config, url):
    """Initialise the base adapter, then configure and normalize for this site."""
    BaseSiteAdapter.__init__(self, config, url)

    abbrev = 'tolkien'
    self.story.setMetadata('siteabbrev', abbrev)

    # Date pattern assigned for this site (strftime/strptime syntax).
    self.dateformat = '%B %d, %Y'

    # The adapter's own helper takes care of the URL.
    self._normalizeURL(url)
def __init__(self, config, url):
    """Set up the test adapter from a ``?sid=<id>`` style URL.

    Stores the site abbreviation, a fixed string of awkward characters
    (``self.crazystring``) and the story id taken from the query string.
    """
    BaseSiteAdapter.__init__(self, config, url)

    self.story.setMetadata('siteabbrev', 'tst1')

    # Text containing bare ampersands, quotes, angle brackets and
    # non-ASCII characters.
    self.crazystring = u"tests:[bare amp(&) qt(') amp(&) gt(>) lt(<) ATnT(AT&T) L(£) Onna(女)]"

    # url validation guarantees query is only sid=1234
    sid_value = self.parsedUrl.query.split('=', )[1]
    self.story.setMetadata('storyId', sid_value)

    self.username = ''
    self.is_adult = False
def __init__(self, config, url):
    """Set up the adapter from a URL whose path contains ``story/<id>``.

    The story id is the path segment immediately after the first
    ``story`` segment.

    Raises:
        ValueError: if the path has no ``story`` segment.
        IndexError: if ``story`` is the last path segment.
    """
    BaseSiteAdapter.__init__(self, config, url)

    path_segments = self.parsedUrl.path.split('/')
    id_position = path_segments.index('story') + 1
    story_id = path_segments[id_position]

    self.story.setMetadata('storyId', story_id)
    # Normalized URL built from the class-level template.
    self._setURL(self.STORY_URL_TEMPLATE % story_id)
    self.story.setMetadata('siteabbrev', self.SITE_ABBREVIATION)
def __init__(self, config, url):
    """Set up the adapter from a URL carrying the story id as ``sid``.

    Raises:
        KeyError: if the query string has no ``sid`` parameter.
        ValueError: if the ``sid`` value is not an integer literal.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # The first ``sid`` value in the query string is the story id.
    sid_values = urlparse.parse_qs(self.parsedUrl.query)['sid']
    story_id = sid_values[0]

    self.story.setMetadata('storyId', story_id)
    # The URL template is filled with the id converted to int.
    numeric_id = int(story_id)
    self._setURL(self.VIEW_STORY_URL_TEMPLATE % numeric_id)
    self.story.setMetadata('siteabbrev', self.SITE_ABBREVIATION)
def __init__(self, config, url):
    """Set up the adapter from a URL carrying the story number as ``no``.

    Raises:
        KeyError: if the query string has no ``no`` parameter.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # The first ``no`` value in the query string is the story id.
    no_values = urlparse.parse_qs(self.parsedUrl.query)['no']
    story_no = no_values[0]

    self.story.setMetadata('storyId', story_no)
    # Normalized URL built from the class-level template.
    self._setURL(self.READ_URL_TEMPLATE % story_no)
    self.story.setMetadata('siteabbrev', self.SITE_ABBREVIATION)
def __init__(self, config, url):
    """Set up the adapter; the story id is the third ``/``-split path item."""
    BaseSiteAdapter.__init__(self, config, url)

    self.story.setMetadata('siteabbrev', 'fw')

    # url validation guarantees second part is storyId
    path_parts = self.parsedUrl.path.split('/', )
    self.story.setMetadata('storyId', path_parts[2])

    self.username = "******"
    self.password = ""
def __init__(self, config, url):
    """Set up the adapter and normalize the URL to ``/story/<id>/``.

    The story id is the third item of the ``/``-split URL path.
    """
    BaseSiteAdapter.__init__(self, config, url)

    self.story.setMetadata('siteabbrev', 'fimficnet')

    path_parts = self.parsedUrl.path.split('/', )
    self.story.setMetadata('storyId', path_parts[2])

    # Normalized story URL.
    normalized_url = ("http://" + self.getSiteDomain() + "/story/"
                      + self.story.getMetadata('storyId') + "/")
    self._setURL(normalized_url)

    self.is_adult = False

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = "%d %b %Y"