Example #1
def v0_fic_all(urlId: str) -> Any:
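    """Return a JSONable view of a fic plus all of its chapters.

    Looks the fic up by urlId, then decompresses, decodes, and cleans the
    stored content of every chapter; a chapter whose content fails to decode
    is logged and omitted, and missing fics/chapters return an Err payload.
    """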
    fics = Fic.select({'urlId': urlId})
    if len(fics) != 1:
        return Err.urlId_not_found.get()
    fic = fics[0]
    if fic.chapterCount is None:
        print(f'err: fic has no chapter count: {fic.id}')
        return Err.urlId_not_found.get()
    ficChapters = {
        fc.chapterId: fc
        for fc in FicChapter.select({'ficId': fic.id})
    }
    chapters = {}
    for cid in range(1, fic.chapterCount + 1):
        if cid not in ficChapters:
            return Err.cid_not_found.get({'arg': f'{fic.id}/{cid}'})
        chapter = ficChapters[cid]
        cres = chapter.toJSONable()
        try:
            content = cres['content']
            if content is not None:
                content = util.decompress(content)
                content = scrape.decodeRequest(content, f'{fic.id}/{cid}')
                content = cleanHtml(content)
                if content != cleanHtml(content):
                    print(
                        f'v0_fic_all: {fic.id}/{cid} did not round-trip through cleanHtml'
                    )
            cres['content'] = content
            chapters[cid] = cres
        except Exception as e:
            print(f'v0_fic_all: failed to decode chapter {fic.id}/{cid}: {e!r}')

    res = fic.toJSONable()
    return Err.ok({'info': res, 'chapters': chapters})
Example #2
	def softScrape(self, chapter: FicChapter) -> str:
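		"""Fetch a chapter page, preferring a previously scraped copy.

		If the cached page looks like a 'chapter not found' response and the
		last scrape is more than half an hour old, the canonical URL is
		re-scraped once before giving up.
		"""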
		fic = chapter.getFic()

		curl = self.constructUrl(fic.localId, chapter.chapterId, None)
		#util.logMessage(f'FictionPressAdapter.scrape: {curl}')
		url = scrape.getLastUrlLike(curl)
		if url is None:
			url = curl

		raw = skitter.softScrape(url)['raw']
		if raw is None:
			raise Exception('unable to scrape? FIXME')
		data = str(raw)

		if (
			data.lower().find('chapter not found.') != -1
			and data.lower().find("id='storytext'") == -1
		):
			ts = scrape.getMostRecentScrapeTime(url)
			if ts is None:
				raise Exception('no most recent scrape time? FIXME')
			# if we last scraped more than half an hour ago, rescrape
			if int(time.time()) - ts > (60 * 30):
				url = self.constructUrl(fic.localId, chapter.chapterId, None)
				data = self.scrape(url)['raw']
		if data is None:
			raise Exception('unable to scrape? FIXME')

		if (
			data.lower().find('chapter not found.') != -1
			and data.lower().find("id='storytext'") == -1
		):
			raise Exception('unable to find chapter content {}'.format(url))

		return data
Example #3
def importFic(fdata):
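	"""Recreate a Fic, its fandoms/characters/genres/tags, and its chapters
	from a dict in the shape produced by dumpDB, loading chapter HTML from
	./content/<type>/<localId>/<cid>/content.html when present."""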
	global ficImportRename
	ofic = inflateObject(fdata.copy(), ficImportRename)

	fic = Fic.new()
	for field in ofic:
		print('setting "{}" to "{}"'.format(field, ofic[field]))
		fic.__dict__[field] = ofic[field]

	fic.published = util.parseDateAsUnix(fic.published, int(time.time()))
	fic.updated = util.parseDateAsUnix(fic.updated, int(time.time()))
	print('setting "{}" to "{}"'.format('published', fic.published))
	print('setting "{}" to "{}"'.format('updated', fic.updated))

	print('adding "{}" ({}/{})'.format(fic.title, fic.type, fic.localId))

	fic.insert()

	for fandom in fdata['fandoms']:
		print('  adding fandom "{}"'.format(fandom))
		fic.add(Fandom.define(fandom))
	for character in fdata['characters']:
		print(
			'  adding character "{}" from fandom "{}"'.format(
				character['name'], character['fandom']
			)
		)
		fic.add(
			Character.define(Fandom.define(character['fandom']), character['name'])
		)
	for genre in fdata['genres']:
		print('  adding genre "{}"'.format(genre))
		fic.add(Genre.define(genre))
	for tag in fdata['tags']:
		print('  adding tag "{}"'.format(tag))
		fic.add(Tag.define(tag))

	cids = [int(cid) for cid in fdata['chapters']]
	cids.sort()
	for cid in cids:
		print('  adding chapter {}'.format(cid))
		ochap = fdata['chapters'][str(cid)]
		chapter = FicChapter.new()
		chapter.fic = fic
		chapter.ficId = fic.id
		chapter.chapterId = cid
		for field in ochap:
			chapter.__dict__[field] = ochap[field]
		contentPath = './content/{}/{}/{}/content.html'.format(
			fic.type, fic.localId, cid
		)
		if os.path.isfile(contentPath):
			html = None
			with open(contentPath, 'r') as f:
				html = f.read()
			print('    has content: {}'.format(len(html)))
			chapter.setHtml(html)
		chapter.insert()
Example #4
	def softScrape(self, chapter: FicChapter) -> str:
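		"""Fetch a chapter page, first backfilling chapter.url and
		localChapterId if they are missing, then following the same
		cached-scrape and stale-rescrape logic as the FictionPress adapter
		above."""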
		if chapter.url is None:
			chapter.url = self.buildUrl(chapter)  # type: ignore
			chapter.localChapterId = str(chapter.chapterId)
			chapter.upsert()
		fic = chapter.getFic()

		# TODO should we be passing '%' instead of chapter.fic.title ?
		#url = scrape.getLastUrlLikeOrDefault(
		#		(self.constructUrl(fic.localId, chapter.chapterId, None),
		#		self.constructUrl(fic.localId, chapter.chapterId, fic.title)))
		curl = self.constructUrl(fic.localId, chapter.chapterId, None)
		#util.logMessage(f'FFNAdapter.scrape: {curl}')
		url = scrape.getLastUrlLike(curl)
		if url is None:
			url = curl

		raw = skitter.softScrape(url)['raw']
		if raw is None:
			raise Exception('unable to scrape? FIXME')
		data = str(raw)

		if (
			data.lower().find('chapter not found.') != -1
			and data.lower().find("id='storytext'") == -1
		):
			ts = scrape.getMostRecentScrapeTime(url)
			if ts is None:
				raise Exception('no most recent scrape time? FIXME')
			# if we last scraped more than half an hour ago, rescrape
			if int(time.time()) - ts > (60 * 30):
				url = self.constructUrl(fic.localId, chapter.chapterId, None)
				data = self.scrape(url)['raw']
		if data is None:
			raise Exception('unable to scrape? FIXME')

		if (
			data.lower().find('chapter not found.') != -1
			and data.lower().find("id='storytext'") == -1
		):
			raise Exception('unable to find chapter content {}'.format(url))

		return data
Example #5
    def softScrape(self, chapter: FicChapter) -> Optional[str]:
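        """Scrape chapter.url, transparently following 'You are being
        redirected' pages: extract the window.location target, persist it as
        the new chapter.url, and recurse; a redirect back to the same URL
        raises instead of looping."""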
        import scrape
        html = scrape.softScrape(chapter.url)
        if html is None:
            return html
        # TODO well this is a nightmare...
        if html.find('You are being redirected') < 0:
            return html

        import re
        match = re.search(r"window\.location = ['\"]([^'\"]*)['\"];", html)
        if match is None or match.group(1) is None:
            return html

        if chapter.url == match.group(1):
            raise Exception('redirect loop')

        chapter.url = match.group(1)
        chapter.upsert()
        return self.softScrape(chapter)
Example #6
	def tryParseUrl(self, url: str) -> Optional[FicId]:
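		"""Resolve a URL against chapters and fics already stored locally;
		anything unknown falls through to NotImplementedError (presumably for
		site-specific adapters to handle)."""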
		# by default, we simply try to look up the url in existing chapters or fics
		chaps = FicChapter.select({'url': url})
		if len(chaps) == 1:
			fic = Fic.get((chaps[0].ficId, ))
			if fic is not None:
				return FicId(
					FicType(fic.sourceId), fic.localId, chaps[0].chapterId, False
				)

		fics = Fic.select({'url': url})
		if len(fics) == 1:
			return FicId(FicType(fics[0].sourceId), fics[0].localId)

		raise NotImplementedError()
Example #7
	def tryParseUrl(self, url: str) -> Optional[FicId]:
		if not url.startswith(self.baseUrl):
			return None

		# by default, we simply try to look up the url in existing chapters or fics
		chaps = FicChapter.select({'url': url})
		if len(chaps) == 1:
			fic = Fic.get((chaps[0].ficId, ))
			if fic is not None:
				ftype = FicType(fic.sourceId)
				return FicId(ftype, fic.localId, chaps[0].chapterId, False)

		fics = Fic.select({'url': url})
		if len(fics) == 1:
			ftype = FicType(fics[0].sourceId)
			return FicId(ftype, fics[0].localId)

		leftover = url[len(self.baseUrl):]
		if not leftover.endswith('.html'):
			return None

		ps = leftover.split('/')
		if len(ps) != 3 or ps[0] != 'authors':
			return None

		author = ps[1]
		storyId = ps[2]
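		# strip a trailing chapter-page suffix ('01a.html', presumably the
		# first chapter page) or a bare '.html' to get the story's base name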
		suffixes = ['01a.html', '.html']
		for suffix in suffixes:
			if storyId.endswith(suffix):
				storyId = storyId[:-len(suffix)]

		# note: seems to be safe to lowercase these
		lid = (author + '/' + storyId).lower()
		#print(lid)
		# make lid author/story ?

		# TODO: we need some sort of local lid mapping...
		raise NotImplementedError()
Example #8
def dumpDB():
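	"""Serialize every fandom, character, genre, tag, fic, and chapter into a
	plain dict (the counterpart of importFic), writing chapter content out to
	./content/<type>/<localId>/<chapterId>/content.html for chapters that
	have raw data."""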
	data = {}

	fandomMap = {f.id: f for f in Fandom.select()}
	characterMap = {c.id: c for c in Character.select()}
	genreMap = {g.id: g for g in Genre.select()}
	tagMap = {t.id: t for t in Tag.select()}

	data['fandoms'] = [fandomMap[k].name for k in fandomMap]
	data['characters'] = [
		{
			'name': characterMap[k].name,
			'fandom': fandomMap[characterMap[k].fandom_id].name
		} for k in characterMap
	]
	data['genres'] = [genreMap[k].name for k in genreMap]
	data['tags'] = [tagMap[k].name for k in tagMap]

	data['fics'] = {}

	frename = {'id': None, 'chapters': 'chapterCount'}
	crename = {
		'id': None,
		'ficId': None,
		'cid': None,
		'raw': None,
		'fic': None,
		'lastLine': None
	}
	cdefaults = {
		'line': 0,
		'subLine': 0,
		'notes': None,
		'status': Status.ongoing,
		'fetched': None,
		'url': None
	}

	fics = Fic.select()
	for fic in fics:
		k = '{}/{}'.format(fic.type, fic.localId)
		o = fic.__dict__.copy()
		o = deflateObject(o, frename)

		o['fandoms'] = [f.name for f in fic.fandoms()]
		o['characters'] = [
			{
				'name': c.name,
				'fandom': fandomMap[c.fandom_id].name
			} for c in fic.characters()
		]
		o['tags'] = [t.name for t in fic.tags()]
		o['genres'] = [g.name for g in fic.genres()]

		co = {}
		ficChapters = FicChapter.select({'ficId': fic.id})
		for chapter in ficChapters:
			here = chapter.__dict__.copy()
			ffNetUrl = 'https://www.fanfiction.net/s/{}/{}/{}'.format(
				fic.localId, chapter.chapterId, util.urlTitle(fic.title)
			)
			cdefaults['url'] = ffNetUrl
			cdefaults['lastModified'] = here['fetched']
			here = deflateObject(here, crename, cdefaults)

			co[chapter.chapterId] = here
			if chapter.raw is None:
				continue

			contentPath = './content/{}/{}/{}/'.format(
				fic.type, fic.localId, chapter.chapterId
			)
			if not os.path.isdir(contentPath):
				os.makedirs(contentPath)
			with open(contentPath + 'content.html', 'w') as f:
				f.write(chapter.content())

		o['chapters'] = co

		data['fics'][k] = o

	return data
Example #9
	def tryParseUrl(self, url: str) -> Optional[FicId]:
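		"""Normalize an AO3-style URL (scheme/www variants, trailing fragments
		and query strings), resolve bare /chapters/ links to the full work,
		and return a FicId, filling in the chapter when it is already known
		locally."""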
		mapPrefixes = ['http://www.', 'http://', 'https://www.']
		hasPrefix = True
		while hasPrefix:
			hasPrefix = False
			for pref in mapPrefixes:
				if url.startswith(pref):
					hasPrefix = True
					url = 'https://' + url[len(pref):]

		endsToStrip = [
			'#main',
			'#work_endnotes',
			'#bookmark-form',
			'?view_adult=true',
			'?view_full_work=true',
			'?viewfullwork=true',
			'?show_comments=true',
		]
		for send in endsToStrip:
			if url.endswith(send):
				url = url[:-len(send)]
		if url.find('#') >= 0:
			url = url[:url.find('#')]
		if url.find('?') >= 0:
			url = url[:url.find('?')]

		# TODO: this should probably return a FicId pointing to this chapter and
		# not just this fic in general...
		if url.find('/chapters/') >= 0 and url.find('/works/') < 0:
			meta = scrape.softScrapeWithMeta(url, delay=10)
			if meta is None or meta['raw'] is None or meta['status'] != 200:
				raise Exception('unable to lookup chapter: {}'.format(url))
			from bs4 import BeautifulSoup  # type: ignore
			soup = BeautifulSoup(meta['raw'], 'html5lib')
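			# for/else: the else branch only runs when no 'Entire Work' link is found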
			for a in soup.find_all('a'):
				if a.get_text() == 'Entire Work':
					return self.tryParseUrl(self.baseUrl + a.get('href')[len('/works/'):])
			else:
				raise Exception('unable to lookup chapters entire work: {}'.format(url))

		if url.startswith(self.collectionUrl) and url.find('/works/') != -1:
			url = self.baseUrl + url[url.find('/works/') + len('/works/'):]
		if not url.startswith(self.baseUrl):
			return None

		pieces = url[len(self.baseUrl):].split('/')
		lid = pieces[0]
		if len(lid) < 1 or not lid.isnumeric():
			return None

		ficId = FicId(FicType.ao3, lid)
		fic = Fic.tryLoad(ficId)
		if fic is None:
			return ficId

		if len(pieces) >= 3 and pieces[1] == 'chapters' and pieces[2].isnumeric():
			localChapterId = pieces[2]
			mchaps = FicChapter.select(
				{
					'ficId': fic.id,
					'localChapterId': localChapterId
				}
			)
			if len(mchaps) == 1:
				ficId.chapterId = mchaps[0].chapterId
				ficId.ambiguous = False

		return ficId
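
# A minimal standalone sketch of the URL normalization performed above
# (illustration only: the work id is made up, and the real method additionally
# resolves /chapters/ pages and already-known fics):
def _normalizeAo3UrlSketch(url: str) -> str:
	# collapse http/https and www. variants onto a single https:// form
	for pref in ['http://www.', 'http://', 'https://www.']:
		if url.startswith(pref):
			url = 'https://' + url[len(pref):]
			break
	# drop any fragment or query string
	for sep in ['#', '?']:
		if url.find(sep) >= 0:
			url = url[:url.find(sep)]
	return url

assert _normalizeAo3UrlSketch(
	'http://www.archiveofourown.org/works/12345?view_adult=true#main'
) == 'https://archiveofourown.org/works/12345'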
Example #10
    def buildUrl(self, chapter: FicChapter) -> str:
        if chapter.url is not None and len(chapter.url.strip()) > 0:
            return chapter.url
        return self.constructUrl(chapter.getFic().localId, chapter.chapterId)
Example #11
    def __init__(self,
                 chapter: FicChapter,
                 header: bool = True,
                 markdown: bool = True,
                 footer: bool = True):
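        """Build the display text for one chapter: an optional header (title,
        author, word count, description), the chapter content, and an optional
        footer, then pre-compute cumulative lengths and wrap to 80 columns."""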
        self.chapter = chapter

        fic = chapter.getFic()
        extraTitle = ''.join([s[0] for s in (fic.title or '').split()]).lower()
        # FIXME this handles symbols badly like "Foo (complete)"
        #util.logMessage(f"using {extraTitle} as extraTitle in ChapterView")

        content = self.chapter.cachedContent()
        if content is None:
            raise Exception('missing content? FIXME')
        contentView = HtmlView(content,
                               markdown=markdown,
                               extraTitles=[extraTitle, f"-{extraTitle}-"])
        self.totalWords = sum(len(line.split()) for line in contentView.text)

        self.text: List[Union[str, List[str]]] = []

        if header:
            descriptionView = HtmlView(fic.description
                                       or '{missing description}')
            self.text += [['', '"{}"'.format(fic.title), ''],
                          ['', 'by {}'.format(fic.getAuthorName()), ''],
                          [
                              'chapter {}'.format(chapter.chapterId),
                              'words: {}'.format(
                                  util.formatNumber(self.totalWords))
                          ]]
            if chapter.chapterId >= 1:
                self.text += descriptionView.text
            if chapter.title is not None and len(chapter.title) > 0:
                self.text += [[
                    '', 'Chapter {}: {}'.format(chapter.chapterId,
                                                chapter.title), ''
                ]]
            if len(contentView.text) > 0 and contentView.text[0] != '<hr />':
                self.text += ['<hr />']

        self.headerLength = len(self.text)

        self.text += contentView.text
        if footer:
            if len(contentView.text) > 0 and contentView.text[-1] != '<hr />':
                self.text += ['<hr />']
            if chapter.title is not None and len(chapter.title) > 0:
                self.text += [[
                    '', 'Chapter {}: {}'.format(chapter.chapterId,
                                                chapter.title), ''
                ]]
            self.text += [
                [
                    'chapter {}'.format(chapter.chapterId),
                    'words: {}'.format(util.formatNumber(self.totalWords))
                ],
                [
                    '"{}"'.format(fic.title),
                    'by {}'.format(fic.getAuthorName())
                ],
            ]

        self.cumulativeLength = [0] * len(self.text)
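        # cumulativeLength[idx] holds the combined length of all entries before idx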
        cumLen = 0
        for idx in range(len(self.text)):
            self.cumulativeLength[idx] = cumLen
            cumLen += len(self.text[idx])

        self.totalLength = cumLen

        self.preWrap = ' ' * 1
        self.postWrap = ''

        self.wtext: Dict[int, List[str]] = {}
        self.width: int = -1

        self.totalWrappedLines: int = -1
        self.cumulativeTotalWrappedLines: List[int] = []
        self.wrap(80)