def mangapanda(url, download_chapters):
    """Scrape a mangapanda.com series page for metadata and chapter info.

    url -- series page URL.
    download_chapters -- chapter numbers to fetch; when falsy, every
        chapter numbered above the module-global `last` is taken.

    Side effects: fetches pages via get_html() and passes the collected
    chapter list to function_name().
    """
    html = get_html(url)
    global last
    # Series title: <h1> text minus the trailing ' Manga' suffix.
    series = title(re.search('<h1.*?>\\s*(.*?)\\s*</h1>', html, re.DOTALL|re.MULTILINE).group(1)).rpartition(' Manga')[0]
    status = re.search('<td.*?>Status:</td>\\s*<td>\\s*(.*?)\\s*</td>', html.replace('\n', '')).group(1)
    author = re.search('<td.*?>\\s*Authors?\\s*:?\\s*</td>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1)
    tags = re.findall('<a.*?>\\s*<span.*?>\\s*([A-Za-z]*?)\\s*</span>\\s*</a>', re.search('<td.*?>\\s*Genres?\\s*:?\\s*</td>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1))
    # Normalize genre names through the module-level tag_dict mapping.
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    for j in re.findall('<tr>\\s*<td>\\s*<div.*?</div>(.*?)</tr>', html, re.DOTALL|re.MULTILINE):
        match = re.search('<a.*?([\\d.,-]+)</a>(\\s*:\\s*)(.*?)\\s*</td>', j)
        # Keep only the first '.' so numbers like '10.5.1' still parse as float.
        num = float(''.join(match.group(1).rsplit('.', match.group(1).count('.')-1)))
        name = match.group(3)
        link = 'http://www.mangapanda.com' + re.search('<a\\s*href=\"(/.*?)\">', j).group(1)
        # Site shows MM/DD/YYYY; store as ISO YYYY-MM-DD.
        date = re.search('<td>(\\d{2})/(\\d{2})/(\\d{4})</td>', j)
        date = '{:04}-{:02}-{:02}'.format(int(date.group(3)), int(date.group(1)), int(date.group(2)))
        if name:
            name = '{} - {} : {}'.format(series, '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            logger.info(' Gathering info: \"{}\"'.format(name))
            chap_html = get_html(link)
            # Page list comes from the reader's page-select dropdown.
            links = ['http://www.mangareader.net' + i for i in re.findall('<option value=\"(.*?)\".*?>\\d+</option>', chap_html)]
            chapters.append({'name':name, 'links':links, 'backup_links':links, 'date':date, 'pages':len(links), 'num':num})
    if chapters:
        function_name(chapters, series, tags, author, status)
def mungeQA(html, type, fields, model, data, col):
    """Return *html* with every LaTeX fragment replaced by an image link.

    See the _imgLink docstring for the rules governing LaTeX media.

    Arguments:
    html   -- text to scan for LaTeX markup.
    type   -- unused; "q" or "a" for question/answer.
    fields -- unused; dictionary with Tags, Type (model name), Deck,
              Subdeck (part after last ::), card template name, etc.
    model  -- the note's model; governs header/footer and image format.
    data   -- unused; [cid, nid, mid, did, ord, tags, flds].
    col    -- current collection; supplies the media folder.
    """
    # Process the three markup families in the same order as before:
    # plain [latex] tags, then $expr$, then display math.
    for key, prefix, suffix in (
        ('standard', '', ''),
        ('expression', '$', '$'),
        ('math', '\\begin{displaymath}', '\\end{displaymath}'),
    ):
        for found in regexps[key].finditer(html):
            latex = prefix + found.group(1) + suffix
            html = html.replace(found.group(), _imgLink(col, latex, model))
    return html
def insert_line_numbers_in_html(html, line_no_from):
    """Insert a line-number gutter into the highlighted HTML code.

    Parameters
    ---------
    html : :class:`str`
        html string of highlighted code.
    line_no_from : :class:`int`
        Defines the first line's number in the line count.

    Returns
    -------
    :class:`str`
        The generated html string with line numbers, or *html* unchanged
        when it contains no ``<pre>`` block.
    """
    found = re.search("(<pre[^>]*>)(.*)(</pre>)", html, re.DOTALL)
    if found is None:
        return html
    opening, body, closing = found.group(1), found.group(2), found.group(3)
    # Close the numbered-table layout where the code block ends.
    html = html.replace(closing, "</pre></td></tr></table>")
    line_numbers = range(line_no_from, line_no_from + body.count("\n") + 1)
    # Right-align numbers to the width of the largest one.
    width = len(str(line_numbers[-1]))
    gutter = "\n".join(("%" + str(width) + "i") % n for n in line_numbers)
    return html.replace(
        opening,
        "<table><tr><td>" + opening + gutter + "</pre></td><td>" + opening)
def mungeQA(
    html: str,
    type: Optional[str],
    fields: Optional[Dict[str, str]],
    model: NoteType,
    data: Optional[List[Union[int, str]]],
    col,
) -> Any:
    """Replace embedded LaTeX tags in *html* with image links."""
    def swap(text, pattern_key, wrap):
        # Replace each occurrence of the given LaTeX family with its image link.
        for found in regexps[pattern_key].finditer(text):
            text = text.replace(
                found.group(), _imgLink(col, wrap(found.group(1)), model))
        return text

    html = swap(html, "standard", lambda latex: latex)
    html = swap(html, "expression", lambda latex: "$" + latex + "$")
    html = swap(
        html, "math",
        lambda latex: "\\begin{displaymath}" + latex + "\\end{displaymath}")
    return html
def trim(html, prefix_url=None):
    """Clean up raw HTML text.

    - Remove HTML comments, and javascript (except for 100.daum.net pages).
    - Normalize \r\n to \n.
    - Convert <br>/<p> tag variants to \n.
    - Collapse runs of blanks/tabs/newlines into a single blank.

    :param html: raw HTML string.
    :param prefix_url: base URL used to make relative links absolute.
    :return: cleaned, stripped text.
    """
    html = html.replace('\r\n', '\n')
    # Map line-breaking tags (both cases, self-closing variants) to newlines.
    convert_dic = {'<br>': '\n', '<br/>': '\n', '<br />': '\n',
                   '<p>': '\n', '<p/>': '\n', '<p />': '\n',
                   '<BR>': '\n', '<BR/>': '\n', '<BR />': '\n',
                   '<P>': '\n', '<P/>': '\n', '<P />': '\n'}
    for _from, _to in convert_dic.items():
        html = html.replace(_from, _to)
    html = HtmlUtil.remove_comments_in_html(html)  # remove html comments.
    doc = lxml.html.document_fromstring(html)  # convert to html element.
    if prefix_url:
        doc.make_links_absolute(prefix_url)  # convert links to absolute links.
    if prefix_url:
        if '100.daum.net' not in prefix_url:
            # Removing javascript hides parts of some pages (100.daum.net),
            # so skip the removal for that host.
            doc = HtmlUtil.remove_javascripts_in_doc(doc)  # remove javascript elements.
    else:
        doc = HtmlUtil.remove_javascripts_in_doc(doc)  # remove javascript elements.
    html = lxml.html.tostring(doc, encoding='utf8', include_meta_content_type=True)  # convert to html string.
    html = html.decode('utf8')  # bytes -> string
    html = StringUtil.merge(html)  # replace multiple blanks with one blank.
    return html.strip()
def mungeQA(
    html: str,
    type: str,
    fields: Dict[str, str],
    model: NoteType,
    data: QAData,
    col: anki.storage._Collection,
) -> str:
    """Turn embedded LaTeX tags in *html* into links to rendered images."""
    # Each family of markup wraps the captured LaTeX in a different template.
    for key, template in (
        ("standard", "{}"),
        ("expression", "${}$"),
        ("math", "\\begin{{displaymath}}{}\\end{{displaymath}}"),
    ):
        for hit in regexps[key].finditer(html):
            rendered = _imgLink(col, template.format(hit.group(1)), model)
            html = html.replace(hit.group(), rendered)
    return html
def get_page_from_html(cls, html: str) -> _Element:
    """Unwrap decorative span tags (firstLetter / gathaQuote / bold) from
    *html*, leaving just their inner text, then parse the result into an
    element tree (with ``<br>`` normalized to ``<br/>``)."""
    # (pattern, index where the inner text starts); every span ends with
    # the 7-character '</span>' suffix.
    unwrap_specs = (
        ('<span class="firstLetter">.<\/span>', 26),
        ('<span class="gathaQuote">.+?(?=>)>', 25),
        ('<span class="bold">.+?(?=>)>', 19),
    )
    for pattern, start in unwrap_specs:
        for span in re.findall(pattern, html):
            html = html.replace(span, span[start:-7], 1)
    return fromstring(html.replace("<br>", "<br/>"))
def mangapanda(url, download_chapters):
    """Scrape a mangapanda.com series page for metadata and chapter info.

    url -- series page URL.
    download_chapters -- chapter numbers to fetch; when falsy, every
        chapter numbered above the module-global `last` is taken.

    Side effects: fetches pages via get_html() and passes the collected
    chapter list to function_name().
    """
    html = get_html(url)
    global last
    series = title(
        re.search('<h1.*?>\\s*(.*?)\\s*</h1>', html,
                  re.DOTALL | re.MULTILINE).group(1)).rpartition(' Manga')[0]
    status = re.search('<td.*?>Status:</td>\\s*<td>\\s*(.*?)\\s*</td>',
                       html.replace('\n', '')).group(1)
    author = re.search('<td.*?>\\s*Authors?\\s*:?\\s*</td>\\s*<td>(.*?)</td>',
                       html.replace('\n', '')).group(1)
    tags = re.findall(
        '<a.*?>\\s*<span.*?>\\s*([A-Za-z]*?)\\s*</span>\\s*</a>',
        re.search('<td.*?>\\s*Genres?\\s*:?\\s*</td>\\s*<td>(.*?)</td>',
                  html.replace('\n', '')).group(1))
    # Normalize genre names through the module-level tag_dict mapping.
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    for j in re.findall('<tr>\\s*<td>\\s*<div.*?</div>(.*?)</tr>', html,
                        re.DOTALL | re.MULTILINE):
        match = re.search('<a.*?([\\d.,-]+)</a>(\\s*:\\s*)(.*?)\\s*</td>', j)
        # BUG FIX: float(match.group(1)) crashed on chapter numbers with
        # more than one dot (e.g. '10.5.1'); keep only the first '.' the
        # same way the other scrapers in this module do.
        raw_num = match.group(1)
        num = float(''.join(raw_num.rsplit('.', raw_num.count('.') - 1)))
        name = match.group(3)
        link = 'http://www.mangapanda.com' + re.search(
            '<a\\s*href=\"(/.*?)\">', j).group(1)
        # Site shows MM/DD/YYYY; store as ISO YYYY-MM-DD.
        date = re.search('<td>(\\d{2})/(\\d{2})/(\\d{4})</td>', j)
        date = '{:04}-{:02}-{:02}'.format(int(date.group(3)),
                                          int(date.group(1)),
                                          int(date.group(2)))
        if name:
            name = '{} - {} : {}'.format(series,
                                         '{:3.1f}'.format(num).zfill(5),
                                         name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            logger.info(' Gathering info: \"{}\"'.format(name))
            chap_html = get_html(link)
            # Page list comes from the reader's page-select dropdown.
            links = [
                'http://www.mangareader.net' + i for i in re.findall(
                    '<option value=\"(.*?)\".*?>\\d+</option>', chap_html)
            ]
            chapters.append({
                'name': name,
                'links': links,
                'backup_links': links,
                'date': date,
                'pages': len(links),
                'num': num
            })
    if chapters:
        function_name(chapters, series, tags, author, status)
def on_actionPrint_triggered(self):
    """Print the log book preview on a user-selected printer.

    Extracts the HTML (whole content frame, or just the selection when
    the Qt version supports it), rebuilds a combined <head>, injects a
    title header, and sends the laid-out document to the printer.
    """
    # Let the user select the desired printer via the system printer list
    printer = QPrinter()
    dialog = QPrintDialog(printer)
    if not dialog.exec_():
        return
    mainFrame = self.preview.page().mainFrame()
    childFrames = mainFrame.childFrames()
    # Workaround for Qt versions < 4.8.0
    printWholeSite = True
    if hasattr(QWebView, 'selectedHtml'):
        if self.preview.hasSelection():
            printWholeSite = False
    # use whole frame if no content is selected or selecting html is not
    # supported
    if printWholeSite:
        # set 'content' frame active as printing an inactive web frame
        # doesn't work properly
        if len(childFrames) >= 2:
            childFrames[1].setFocus()
            # thanks to setFocus, we can print the frame with evaluated
            # javascript
            html = childFrames[1].toHtml()
        # NOTE(review): with fewer than 2 child frames `html` is never
        # assigned and the concatenation below raises NameError — confirm
        # whether that layout can occur.
    else:
        html = self.preview.selectedHtml()
    # construct head
    head = '<head>'
    # extract head from child frames
    for frame in childFrames:
        headEl = frame.findFirstElement('head')
        head += headEl.toInnerXml()
    head += '</head>'
    # concat new head and selection
    # the result may be invalid html; needs improvements!
    html = head + html
    # prepend a header to the log book
    # BUG FIX: str.replace returns a new string; the original discarded
    # the result, so the header was never inserted.
    html = html.replace('</head>', '</head><h1>NICOS Log book</h1>')
    # let qt layout the content
    doc = QTextDocument()
    doc.setHtml(html)
    doc.print_(printer)
def mungeQA(html, type, fields, model, data, col):
    """Convert embedded LaTeX tags in *html* into image links."""
    for hit in regexps['standard'].finditer(html):
        html = html.replace(hit.group(), _imgLink(col, hit.group(1), model))
    for hit in regexps['expression'].finditer(html):
        wrapped = "$" + hit.group(1) + "$"
        html = html.replace(hit.group(), _imgLink(col, wrapped, model))
    for hit in regexps['math'].finditer(html):
        wrapped = "\\begin{displaymath}" + hit.group(1) + "\\end{displaymath}"
        html = html.replace(hit.group(), _imgLink(col, wrapped, model))
    return html
def insert_line_numbers(html):
    """Wrap the first <pre> block of *html* in a two-column table whose
    left column carries line numbers (right-aligned to the widest one).

    Returns *html* unchanged when it has no <pre> block.
    """
    match = re.search('(<pre[^>]*>)(.*)(</pre>)', html, re.DOTALL)
    if not match:
        return html
    pre_open = match.group(1)
    pre = match.group(2)
    pre_close = match.group(3)
    html = html.replace(pre_close, '</pre></td></tr></table>')
    # BUG FIX: pre.count('\n') + 1 is the number of displayed lines; the
    # original range(1, count + 1) dropped the last line's number and
    # raised IndexError (numbers[-1]) on single-line content.
    numbers = range(1, pre.count('\n') + 2)
    format = '%' + str(len(str(numbers[-1]))) + 'i'
    lines = '\n'.join(format % i for i in numbers)
    html = html.replace(pre_open, '<table><tr><td>' + pre_open + lines +
                        '</pre></td><td>' + pre_open)
    return html
def strip_tags(html):
    """Remove special characters, links and BBCode from *html*, parse out
    its text content, and return the list of words found in it."""
    for garbage in spec_characters.findall(html):
        html = html.replace(garbage, "")
    for matched in link_re.findall(html):
        html = html.replace(matched[0], "")
    for matched in bbcode_re.findall(html):
        html = html.replace(matched[0], "")
    plain = BeautifulSoup(html, "lxml").getText()
    return words_re.findall(plain)
def quest_tooltip(req):
    """Django view: render the floating wiki tooltip for a quest.

    GET params: id (quest id), nocache ("True" — read but unused here),
    type ("web" renders HTML; "img"/"image" is currently disabled).
    """
    quest_id = req.GET.get("id", 0)
    nocache = req.GET.get("nocache", "False") == "True"  # read but not used below
    res_type = req.GET.get("type", "web")
    print("quest_id:{}".format(quest_id))
    try:
        if quest_id:
            if res_type == "web":
                # Ask the wiki API to expand the tooltip template for this quest.
                r = requests.get(
                    "https://cdn.huijiwiki.com/ff14/api.php?format=json&action=parse&disablelimitreport=true&prop=text&title=%E9%A6%96%E9%A1%B5&smaxage=86400&maxage=86400&text=%7B%7B%E4%BB%BB%E5%8A%A1%2F%E6%B5%AE%E5%8A%A8%E6%91%98%E8%A6%81%7C{}%7D%7D"
                    .format(quest_id))
                r_json = r.json()
                print(r_json)
                html = r_json["parse"]["text"]["*"]
                # Tag the first tooltip item so it can be located by id.
                html = html.replace("class=\"tooltip-item\"",
                                    "class=\"tooltip-item\" id=\"tooltip\"", 1)
                # Absolutize wiki-relative links.
                html = html.replace("href=\"/", "href=\"https://ff14.huijiwiki.com/")
                return ren2res("quest_tooltip.html", req, {"parsed_html": html})
            elif res_type == "img" or res_type == "image":
                return HttpResponse("TODO", status=500)
                # NOTE(review): everything below is unreachable — the early
                # return above deliberately disables the screenshot path.
                from selenium import webdriver
                options = webdriver.ChromeOptions()
                options.add_argument('--kiosk')
                options.add_argument('--headless')
                options.add_argument('--no-sandbox')
                options.add_argument('--disable-gpu')
                driver = webdriver.Chrome(chrome_options=options)
                driver.get("https://xn--v9x.net/quest/tooltip/?id={}".format(
                    quest_id))
                tooltip = driver.find_element_by_id("tooltip")
                valid_image = "tooltip.png"
                if tooltip.screenshot(valid_image):
                    try:
                        with open(valid_image, "rb") as f:
                            return HttpResponse(f.read(), content_type="image/png")
                    except IOError:
                        # Fall back to a 1x1 PNG when the file can't be read.
                        red = Image.new('RGBA', (1, 1), (255, 0, 0, 0))
                        response = HttpResponse(content_type="image/png")
                        red.save(response, "PNG")
                        return response
                else:
                    return HttpResponse("Image save failed", status=500)
    except KeyError:
        return HttpResponse("KeyError", status=500)
    return HttpResponse(status=500)
def contentreplace(text, out=True):
    """Turn raw text into cleaned output.

    Strips <script>/<style>/<a> blocks, applies the module's base and
    per-line replacements, then either wraps each non-empty line in <p>
    tags (out=True, with emoji markers injected around one line roughly
    30% of the way in) or joins the lines with '\\n\\r' (out=False).
    Non-string or blank input is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    elif not text.strip():
        return text
    text = __basereplace(text)
    re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # Script
    re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)  # style
    re_a = re.compile('<\s*a[^>]*>[^<]*<\s*/\s*a\s*>', re.I)  # a
    text = re_script.sub('', text)
    text = re_style.sub('', text)
    text = re_a.sub('', text)
    text = text.split('\n')
    html = ''
    i = 0
    # Marker line index: ~30% into the text, at least line 3.
    l = int(len(text) * 0.3)
    l = l if l > 3 else 3
    for row in text:
        i += 1
        row = row.strip()
        row = descriptionreplace(row)
        if row:
            if out:
                if l == i:
                    # This line gets the placeholder markers that are swapped
                    # for random emoji below.
                    html += f"<p>$$$$$$1{row}$$$$$$2</p>"
                else:
                    html += f"<p>{row}</p>"
            else:
                html += f"{row}\n\r"
    if out:
        # Drop any raw :emoji_name: codes before injecting our own.
        html = re.sub(r':[a-zA-Z0-9_]+?:', '', html)
        html = customfilterstr(html)
        # emoji handling
        e = [
            ':thumbs_up:', ':ghost:', ':fire:', ':monkey:', ':dog:',
            ':poodle:', ':mouse:', ':rat:', ':rabbit:', ':red_apple:'
        ]
        html = \
            html.replace('$$$$$$1', str(random.choice(e))).replace('$$$$$$2', str(random.choice(e)))
        html = emoji.emojize(html)
        html = html.replace('口', '𥁐')
    return html
def get_preview(html, characters):
    """Return up to *characters* characters of the text content of *html*
    (special characters, links and BBCode stripped); longer text is
    truncated and suffixed with '...'."""
    for special in spec_characters.findall(html):
        html = html.replace(special, "")
    for found in link_re.findall(html):
        html = html.replace(found[0], "")
    for found in bbcode_re.findall(html):
        html = html.replace(found[0], "")
    text = BeautifulSoup(html, "lxml").getText()
    return text[:characters] + "..." if len(text) > characters else text
def getVideoStreamURL(url):
    """Extract the stream URL (and poster image, when present) from an
    LRT player page.

    Returns a dict with 'url' (and optionally 'image'); an empty dict
    when no sources block is found.
    """
    result = {}
    html = getURL(url)
    # Force the fixed-size image variant instead of the device-dependent one.
    html = html.replace('"+(md.device == \'mobile\'?\'640/360\':\'870/490\')',
                        '/500/280/size16x9"')
    source = re.findall('sources: [\[\s]*\{([^\}]*)\}', html, re.DOTALL)
    if not source:
        return result
    # Drop javascript line comments inside the sources block.
    block = re.sub(re.compile('\n[\s]*(\/\/[^\n]*)', re.DOTALL), '', source[0])
    # Substitute the page's own location.hash expression with ours.
    fragment = url.partition('#')[2]
    block = re.sub(re.compile('("[\+\s]*location.hash.substring\(1\))', re.DOTALL),
                   fragment + '"', block)
    block = block.replace('"file"', 'file')
    mfile = re.findall('file[:\s]*"(.*?)"', block, re.DOTALL)
    result['url'] = mfile[0].replace('\/', '/')
    image = re.findall('image: "(.*?)"', html, re.DOTALL)
    if image:
        result['image'] = LRT_URL + image[0]
    return result
def getVideoStreamURL(url):
    """Parse an LRT player page and return {'url': ..., 'image': ...};
    'image' is only present when the page declares one, and an empty
    dict is returned when no sources block exists."""
    html = getURL(url)
    # Pin the image size instead of the mobile/desktop conditional.
    html = html.replace('"+(md.device == \'mobile\'?\'640/360\':\'870/490\')',
                        '/500/280/size16x9"')
    sources = re.findall('sources: [\[\s]*\{([^\}]*)\}', html, re.DOTALL)
    if not sources:
        return {}
    # Strip '//' line comments from the block.
    cleaned = re.sub(re.compile('\n[\s]*(\/\/[^\n]*)', re.DOTALL), '', sources[0])
    parts = url.split('#', 1)
    url_hash = parts[1] if len(parts) == 2 else ''
    cleaned = re.sub(
        re.compile('("[\+\s]*location.hash.substring\(1\))', re.DOTALL),
        url_hash + '"', cleaned)
    cleaned = cleaned.replace('"file"', 'file')
    files = re.findall('file[:\s]*"(.*?)"', cleaned, re.DOTALL)
    result = {}
    result['url'] = files[0].replace('\/', '/')
    image = re.findall('image: "(.*?)"', html, re.DOTALL)
    if image:
        result['image'] = LRT_URL + image[0]
    return result
def quest_tooltip(req):
    """Django view: serve the (cached) wiki tooltip HTML for a quest.

    GET params: id (quest id), nocache ("True" forces a re-parse),
    type ("web" renders HTML; "img"/"image" is disabled).
    """
    quest_id = req.GET.get("id", 0)
    nocache = req.GET.get("nocache", "False") == "True"
    res_type = req.GET.get("type", "web")
    print("quest_id:{}".format(quest_id))
    try:
        if quest_id:
            try:
                quest = PlotQuest.objects.get(id=quest_id)
            except PlotQuest.DoesNotExist:
                return HttpResponse("No such quest", status=500)
            else:
                if res_type == "web":
                    if quest.tooltip_html == "" or nocache:
                        # Expand the wiki tooltip template for this quest.
                        r = requests.get(
                            "https://ff14.huijiwiki.com/ff14/api.php?format=json&action=parse&disablelimitreport=true&prop=text&title=%E9%A6%96%E9%A1%B5&smaxage=86400&maxage=86400&text=%7B%7B%E4%BB%BB%E5%8A%A1%2F%E6%B5%AE%E5%8A%A8%E6%91%98%E8%A6%81%7C{}%7D%7D"
                            .format(quest_id))
                        r_json = r.json()
                        # print(r_json)
                        html = r_json["parse"]["text"]["*"]
                        # Tag the first tooltip item so it can be located by id.
                        html = html.replace(
                            "class=\"tooltip-item\"",
                            "class=\"tooltip-item\" id=\"tooltip\"", 1)
                        # Absolutize wiki-relative links.
                        html = html.replace(
                            "href=\"/", "href=\"https://ff14.huijiwiki.com/")
                        # Link the quest title to its wiki page.
                        soup = BeautifulSoup(html, 'html.parser')
                        quest_name = soup.p.span.string
                        a = soup.new_tag(
                            'a',
                            href=
                            'https://ff14.huijiwiki.com/wiki/%E4%BB%BB%E5%8A%A1:{}'
                            .format(urllib.parse.quote(quest_name)))
                        a.string = quest_name
                        soup.p.span.string = ""
                        soup.p.span.append(a)
                        html = str(soup)
                        # Cache the rendered tooltip on the quest row.
                        quest.tooltip_html = html
                        quest.save(update_fields=["tooltip_html"])
                    else:
                        html = quest.tooltip_html
                    return ren2res("quest_tooltip.html", req,
                                   {"parsed_html": html})
                elif res_type == "img" or res_type == "image":
                    return HttpResponse("TODO", status=500)
    except KeyError:
        return HttpResponse("KeyError", status=500)
    return HttpResponse(status=500)
def insert_line_numbers(html):
    """Wrap the first <pre> block of *html* in a two-column table whose
    left column carries line numbers (right-aligned to the widest one).

    Returns *html* unchanged when it has no <pre> block.
    """
    match = re.search("(<pre[^>]*>)(.*)(</pre>)", html, re.DOTALL)
    if not match:
        return html
    pre_open = match.group(1)
    pre = match.group(2)
    pre_close = match.group(3)
    html = html.replace(pre_close, "</pre></td></tr></table>")
    # BUG FIX: pre.count("\n") + 1 is the number of displayed lines; the
    # original range(1, count + 1) dropped the last line's number and
    # raised IndexError (numbers[-1]) on single-line content.
    numbers = range(1, pre.count("\n") + 2)
    format = "%" + str(len(str(numbers[-1]))) + "i"
    lines = "\n".join(format % i for i in numbers)
    html = html.replace(
        pre_open,
        "<table><tr><td>" + pre_open + lines + "</pre></td><td>" + pre_open)
    return html
def first_synonym(self, html):
    """Extract the first marked synonym from *html*.

    Returns a (Word, html) pair where the returned html no longer
    carries the first pair of synonym delimiter tags.
    """
    raw = Utils.text_between(html, *TAG_SYNONYMS_DELIMITER, force_html=True)
    cleaned = Utils.remove_spaces(raw)
    # Strip only the first occurrence of each delimiter tag.
    remaining = html.replace(TAG_SYNONYMS_DELIMITER[0], "", 1)
    remaining = remaining.replace(TAG_SYNONYMS_DELIMITER[1], "", 1)
    return Word(cleaned), remaining
def replaceBadHtmlWithGood(html):
    """Normalize editor-produced HTML: drop dir="ltr" attributes and blank
    paragraphs, then run the soup through the standard cleanup passes."""
    html = remove_blank_paras(html.replace('dir="ltr"', ""))
    soup = BeautifulSoup(html, "html.parser")
    for transform in (replaceImgHeightWidthWithClass,
                      replacePImgWithFigureImg,
                      fixEditorSummary,
                      removeGoogleDocsSpans,
                      processYouTubeDivs,
                      processSoundCloudDivs):
        soup = transform(soup)
    return str(soup)
def replaceBadHtmlWithGood(html):
    """Normalize editor-produced HTML: drop dir="ltr" attributes and
    unnecessary whitespace, then run the soup through the cleanup passes
    and emit a size warning for oversized images."""
    html = remove_unnecessary_white_space(html.replace('dir="ltr"', ""))
    soup = BeautifulSoup(html, "html.parser")
    # replacePImgWithFigureImg is intentionally not applied: while nice
    # to make images into figures, it is a real struggle for users of
    # CKEditor.
    for transform in (replaceImgHeightWidthWithClass,
                      fixEditorSummary,
                      removeGoogleDocsSpans,
                      processDashes,
                      processYouTubeDivs,
                      processSoundCloudDivs,
                      linkImages):
        soup = transform(soup)
    warnImageTooBig(soup)
    return str(soup)
def mangahere(url, download_chapters):
    """Scrape a mangahere series page for metadata and per-chapter page
    links, then hand the results to function_name().

    Fixes vs. the previous revision:
    - the series title lives in group(2) of its regex — group(1) is the
      optional 'h1 class="' prefix, so the old code never captured the
      actual name;
    - the date fallback called datetime.datetime.today(), which raises
      AttributeError when `datetime` is the class (as the other scrapers
      in this module use it); it now calls datetime.today().
    """
    html = get_html(url)
    global last
    series = title(re.search(
        '<(h1 class=")?title"?><span class="title_icon"></span>(.*?)</(h1|title)>',
        html.replace('\n', '')).group(2))
    status = re.search('<li><label>Status:</label>(.*?)<',
                       html.replace('\n', '')).group(1)
    author = ', '.join(re.findall(
        '<a.*?>(.*?)</a>',
        re.search('<li><label>Author\\(?s?\\)?:</label>(.*?)</li>',
                  html.replace('\n', '')).group(1)))
    tags = re.search('<li><label>Genre\\(s\\):</label>(.*?)</li>',
                     html).group(1).split(', ')
    # Normalize genre names through the module-level tag_dict mapping.
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    # Rows are newest-first on the page; iterate oldest-first.
    for j in re.findall('<li>\\s*<span class=\"left\">\\s*(.*?\\d{4}</span>)\\s*</li>',
                        html, re.DOTALL | re.MULTILINE)[::-1]:
        match = re.search(
            '<a.*?>.*?([\\d,.]+)\\s*</a>\\s*<span.*?>\\s*(.*?)\\s*</span>',
            j, re.DOTALL | re.MULTILINE)
        name = match.group(2)
        # Keep only the first '.' so numbers like '10.5.1' still parse.
        raw_num = match.group(1)
        num = float(''.join(raw_num.rsplit('.', raw_num.count('.') - 1)))
        link = re.search('href=\"(.*?)\"', j).group(1)
        try:
            date = datetime.strptime(
                re.search('([A-Za-z]*? \\d{1,2}, \\d{4})</span>', j).group(1),
                '%b %d, %Y').strftime('%Y-%m-%d')
        except:
            # Fall back to today when the row carries no parseable date.
            date = datetime.today().strftime('%Y-%m-%d')
        if name:
            name = '{} - {} : {}'.format(series,
                                         '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or \
                (not download_chapters and num > last):
            logger.info(' Gathering info: \"{}\"'.format(name))
            chap_html = get_html(link)
            # Turn the first page's image URL into a numbered template.
            img_url = re.sub(
                '001.([A-Za-z]{3})', '{:03}.\\1',
                re.search('<a.*?>\\s*<img[^<]*?src=\"(.*?)\".*?>\\s*</a>',
                          chap_html, re.DOTALL | re.MULTILINE).group(1))
            if '{:03}' not in img_url and '{}' not in img_url:
                img_url = re.sub('01.([A-Za-z]{3})', '{:02}.\\1', img_url)
            pages = max([int(i) for i in re.findall(
                '<option value=\".*?\".*?>(\\d+)</option>', chap_html)])
            b_links = {float(i[1]): i[0] for i in re.findall(
                '<option value=\"(.*?)\".*?>(\\d+)</option>', chap_html)}
            b_links = [b_links[i + 1] for i in range(pages)]
            links = [img_url.format(i + 1) for i in range(pages)]
            chapters.append({'name': name, 'links': links,
                             'backup_links': b_links, 'date': date,
                             'pages': pages, 'num': num})
    if chapters:
        function_name(chapters, series, tags, author, status)
def replace_entities(html):
    """Replace a fixed set of characters in *html* with unified plain
    equivalents.

    NOTE(review): the left-hand values below look like they were HTML
    entities (&amp;, &nbsp;, &quot;, &ndash;, ...) that got decoded by
    some processing pass — as written, several pairs are identity
    replacements. Verify against the original source.
    """
    unifiable = [
        ('&', '&'),
        (' ', ' '),
        (''', "'"),
        ('"', "'"),
        ('–', '-'),
        ('—', u'–'),
        ('→', u'→'),
        ('←', u'←'),
        ('⇆', u'↔'),
        ('“', '"'),
        ('”', '"'),
    ]
    for (entity, char) in unifiable:
        html = html.replace(entity, char)
    return html
def patch_html(html):
    """Patch anchor elements to specify the target attribute.

    The links created by the tagstatlink option fail to open when viewed
    inside a frame (and shouldn't open within the frame of a metaci test
    result page anyway), so `target="_top"` is added to the generated
    anchor template. Patching HTML is fraught with peril, but the robot
    log-generation code is stable enough that this targeted replacement
    is safe and gives users a much better experience.
    """
    anchor = r'<span>[<a href="{{html $value.url}}" title="{{html $value.url}}">'
    anchor_with_target = r'<span>[<a href="{{html $value.url}}" title="{{html $value.url}}" target="_top">'
    return html.replace(anchor, anchor_with_target)
def download_table(html):
    """Expand the '[#download table#]' placeholder in *html* into a table
    of image-download links built from the config; *html* is returned
    unchanged when the placeholder is absent."""
    placeholder = "[#download table#]"
    if placeholder not in html:
        return html
    base = config.download.imgs_url
    release = config.download.latest_release
    parts = ["<table class='table-specs'>\n"]
    for category, category_cfg in config.download.table.items():
        parts.append(f"<tr><td colspan='4'><b>{category}</b></td></tr>\n")
        for device, device_cfg in category_cfg.items():
            href = f"{base}/{release}/{device}/"
            parts.append("<tr><td style='padding: 0px 10px'>\n"
                         f"\t<a href='{href}'>{device_cfg['name']}</a>\n"
                         "</td></tr>\n")
    parts.append("</table>\n")
    return html.replace(placeholder, "".join(parts))
def insertMetaData(html):
    """For every key in META, replace the '<!--{key}-->' placeholder in
    *html* with checkbox <li> entries — one per file format — each tagged
    (via data-software) with the slugified names of the software that
    supports the format.

    Changes vs. the previous revision: the local `list` (which shadowed
    the builtin) is renamed, and the commented-out alternate markup was
    removed.
    """
    for key in META:
        entries = []
        for fmt, software in sorted(META[key][0].items()):
            # Slugs: lower-case, spaces replaced by dashes.
            slugs = [s.replace(" ", "-").lower() for s in software]
            entries.append(
                "<li class=\"cat-item cat-item-3\"><input name=\"filterCheckbox\" "
                "type=\"checkbox\" onclick=\"handleClick();\" data-software=\""
                + " ".join(slugs) + "\">" + fmt + "</input></li>")
        html = html.replace("<!--{" + key + "}-->", " ".join(entries))
    return html
def _quote_html(html): return html.replace("&", "&").replace("<", "<").replace(">", ">")
def autoBrText(self, html):
    """Convert newlines in *html* into HTML <br /> line breaks."""
    return '<br />'.join(html.split('\n'))
def _quote_html(html): return html.replace('&', '&').replace('<', '<').replace('>', '>')
def quest_tooltip(req):
    """Django view: serve the (cached) wiki tooltip HTML for a quest;
    the screenshot branch is currently disabled.

    GET params: id (quest id), nocache ("True" forces a re-parse),
    type ("web" renders HTML; "img"/"image" returns TODO).
    """
    quest_id = req.GET.get("id", 0)
    nocache = req.GET.get("nocache", "False") == "True"
    res_type = req.GET.get("type", "web")
    print("quest_id:{}".format(quest_id))
    try:
        if quest_id:
            try:
                quest = PlotQuest.objects.get(id=quest_id)
            except PlotQuest.DoesNotExist:
                return HttpResponse("No such quest", status=500)
            else:
                if res_type == "web":
                    if quest.tooltip_html == "" or nocache:
                        # Expand the wiki tooltip template for this quest.
                        r = requests.get(
                            "https://ff14.huijiwiki.com/ff14/api.php?format=json&action=parse&disablelimitreport=true&prop=text&title=%E9%A6%96%E9%A1%B5&smaxage=86400&maxage=86400&text=%7B%7B%E4%BB%BB%E5%8A%A1%2F%E6%B5%AE%E5%8A%A8%E6%91%98%E8%A6%81%7C{}%7D%7D"
                            .format(quest_id))
                        r_json = r.json()
                        # print(r_json)
                        html = r_json["parse"]["text"]["*"]
                        # Tag the first tooltip item so it can be located by id.
                        html = html.replace(
                            "class=\"tooltip-item\"",
                            "class=\"tooltip-item\" id=\"tooltip\"", 1)
                        # Absolutize wiki-relative links.
                        html = html.replace(
                            "href=\"/", "href=\"https://ff14.huijiwiki.com/")
                        # Link the quest title to its wiki page.
                        soup = BeautifulSoup(html, 'html.parser')
                        quest_name = soup.p.span.string
                        a = soup.new_tag(
                            'a',
                            href=
                            'https://ff14.huijiwiki.com/wiki/%E4%BB%BB%E5%8A%A1:{}'
                            .format(urllib.parse.quote(quest_name)))
                        a.string = quest_name
                        soup.p.span.string = ""
                        soup.p.span.append(a)
                        html = str(soup)
                        # Cache the rendered tooltip on the quest row.
                        quest.tooltip_html = html
                        quest.save(update_fields=["tooltip_html"])
                    else:
                        html = quest.tooltip_html
                    return ren2res("quest_tooltip.html", req,
                                   {"parsed_html": html})
                elif res_type == "img" or res_type == "image":
                    return HttpResponse("TODO", status=500)
                    # NOTE(review): the screenshot path below is
                    # unreachable — the early return above deliberately
                    # disables it.
                    from selenium import webdriver
                    options = webdriver.ChromeOptions()
                    options.add_argument('--kiosk')
                    options.add_argument('--headless')
                    options.add_argument('--no-sandbox')
                    options.add_argument('--disable-gpu')
                    driver = webdriver.Chrome(chrome_options=options)
                    driver.get(
                        "https://xn--v9x.net/quest/tooltip/?id={}".format(
                            quest_id))
                    tooltip = driver.find_element_by_id("tooltip")
                    valid_image = "tooltip.png"
                    if tooltip.screenshot(valid_image):
                        try:
                            with open(valid_image, "rb") as f:
                                return HttpResponse(f.read(),
                                                    content_type="image/png")
                        except IOError:
                            # Fall back to a 1x1 PNG when the file can't be read.
                            red = Image.new('RGBA', (1, 1), (255, 0, 0, 0))
                            response = HttpResponse(content_type="image/png")
                            red.save(response, "PNG")
                            return response
                    else:
                        return HttpResponse("Image save failed", status=500)
    except KeyError:
        return HttpResponse("KeyError", status=500)
    return HttpResponse(status=500)
def goodmanga(url, download_chapters):
    """Scrape a goodmanga series (following 'Next' pagination) for
    metadata and per-chapter page links, then hand the results to
    function_name().

    Fix vs. the previous revision: the date fallback called
    datetime.datetime.today(), which raises AttributeError when
    `datetime` is the class (as the other scrapers in this module use
    it); it now calls datetime.today().
    """
    html = get_html(url)
    global last
    series = title(re.search('<h1>([^<>]*?)</h1>',
                             html.replace('\n', '')).group(1))
    status = re.search('<span>Status:</span>\\s*(.*?)\\s*</div>',
                       html.replace('\n', '')).group(1)
    author = re.search('<span>Authors?:</span>\\s*(.*?)\\s*</div>',
                       html.replace('\n', '')).group(1)
    tags = re.findall('<a.*?>(.*?)</a>',
                      re.search('<span>Genres:</span>(.*?)\\s*</div>', html,
                                re.DOTALL | re.MULTILINE).group(1))
    # Normalize genre names through the module-level tag_dict mapping.
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    while True:
        for j in re.findall('<li>\\s*(.{1,300}?\\d{4}</span>)\\s*</li>', html,
                            re.DOTALL | re.MULTILINE):
            match = re.search(
                '<a.*?>.*?([\\d,.]+)\\s*</a>\\s*<span.*?>\\s*(.*?)\\s*</span>',
                j, re.DOTALL | re.MULTILINE)
            name = match.group(2)
            # Keep only the first '.' so numbers like '10.5.1' still parse.
            raw_num = match.group(1)
            num = float(''.join(raw_num.rsplit('.', raw_num.count('.') - 1)))
            link = re.search('href=\"(.*?)\"', j).group(1)
            try:
                date = datetime.strptime(
                    re.search('([A-Za-z]*? \\d{1,2}, \\d{4})</span>',
                              j).group(1),
                    '%b %d, %Y').strftime('%Y-%m-%d')
            except:
                # Fall back to today when the row has no parseable date.
                date = datetime.today().strftime('%Y-%m-%d')
            if name:
                name = '{} - {} : {}'.format(
                    series, '{:3.1f}'.format(num).zfill(5), name)
            else:
                name = '{} - {}'.format(series,
                                        '{:3.1f}'.format(num).zfill(5))
            if (download_chapters and num in download_chapters) or \
                    (not download_chapters and num > last):
                logger.info(' Gathering info: \"{}\"'.format(name))
                chap_html = get_html(link)
                # Turn the first page's image URL into a numbered template.
                img_url = re.sub(
                    '1.([jpgnig]{3})', '{}.\\1',
                    re.search(
                        '</div>\\s*<a.*?>\\s*<img[^<]*?src=\"(.*?)\".*?>\\s*</a>',
                        chap_html, re.DOTALL | re.MULTILINE
                    ).group(1)
                )
                found_pages = re.findall(
                    '<option value=\"(.*?)\".*?>\\s*(\\d+)\\s*</option>',
                    chap_html
                )
                pages = max([int(i) for _, i in found_pages])
                b_links = {float(i[1]): i[0] for i in found_pages}
                b_links = [b_links[i + 1] for i in range(pages)]
                links = [img_url.format(i + 1) for i in range(pages)]
                # Pages list chapters newest-first; insert at the front to
                # keep the collected list oldest-first.
                chapters.insert(0, {
                    'name': name,
                    'links': links,
                    'backup_links': b_links,
                    'date': date,
                    'pages': pages,
                    'num': num
                })
        match = re.search('<a href=\"(.*?)\">Next</a>', html)
        if match:
            html = get_html(match.group(1))
        else:
            break
    if chapters:
        function_name(chapters, series, tags, author, status)
def remove_unnecessary_white_space(html):
    # Drop paragraphs that only hold a line break.
    html = html.replace('<p><br/></p>', '')
    # NOTE(review): this substitution replaces a space with a space — the
    # pattern was likely '&nbsp;' (or a U+00A0 character) that got decoded
    # by some tooling; verify against the original source.
    html = re.sub(' ', ' ', html)
    # Collapse runs of spaces into a single space.
    html = re.sub(' +', ' ', html)
    # Strip whatever the module-level blankpara_regex matches.
    return blankpara_regex.sub(r'', html)
def batoto(url, download_chapters):
    """Scrape a batoto series page (login required) for metadata and
    per-chapter page links, then hand the results to function_name().

    url -- series page URL (a trailing '/' is appended for the fetch).
    download_chapters -- chapter numbers to fetch; when falsy, every
        chapter numbered above the module-global `last` is taken.

    Only rows whose language matches the module-global `lang` are used.
    """
    login_batoto()
    # Retry the fetch up to 3 times before giving up.
    for i in range(3):
        try:
            html = get_html(url+'/')
            break
        except:
            if i == 2:
                raise
            else:
                pass
    global last
    global session
    series = title(re.search('<h1.*?>[\\s\n]*(.*?)[\\s\n]*</h1>', html, re.DOTALL|re.MULTILINE).group(1))
    status = re.search('<td.*?>Status:</td>\\s*<td>\\s*(.*?)\\s*</td>', html.replace('\n', '')).group(1)
    author = ', '.join(re.findall('<a.*?>(.*?)</a>', re.search('<td.*?>\\s*Authors?\\s*:?\\s*</td>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1)))
    tags = re.findall('<a.*?>\\s*<span.*?>\\s*([A-Za-z]*?)\\s*</span>\\s*</a>', re.search('<td.*?>\\s*Genres?\\s*:?\\s*</td>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1))
    # Normalize genre names through the module-level tag_dict mapping.
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    # Rows are newest-first on the page; iterate oldest-first.
    for j in re.findall('<tr class=\"row lang_([A-Za-z]*?) chapter_row\".*?>(.*?)</tr>', html, re.DOTALL|re.MULTILINE)[::-1]:
        if j[0] == lang:
            match = re.search('<a href=\"([^\"]*?)\".*?>\\s*<img.*?>\\s*([^\"<>]*)(\\s*:\\s*)?(.*?)\\s*</a>', j[1], re.DOTALL|re.MULTILINE)
            name = match.group(4)
            # Pull the chapter number out of the 'Ch.'/'Chapter' label;
            # 'Extra' chapters get number 0.
            m2 = re.search('[Cc]h(ap)?(ter)?\\.?\\s*([Ee]xtras?:?)?\\s*[\\.:-]?\\s*([\\d\\.,]+)?\\s*(-\\s*[\\d\\.]+)?', match.group(2))
            try:
                if m2.group(3):
                    num = 0
                else:
                    num = m2.group(4).replace(',', '.')
                    # Keep only the first '.' so e.g. '10.5.1' parses.
                    num = float(''.join(num.rsplit('.', num.count('.')-1)))
            except:
                logger.debug(j[1])
                raise
            '''
            #TODO
            if m2.group(3):
                if chapters:
                    num = chapters[-1]['num'] + .4
                else:
                    num = last + .4
            '''
            try:
                vol = int(re.search('[Vv]ol(ume)?\\.\\s*(\\d+)', match.group(2)).group(2))
            except:
                vol = 0
            link = match.group(1)
            # The reader link carries the chapter uuid after '#'; build
            # both the Referer header and the areader API URL from it.
            uuid = link.rpartition('#')[2]
            ref = link.rpartition('/')[0]+'/' + "reader#" + uuid + "_1"
            head = {'Referer':ref, 'supress_webtoon':'t'}
            link = link.rpartition('/')[0]+'/'+ 'areader?id='+uuid+'&p=1'
            session.headers.update(head)
            try:
                # Absolute timestamps look like '13 January 2016 - 10:32 AM'.
                date = datetime.strptime(re.search('<td.*?>(\\d{2} [A-Za-z]* \\d{4}.*?([Aa][Mm]|[Pp][Mm])).*?</td>', j[1]).group(1), '%d %B %Y - %I:%M %p').strftime('%Y-%m-%dT%H:%M:00')
            except:
                # Relative timestamps ('N minutes/hours/days/weeks ago',
                # or 'A minute ago' etc.); probe each unit in turn.
                try:
                    t = re.search('(\\d+) [Mm]inutes ago', j[1]).group(1)
                except:
                    t = '1' if re.search('A minute ago', j[1]) else ''
                if t:
                    unit = '%M'
                else:
                    try:
                        t = re.search('(\\d+) [Hh]ours ago', j[1]).group(1)
                    except:
                        t = '1' if re.search('An hour ago', j[1]) else ''
                    if t:
                        unit = '%H'
                    else:
                        try:
                            t = re.search('(\\d+) [Dd]ays ago', j[1]).group(1)
                        except:
                            t = '1' if re.search('A day ago', j[1]) else ''
                        if t:
                            unit = '%d'
                        else:
                            try:
                                t = re.search('(\\d+) [Ww]eeks ago', j[1]).group(1)
                            except:
                                t = '1' if re.search('A week ago', j[1]) else ''
                            if t:
                                unit = '%W'
                            else:
                                t = '0'
                                unit = '%M'
                date = datetime.fromtimestamp((datetime.today()-datetime.strptime(t, unit)).total_seconds()).strftime('%Y-%m-%dT%H:%M:00')
            if name:
                name = '{} - {} : {}'.format(series, '{:3.1f}'.format(num).zfill(5), name)
            else:
                name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
            if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
                logger.info(' Gathering info: \"{}\"'.format(name))
                chap_html = get_html(link)
                # Turn the first page's image URL into a numbered template;
                # `zero` records whether page numbering starts at 0.
                img_url = re.sub('001\\.([A-Za-z]{3})', '{:03}.\\1', re.search('<div.*?>\\s*<a.*?>\\s*<img[^<]*?src=\"([^\"]*?)\"[^>]*?/>\\s*</div>', chap_html, re.DOTALL|re.MULTILINE).group(1))
                zero = False
                if '{:03}' not in img_url:
                    img_url = re.sub('000\\.([A-Za-z]{3})', '{:03}.\\1', img_url)
                    zero = True
                if '{:03}' not in img_url:
                    img_url = re.sub('01\\.([A-Za-z]{3})', '{:02}.\\1', img_url)
                    zero = False
                if '{:02}' not in img_url:
                    img_url = re.sub('00\\.([A-Za-z]{3})', '{:02}.\\1', img_url)
                    zero = True
                if re.findall('<option value=\".*?\".*?>page (\\d+)</option>', chap_html):
                    pages = max([int(i) for i in re.findall('<option value=\".*?\".*?>page (\\d+)</option>', chap_html)])
                else:
                    # No page dropdown: skip this chapter.
                    continue
                b_links = {float(i[1]):i[0] for i in re.findall('<option value=\"(.*?)\".*?>page (\\d+)</option>', chap_html)}
                b_links = [b_links[i+1] for i in range(pages)]
                if zero:
                    links = [img_url.format(i) for i in range(pages)]
                else:
                    links = [img_url.format(i+1) for i in range(pages)]
                chapters.append({'name':name, 'links':links, 'backup_links':b_links, 'date':date, 'pages':pages, 'num':num})
    if chapters:
        function_name(chapters, series, tags, author, status)
def get_table_html_from_dataframe(df):
    """Render *df* as an HTML table styled with Bootstrap classes.

    The <thead> element is upgraded to use the 'thead-light' class so the
    header row picks up Bootstrap's light header styling.
    """
    table_classes = 'table table-bordered table-responsive table-striped'
    rendered = df.to_html(classes=table_classes)
    return rendered.replace('<thead>', '<thead class="thead-light">')
def to_oneline(html):
    """Collapse *html* onto one line.

    Break tags and paragraph boundaries become single spaces, and every
    literal newline is replaced by the two-character sequence backslash-n.
    """
    replacements = (("<br />", ' '), ("</p><p>", ' '), ('\n', '\\n'))
    for old, new in replacements:
        html = html.replace(old, new)
    return html
def mangadex(url, download_chapters, pageNum=None):
    """Scrape a MangaDex series page and collect chapter download info.

    Parameters:
        url               -- series URL (trailing slashes are stripped).
        download_chapters -- container of chapter numbers to fetch; when
                             falsy, every chapter newer than the global
                             ``last`` is gathered instead.
        pageNum           -- internal: when set, scrape that chapter-list
                             page and *return* the chapters found rather
                             than handing them to ``function_name``.

    Returns:
        list of chapter dicts when ``pageNum`` is given (recursive calls);
        otherwise None (results are passed to ``function_name``).

    Raises:
        whatever ``get_html`` raises after 3 failed attempts, or on any
        unparseable series page (logged via ``logger.exception``).
    """
    login_mangadex()
    if url.endswith('/'):
        url = re.sub('/+$', '', url)
    if pageNum is not None:
        url += '/chapters/{page}/'.format(page=pageNum)
    # Fetch the page, retrying up to 3 times before giving up.
    for i in range(3):
        try:
            html = get_html(url)
            break
        except:
            if i == 2:
                raise
    global last
    global session
    # Pull series metadata out of the page; log the URL on any parse failure
    # so broken layouts can be diagnosed.
    try:
        series = title(re.sub('<[^>]+>', '', re.search('<h3 class="panel-title">(.*)</h3>', html).group(1)).strip())
        status = re.search('<th.*?>Pub. status:</th>\\s*<td>\\s*(.*?)\\s*</td>', html.replace('\n', '')).group(1)
        author = ', '.join(re.findall('<a.*?>(.*?)</a>', re.search('<th.*?>\\s*Authors?\\s*:?\\s*</th>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1)))
        tags = re.findall(r'<span.*?>\s*<a.*?>\s*([A-Za-z]*?)\s*</a>\s*</span>', re.search(r'<th.*?>\s*Genres?\s*:?\s*</th>\s*<td>(.*?)</td>', html.replace('\n', '')).group(1))
    except:
        logger.exception('url: %s', url)
        raise
    # Normalise tag names through the module-level tag_dict mapping.
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    # "Jump to last page" link tells us how many chapter-list pages exist.
    max_page = re.search(
        'href=["\'][^"\']+?\\D(\\d+)/?[\'"][^<>]*?>\\s*<[^<>]*?Jump to last page',
        html
    )
    if pageNum is None and max_page:
        # Multi-page chapter list: recurse per page, last page first so the
        # oldest chapters are appended first.
        max_page = int(max_page.group(1))
        for page in range(max_page, 0, -1):
            chapters.extend(mangadex(url, download_chapters, page))
    else:
        found_chaps = re.findall(
            r'<td>\s*(<a[^>]+href=./chapter/.*?)</tr>',
            html, re.DOTALL|re.MULTILINE
        )
        # Iterate oldest-first; keep only rows for the configured language.
        for j in found_chaps[::-1]:
            if lang in j:
                try:
                    match = re.search(
                        r'<a[^>]+href=\"([^\"]*?)\".*?>\s*(.*?)\s*</a>',
                        j, re.DOTALL|re.MULTILINE
                    )
                    # Parse "Ch. 12.5", "Chapter 3", "Extra", etc. out of the title.
                    m2 = re.search(
                        r'([Cc]h(ap)?(ter)?\.?|([Ee]xtra|[Ss]pecial)s?:?)' + \
                        r'\s*[\.:-]?\s*([\d\.,]+)?\s*(-\s*[\d\.]+)?',
                        match.group(2)
                    )
                    name = match.group(2).replace(
                        m2.group(0) if m2 else match.group(2), ''
                    )
                    logger.debug('found chapter: %s', match.group(2))
                    if not m2 or m2.group(4):
                        # Extras/specials carry no usable number.
                        num = 0
                    else:
                        # "12,5" -> "12.5"; collapse all but the last dot so
                        # "1.2.5" parses as 12.5-style floats.
                        num = m2.group(5).replace(',', '.')
                        num = float(''.join(num.rsplit('.', num.count('.')-1)))
                except:
                    logger.debug(j)
                    raise
                # TODO: give extras a synthetic number after the previous chapter:
                # if m2.group(3):
                #     if chapters:
                #         num = chapters[-1]['num'] + .4
                #     else:
                #         num = last + .4
                # Strip a "Vol. N" prefix and boilerplate from the display name.
                try:
                    vol = re.search(r'[Vv]ol(ume)?\.?\s*(\d+)', match.group(2))
                    name = name.replace(vol.group(0), '').strip()
                    # BUGFIX: re.I was previously passed positionally as the
                    # *count* argument of re.sub, so matching was never
                    # case-insensitive; it must be passed as flags.
                    name = re.sub(r'^\s*-? ?(Read On[ -]?line)?\s*', '', name, flags=re.I)
                    vol = int(vol.group(2))
                except:
                    vol = 0
                link = 'https://mangadex.com/{}/'.format(match.group(1))
                date = re.search(
                    'datetime=\"(.*?)( [A-Z]{3})?\"', j
                ).group(1).replace(' ', 'T')
                strNum = '{:3.2f}'.format(num).zfill(5)
                if name:
                    name = '{} - {} : {}'.format(series, strNum, name)
                else:
                    name = '{} - {}'.format(series, strNum)
                # Gather page links only for requested / new chapters.
                if (download_chapters and num in download_chapters) \
                        or (not download_chapters and num > last):
                    logger.info(' Gathering info: \"{}\"'.format(name))
                    img_url = ''
                    # Fetch page 1 of the chapter (retry up to 3 times) to
                    # discover the image URL template.
                    for ntry in range(3):
                        try:
                            chap_html = get_html(link + '1')
                            img_url = re.search(
                                '<img[^<]*?id=\"current_page\".*?src=\"([^\"]*?)\"',
                                chap_html, re.DOTALL|re.MULTILINE
                            ).group(1)
                            break
                        except:
                            if ntry == 2:
                                raise
                    logger.debug('original url: %s', img_url)
                    # Turn the concrete first-page filename into a format
                    # template; ``zero`` records whether pages count from 0.
                    img_url = re.sub('(/?)0*[01]\\.([A-Za-z]{3})$', r'\1{}.\2', img_url)
                    if 'http' not in img_url:
                        img_url = 'https://mangadex.com/' + img_url
                    zero = False
                    if '{' not in img_url:
                        img_url = re.sub(r'(/?)0\.([a-zA-Z]{3})', r'\1{}.\2', img_url)
                        zero = True
                    if '{' not in img_url:
                        img_url = re.sub(r'(/?)01\.([a-zA-Z]{3})', r'\1{:02}.\2', img_url)
                        zero = False
                    if '{' not in img_url:
                        img_url = re.sub('0*1\\.([A-Za-z]{3})', r'{:02}.\1', img_url)
                        zero = False
                    if '{' not in img_url:
                        img_url = re.sub('0*0\\.([A-Za-z]{3})', r'{:02}.\1', img_url)
                        zero = True
                    logger.debug('general url: %s', img_url)
                    found_pages = re.findall(
                        r'<option[^>]+value=[\"\'](.*?)[\'\"].*?>Page (\d+)</option>',
                        chap_html
                    )
                    if found_pages:
                        pages = max([int(i) for _, i in found_pages])
                    else:
                        # No page selector found -- skip this chapter.
                        continue
                    # Per-page reader URLs as backups, sorted by page number.
                    # NOTE(review): ``link`` already starts with
                    # https://mangadex.com/, so prefixing again below may
                    # produce doubled hosts -- confirm against live data.
                    b_links = {int(i[1]): link + i[0] for i in found_pages}
                    b_links = sorted(b_links.items(), key=lambda t: t[0])
                    b_links = ['https://mangadex.com/' + i for _, i in b_links]
                    if zero:
                        links = [img_url.format(i) for i in range(pages)]
                    else:
                        links = [img_url.format(i + 1) for i in range(pages)]
                    chapters.append({
                        'name': name, 'links': links, 'backup_links': b_links,
                        'date': date, 'pages': pages, 'num': num,
                    })
    if pageNum is not None:
        # Recursive (per-page) call: hand results back to the top-level call.
        return chapters
    elif chapters:
        function_name(chapters, series, tags, author, status)