def mangapanda(url, download_chapters):
    """Scrape a mangapanda.com series page for metadata and chapter info.

    url -- series page URL.
    download_chapters -- chapter numbers to fetch; when falsy, every
        chapter numbered above the module-global `last` is taken.

    Side effects: fetches pages via get_html() and passes the collected
    chapter list to function_name().
    """
    html = get_html(url)
    global last
    # Series title: <h1> text minus the trailing ' Manga' suffix.
    series = title(re.search('<h1.*?>\\s*(.*?)\\s*</h1>', html, re.DOTALL|re.MULTILINE).group(1)).rpartition(' Manga')[0]
    status = re.search('<td.*?>Status:</td>\\s*<td>\\s*(.*?)\\s*</td>', html.replace('\n', '')).group(1)
    author = re.search('<td.*?>\\s*Authors?\\s*:?\\s*</td>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1)
    tags = re.findall('<a.*?>\\s*<span.*?>\\s*([A-Za-z]*?)\\s*</span>\\s*</a>', re.search('<td.*?>\\s*Genres?\\s*:?\\s*</td>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1))
    # Normalize genre names through the module-level tag_dict mapping.
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    for j in re.findall('<tr>\\s*<td>\\s*<div.*?</div>(.*?)</tr>', html, re.DOTALL|re.MULTILINE):
        match = re.search('<a.*?([\\d.,-]+)</a>(\\s*:\\s*)(.*?)\\s*</td>', j)
        # Keep only the first '.' so numbers like '10.5.1' still parse as float.
        num = float(''.join(match.group(1).rsplit('.', match.group(1).count('.')-1)))
        name = match.group(3)
        link = 'http://www.mangapanda.com' + re.search('<a\\s*href=\"(/.*?)\">', j).group(1)
        # Site shows MM/DD/YYYY; store as ISO YYYY-MM-DD.
        date = re.search('<td>(\\d{2})/(\\d{2})/(\\d{4})</td>', j)
        date = '{:04}-{:02}-{:02}'.format(int(date.group(3)), int(date.group(1)), int(date.group(2)))
        if name:
            name = '{} - {} : {}'.format(series, '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            logger.info(' Gathering info: \"{}\"'.format(name))
            chap_html = get_html(link)
            # Page list comes from the reader's page-select dropdown.
            links = ['http://www.mangareader.net' + i for i in re.findall('<option value=\"(.*?)\".*?>\\d+</option>', chap_html)]
            chapters.append({'name':name, 'links':links, 'backup_links':links, 'date':date, 'pages':len(links), 'num':num})
    if chapters:
        function_name(chapters, series, tags, author, status)
def mungeQA(html, type, fields, model, data, col):
    """Return *html* with every LaTeX fragment replaced by an image link.

    See the _imgLink docstring for the rules governing LaTeX media.

    Arguments:
    html   -- text to scan for LaTeX markup.
    type   -- unused; "q" or "a" for question/answer.
    fields -- unused; dictionary with Tags, Type (model name), Deck,
              Subdeck (part after last ::), card template name, etc.
    model  -- the note's model; governs header/footer and image format.
    data   -- unused; [cid, nid, mid, did, ord, tags, flds].
    col    -- current collection; supplies the media folder.
    """
    # Process the three markup families in the same order as before:
    # plain [latex] tags, then $expr$, then display math.
    for key, prefix, suffix in (
        ('standard', '', ''),
        ('expression', '$', '$'),
        ('math', '\\begin{displaymath}', '\\end{displaymath}'),
    ):
        for found in regexps[key].finditer(html):
            latex = prefix + found.group(1) + suffix
            html = html.replace(found.group(), _imgLink(col, latex, model))
    return html
def insert_line_numbers_in_html(html, line_no_from):
    """Insert a line-number gutter into the highlighted HTML code.

    Parameters
    ---------
    html : :class:`str`
        html string of highlighted code.
    line_no_from : :class:`int`
        Defines the first line's number in the line count.

    Returns
    -------
    :class:`str`
        The generated html string with line numbers, or *html* unchanged
        when it contains no ``<pre>`` block.
    """
    found = re.search("(<pre[^>]*>)(.*)(</pre>)", html, re.DOTALL)
    if found is None:
        return html
    opening, body, closing = found.group(1), found.group(2), found.group(3)
    # Close the numbered-table layout where the code block ends.
    html = html.replace(closing, "</pre></td></tr></table>")
    line_numbers = range(line_no_from, line_no_from + body.count("\n") + 1)
    # Right-align numbers to the width of the largest one.
    width = len(str(line_numbers[-1]))
    gutter = "\n".join(("%" + str(width) + "i") % n for n in line_numbers)
    return html.replace(
        opening,
        "<table><tr><td>" + opening + gutter + "</pre></td><td>" + opening)
def mungeQA(
    html: str,
    type: Optional[str],
    fields: Optional[Dict[str, str]],
    model: NoteType,
    data: Optional[List[Union[int, str]]],
    col,
) -> Any:
    """Replace embedded LaTeX tags in *html* with image links."""
    def swap(text, pattern_key, wrap):
        # Replace each occurrence of the given LaTeX family with its image link.
        for found in regexps[pattern_key].finditer(text):
            text = text.replace(
                found.group(), _imgLink(col, wrap(found.group(1)), model))
        return text

    html = swap(html, "standard", lambda latex: latex)
    html = swap(html, "expression", lambda latex: "$" + latex + "$")
    html = swap(
        html, "math",
        lambda latex: "\\begin{displaymath}" + latex + "\\end{displaymath}")
    return html
def trim(html, prefix_url=None):
    """Clean up raw HTML text.

    - Remove HTML comments, and javascript (except for 100.daum.net pages).
    - Normalize \r\n to \n.
    - Convert <br>/<p> tag variants to \n.
    - Collapse runs of blanks/tabs/newlines into a single blank.

    :param html: raw HTML string.
    :param prefix_url: base URL used to make relative links absolute.
    :return: cleaned, stripped text.
    """
    html = html.replace('\r\n', '\n')
    # Map line-breaking tags (both cases, self-closing variants) to newlines.
    convert_dic = {'<br>': '\n', '<br/>': '\n', '<br />': '\n',
                   '<p>': '\n', '<p/>': '\n', '<p />': '\n',
                   '<BR>': '\n', '<BR/>': '\n', '<BR />': '\n',
                   '<P>': '\n', '<P/>': '\n', '<P />': '\n'}
    for _from, _to in convert_dic.items():
        html = html.replace(_from, _to)
    html = HtmlUtil.remove_comments_in_html(html)  # remove html comments.
    doc = lxml.html.document_fromstring(html)  # convert to html element.
    if prefix_url:
        doc.make_links_absolute(prefix_url)  # convert links to absolute links.
    if prefix_url:
        if '100.daum.net' not in prefix_url:
            # Removing javascript hides parts of some pages (100.daum.net),
            # so skip the removal for that host.
            doc = HtmlUtil.remove_javascripts_in_doc(doc)  # remove javascript elements.
    else:
        doc = HtmlUtil.remove_javascripts_in_doc(doc)  # remove javascript elements.
    html = lxml.html.tostring(doc, encoding='utf8', include_meta_content_type=True)  # convert to html string.
    html = html.decode('utf8')  # bytes -> string
    html = StringUtil.merge(html)  # replace multiple blanks with one blank.
    return html.strip()
def mungeQA(
    html: str,
    type: str,
    fields: Dict[str, str],
    model: NoteType,
    data: QAData,
    col: anki.storage._Collection,
) -> str:
    """Turn embedded LaTeX tags in *html* into links to rendered images."""
    # Each family of markup wraps the captured LaTeX in a different template.
    for key, template in (
        ("standard", "{}"),
        ("expression", "${}$"),
        ("math", "\\begin{{displaymath}}{}\\end{{displaymath}}"),
    ):
        for hit in regexps[key].finditer(html):
            rendered = _imgLink(col, template.format(hit.group(1)), model)
            html = html.replace(hit.group(), rendered)
    return html
def get_page_from_html(cls, html: str) -> _Element:
    """Unwrap decorative span tags (firstLetter / gathaQuote / bold) from
    *html*, leaving just their inner text, then parse the result into an
    element tree (with ``<br>`` normalized to ``<br/>``)."""
    # (pattern, index where the inner text starts); every span ends with
    # the 7-character '</span>' suffix.
    unwrap_specs = (
        ('<span class="firstLetter">.<\/span>', 26),
        ('<span class="gathaQuote">.+?(?=>)>', 25),
        ('<span class="bold">.+?(?=>)>', 19),
    )
    for pattern, start in unwrap_specs:
        for span in re.findall(pattern, html):
            html = html.replace(span, span[start:-7], 1)
    return fromstring(html.replace("<br>", "<br/>"))
def mangapanda(url, download_chapters):
    """Scrape a mangapanda.com series page for metadata and chapter info.

    url -- series page URL.
    download_chapters -- chapter numbers to fetch; when falsy, every
        chapter numbered above the module-global `last` is taken.

    Side effects: fetches pages via get_html() and passes the collected
    chapter list to function_name().
    """
    html = get_html(url)
    global last
    series = title(
        re.search('<h1.*?>\\s*(.*?)\\s*</h1>', html,
                  re.DOTALL | re.MULTILINE).group(1)).rpartition(' Manga')[0]
    status = re.search('<td.*?>Status:</td>\\s*<td>\\s*(.*?)\\s*</td>',
                       html.replace('\n', '')).group(1)
    author = re.search('<td.*?>\\s*Authors?\\s*:?\\s*</td>\\s*<td>(.*?)</td>',
                       html.replace('\n', '')).group(1)
    tags = re.findall(
        '<a.*?>\\s*<span.*?>\\s*([A-Za-z]*?)\\s*</span>\\s*</a>',
        re.search('<td.*?>\\s*Genres?\\s*:?\\s*</td>\\s*<td>(.*?)</td>',
                  html.replace('\n', '')).group(1))
    # Normalize genre names through the module-level tag_dict mapping.
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    for j in re.findall('<tr>\\s*<td>\\s*<div.*?</div>(.*?)</tr>', html,
                        re.DOTALL | re.MULTILINE):
        match = re.search('<a.*?([\\d.,-]+)</a>(\\s*:\\s*)(.*?)\\s*</td>', j)
        # BUG FIX: float(match.group(1)) crashed on chapter numbers with
        # more than one dot (e.g. '10.5.1'); keep only the first '.' the
        # same way the other scrapers in this module do.
        raw_num = match.group(1)
        num = float(''.join(raw_num.rsplit('.', raw_num.count('.') - 1)))
        name = match.group(3)
        link = 'http://www.mangapanda.com' + re.search(
            '<a\\s*href=\"(/.*?)\">', j).group(1)
        # Site shows MM/DD/YYYY; store as ISO YYYY-MM-DD.
        date = re.search('<td>(\\d{2})/(\\d{2})/(\\d{4})</td>', j)
        date = '{:04}-{:02}-{:02}'.format(int(date.group(3)),
                                          int(date.group(1)),
                                          int(date.group(2)))
        if name:
            name = '{} - {} : {}'.format(series,
                                         '{:3.1f}'.format(num).zfill(5),
                                         name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
            logger.info(' Gathering info: \"{}\"'.format(name))
            chap_html = get_html(link)
            # Page list comes from the reader's page-select dropdown.
            links = [
                'http://www.mangareader.net' + i for i in re.findall(
                    '<option value=\"(.*?)\".*?>\\d+</option>', chap_html)
            ]
            chapters.append({
                'name': name,
                'links': links,
                'backup_links': links,
                'date': date,
                'pages': len(links),
                'num': num
            })
    if chapters:
        function_name(chapters, series, tags, author, status)
def on_actionPrint_triggered(self):
    """Print the log book preview on a user-selected printer.

    Extracts the HTML (whole content frame, or just the selection when
    the Qt version supports it), rebuilds a combined <head>, injects a
    title header, and sends the laid-out document to the printer.
    """
    # Let the user select the desired printer via the system printer list
    printer = QPrinter()
    dialog = QPrintDialog(printer)
    if not dialog.exec_():
        return
    mainFrame = self.preview.page().mainFrame()
    childFrames = mainFrame.childFrames()
    # Workaround for Qt versions < 4.8.0
    printWholeSite = True
    if hasattr(QWebView, 'selectedHtml'):
        if self.preview.hasSelection():
            printWholeSite = False
    # use whole frame if no content is selected or selecting html is not
    # supported
    if printWholeSite:
        # set 'content' frame active as printing an inactive web frame
        # doesn't work properly
        if len(childFrames) >= 2:
            childFrames[1].setFocus()
            # thanks to setFocus, we can print the frame with evaluated
            # javascript
            html = childFrames[1].toHtml()
        # NOTE(review): with fewer than 2 child frames `html` is never
        # assigned and the concatenation below raises NameError — confirm
        # whether that layout can occur.
    else:
        html = self.preview.selectedHtml()
    # construct head
    head = '<head>'
    # extract head from child frames
    for frame in childFrames:
        headEl = frame.findFirstElement('head')
        head += headEl.toInnerXml()
    head += '</head>'
    # concat new head and selection
    # the result may be invalid html; needs improvements!
    html = head + html
    # prepend a header to the log book
    # BUG FIX: str.replace returns a new string; the original discarded
    # the result, so the header was never inserted.
    html = html.replace('</head>', '</head><h1>NICOS Log book</h1>')
    # let qt layout the content
    doc = QTextDocument()
    doc.setHtml(html)
    doc.print_(printer)
def mungeQA(html, type, fields, model, data, col):
    """Convert embedded LaTeX tags in *html* into image links."""
    for hit in regexps['standard'].finditer(html):
        html = html.replace(hit.group(), _imgLink(col, hit.group(1), model))
    for hit in regexps['expression'].finditer(html):
        wrapped = "$" + hit.group(1) + "$"
        html = html.replace(hit.group(), _imgLink(col, wrapped, model))
    for hit in regexps['math'].finditer(html):
        wrapped = "\\begin{displaymath}" + hit.group(1) + "\\end{displaymath}"
        html = html.replace(hit.group(), _imgLink(col, wrapped, model))
    return html
def insert_line_numbers(html):
    """Wrap the first <pre> block of *html* in a two-column table whose
    left column carries line numbers (right-aligned to the widest one).

    Returns *html* unchanged when it has no <pre> block.
    """
    match = re.search('(<pre[^>]*>)(.*)(</pre>)', html, re.DOTALL)
    if not match:
        return html
    pre_open = match.group(1)
    pre = match.group(2)
    pre_close = match.group(3)
    html = html.replace(pre_close, '</pre></td></tr></table>')
    # BUG FIX: pre.count('\n') + 1 is the number of displayed lines; the
    # original range(1, count + 1) dropped the last line's number and
    # raised IndexError (numbers[-1]) on single-line content.
    numbers = range(1, pre.count('\n') + 2)
    format = '%' + str(len(str(numbers[-1]))) + 'i'
    lines = '\n'.join(format % i for i in numbers)
    html = html.replace(pre_open, '<table><tr><td>' + pre_open + lines +
                        '</pre></td><td>' + pre_open)
    return html
def strip_tags(html):
    """Remove special characters, links and BBCode from *html*, parse out
    its text content, and return the list of words found in it."""
    for garbage in spec_characters.findall(html):
        html = html.replace(garbage, "")
    for matched in link_re.findall(html):
        html = html.replace(matched[0], "")
    for matched in bbcode_re.findall(html):
        html = html.replace(matched[0], "")
    plain = BeautifulSoup(html, "lxml").getText()
    return words_re.findall(plain)
def quest_tooltip(req):
    """Django view: render the floating wiki tooltip for a quest.

    GET params: id (quest id), nocache ("True" — read but unused here),
    type ("web" renders HTML; "img"/"image" is currently disabled).
    """
    quest_id = req.GET.get("id", 0)
    nocache = req.GET.get("nocache", "False") == "True"  # read but not used below
    res_type = req.GET.get("type", "web")
    print("quest_id:{}".format(quest_id))
    try:
        if quest_id:
            if res_type == "web":
                # Ask the wiki API to expand the tooltip template for this quest.
                r = requests.get(
                    "https://cdn.huijiwiki.com/ff14/api.php?format=json&action=parse&disablelimitreport=true&prop=text&title=%E9%A6%96%E9%A1%B5&smaxage=86400&maxage=86400&text=%7B%7B%E4%BB%BB%E5%8A%A1%2F%E6%B5%AE%E5%8A%A8%E6%91%98%E8%A6%81%7C{}%7D%7D"
                    .format(quest_id))
                r_json = r.json()
                print(r_json)
                html = r_json["parse"]["text"]["*"]
                # Tag the first tooltip item so it can be located by id.
                html = html.replace("class=\"tooltip-item\"",
                                    "class=\"tooltip-item\" id=\"tooltip\"", 1)
                # Absolutize wiki-relative links.
                html = html.replace("href=\"/", "href=\"https://ff14.huijiwiki.com/")
                return ren2res("quest_tooltip.html", req, {"parsed_html": html})
            elif res_type == "img" or res_type == "image":
                return HttpResponse("TODO", status=500)
                # NOTE(review): everything below is unreachable — the early
                # return above deliberately disables the screenshot path.
                from selenium import webdriver
                options = webdriver.ChromeOptions()
                options.add_argument('--kiosk')
                options.add_argument('--headless')
                options.add_argument('--no-sandbox')
                options.add_argument('--disable-gpu')
                driver = webdriver.Chrome(chrome_options=options)
                driver.get("https://xn--v9x.net/quest/tooltip/?id={}".format(
                    quest_id))
                tooltip = driver.find_element_by_id("tooltip")
                valid_image = "tooltip.png"
                if tooltip.screenshot(valid_image):
                    try:
                        with open(valid_image, "rb") as f:
                            return HttpResponse(f.read(), content_type="image/png")
                    except IOError:
                        # Fall back to a 1x1 PNG when the file can't be read.
                        red = Image.new('RGBA', (1, 1), (255, 0, 0, 0))
                        response = HttpResponse(content_type="image/png")
                        red.save(response, "PNG")
                        return response
                else:
                    return HttpResponse("Image save failed", status=500)
    except KeyError:
        return HttpResponse("KeyError", status=500)
    return HttpResponse(status=500)
def contentreplace(text, out=True):
    """Turn raw text into cleaned output.

    Strips <script>/<style>/<a> blocks, applies the module's base and
    per-line replacements, then either wraps each non-empty line in <p>
    tags (out=True, with emoji markers injected around one line roughly
    30% of the way in) or joins the lines with '\\n\\r' (out=False).
    Non-string or blank input is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    elif not text.strip():
        return text
    text = __basereplace(text)
    re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # Script
    re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)  # style
    re_a = re.compile('<\s*a[^>]*>[^<]*<\s*/\s*a\s*>', re.I)  # a
    text = re_script.sub('', text)
    text = re_style.sub('', text)
    text = re_a.sub('', text)
    text = text.split('\n')
    html = ''
    i = 0
    # Marker line index: ~30% into the text, at least line 3.
    l = int(len(text) * 0.3)
    l = l if l > 3 else 3
    for row in text:
        i += 1
        row = row.strip()
        row = descriptionreplace(row)
        if row:
            if out:
                if l == i:
                    # This line gets the placeholder markers that are swapped
                    # for random emoji below.
                    html += f"<p>$$$$$$1{row}$$$$$$2</p>"
                else:
                    html += f"<p>{row}</p>"
            else:
                html += f"{row}\n\r"
    if out:
        # Drop any raw :emoji_name: codes before injecting our own.
        html = re.sub(r':[a-zA-Z0-9_]+?:', '', html)
        html = customfilterstr(html)
        # emoji handling
        e = [
            ':thumbs_up:', ':ghost:', ':fire:', ':monkey:', ':dog:',
            ':poodle:', ':mouse:', ':rat:', ':rabbit:', ':red_apple:'
        ]
        html = \
            html.replace('$$$$$$1', str(random.choice(e))).replace('$$$$$$2', str(random.choice(e)))
        html = emoji.emojize(html)
        html = html.replace('口', '𥁐')
    return html
def get_preview(html, characters):
    """Return up to *characters* characters of the text content of *html*
    (special characters, links and BBCode stripped); longer text is
    truncated and suffixed with '...'."""
    for special in spec_characters.findall(html):
        html = html.replace(special, "")
    for found in link_re.findall(html):
        html = html.replace(found[0], "")
    for found in bbcode_re.findall(html):
        html = html.replace(found[0], "")
    text = BeautifulSoup(html, "lxml").getText()
    return text[:characters] + "..." if len(text) > characters else text
def getVideoStreamURL(url):
    """Extract the stream URL (and poster image, when present) from an
    LRT player page.

    Returns a dict with 'url' (and optionally 'image'); an empty dict
    when no sources block is found.
    """
    result = {}
    html = getURL(url)
    # Force the fixed-size image variant instead of the device-dependent one.
    html = html.replace('"+(md.device == \'mobile\'?\'640/360\':\'870/490\')',
                        '/500/280/size16x9"')
    source = re.findall('sources: [\[\s]*\{([^\}]*)\}', html, re.DOTALL)
    if not source:
        return result
    # Drop javascript line comments inside the sources block.
    block = re.sub(re.compile('\n[\s]*(\/\/[^\n]*)', re.DOTALL), '', source[0])
    # Substitute the page's own location.hash expression with ours.
    fragment = url.partition('#')[2]
    block = re.sub(re.compile('("[\+\s]*location.hash.substring\(1\))', re.DOTALL),
                   fragment + '"', block)
    block = block.replace('"file"', 'file')
    mfile = re.findall('file[:\s]*"(.*?)"', block, re.DOTALL)
    result['url'] = mfile[0].replace('\/', '/')
    image = re.findall('image: "(.*?)"', html, re.DOTALL)
    if image:
        result['image'] = LRT_URL + image[0]
    return result
def getVideoStreamURL(url):
    """Parse an LRT player page and return {'url': ..., 'image': ...};
    'image' is only present when the page declares one, and an empty
    dict is returned when no sources block exists."""
    html = getURL(url)
    # Pin the image size instead of the mobile/desktop conditional.
    html = html.replace('"+(md.device == \'mobile\'?\'640/360\':\'870/490\')',
                        '/500/280/size16x9"')
    sources = re.findall('sources: [\[\s]*\{([^\}]*)\}', html, re.DOTALL)
    if not sources:
        return {}
    # Strip '//' line comments from the block.
    cleaned = re.sub(re.compile('\n[\s]*(\/\/[^\n]*)', re.DOTALL), '', sources[0])
    parts = url.split('#', 1)
    url_hash = parts[1] if len(parts) == 2 else ''
    cleaned = re.sub(
        re.compile('("[\+\s]*location.hash.substring\(1\))', re.DOTALL),
        url_hash + '"', cleaned)
    cleaned = cleaned.replace('"file"', 'file')
    files = re.findall('file[:\s]*"(.*?)"', cleaned, re.DOTALL)
    result = {}
    result['url'] = files[0].replace('\/', '/')
    image = re.findall('image: "(.*?)"', html, re.DOTALL)
    if image:
        result['image'] = LRT_URL + image[0]
    return result
def quest_tooltip(req):
    """Django view: serve the (cached) wiki tooltip HTML for a quest.

    GET params: id (quest id), nocache ("True" forces a re-parse),
    type ("web" renders HTML; "img"/"image" is disabled).
    """
    quest_id = req.GET.get("id", 0)
    nocache = req.GET.get("nocache", "False") == "True"
    res_type = req.GET.get("type", "web")
    print("quest_id:{}".format(quest_id))
    try:
        if quest_id:
            try:
                quest = PlotQuest.objects.get(id=quest_id)
            except PlotQuest.DoesNotExist:
                return HttpResponse("No such quest", status=500)
            else:
                if res_type == "web":
                    if quest.tooltip_html == "" or nocache:
                        # Expand the wiki tooltip template for this quest.
                        r = requests.get(
                            "https://ff14.huijiwiki.com/ff14/api.php?format=json&action=parse&disablelimitreport=true&prop=text&title=%E9%A6%96%E9%A1%B5&smaxage=86400&maxage=86400&text=%7B%7B%E4%BB%BB%E5%8A%A1%2F%E6%B5%AE%E5%8A%A8%E6%91%98%E8%A6%81%7C{}%7D%7D"
                            .format(quest_id))
                        r_json = r.json()
                        # print(r_json)
                        html = r_json["parse"]["text"]["*"]
                        # Tag the first tooltip item so it can be located by id.
                        html = html.replace(
                            "class=\"tooltip-item\"",
                            "class=\"tooltip-item\" id=\"tooltip\"", 1)
                        # Absolutize wiki-relative links.
                        html = html.replace(
                            "href=\"/", "href=\"https://ff14.huijiwiki.com/")
                        # Link the quest title to its wiki page.
                        soup = BeautifulSoup(html, 'html.parser')
                        quest_name = soup.p.span.string
                        a = soup.new_tag(
                            'a',
                            href=
                            'https://ff14.huijiwiki.com/wiki/%E4%BB%BB%E5%8A%A1:{}'
                            .format(urllib.parse.quote(quest_name)))
                        a.string = quest_name
                        soup.p.span.string = ""
                        soup.p.span.append(a)
                        html = str(soup)
                        # Cache the rendered tooltip on the quest row.
                        quest.tooltip_html = html
                        quest.save(update_fields=["tooltip_html"])
                    else:
                        html = quest.tooltip_html
                    return ren2res("quest_tooltip.html", req,
                                   {"parsed_html": html})
                elif res_type == "img" or res_type == "image":
                    return HttpResponse("TODO", status=500)
    except KeyError:
        return HttpResponse("KeyError", status=500)
    return HttpResponse(status=500)
def insert_line_numbers(html):
    """Wrap the first <pre> block of *html* in a two-column table whose
    left column carries line numbers (right-aligned to the widest one).

    Returns *html* unchanged when it has no <pre> block.
    """
    match = re.search("(<pre[^>]*>)(.*)(</pre>)", html, re.DOTALL)
    if not match:
        return html
    pre_open = match.group(1)
    pre = match.group(2)
    pre_close = match.group(3)
    html = html.replace(pre_close, "</pre></td></tr></table>")
    # BUG FIX: pre.count("\n") + 1 is the number of displayed lines; the
    # original range(1, count + 1) dropped the last line's number and
    # raised IndexError (numbers[-1]) on single-line content.
    numbers = range(1, pre.count("\n") + 2)
    format = "%" + str(len(str(numbers[-1]))) + "i"
    lines = "\n".join(format % i for i in numbers)
    html = html.replace(
        pre_open,
        "<table><tr><td>" + pre_open + lines + "</pre></td><td>" + pre_open)
    return html
def first_synonym(self, html):
    """Extract the first marked synonym from *html*.

    Returns a (Word, html) pair where the returned html no longer
    carries the first pair of synonym delimiter tags.
    """
    raw = Utils.text_between(html, *TAG_SYNONYMS_DELIMITER, force_html=True)
    cleaned = Utils.remove_spaces(raw)
    # Strip only the first occurrence of each delimiter tag.
    remaining = html.replace(TAG_SYNONYMS_DELIMITER[0], "", 1)
    remaining = remaining.replace(TAG_SYNONYMS_DELIMITER[1], "", 1)
    return Word(cleaned), remaining
def replaceBadHtmlWithGood(html):
    """Normalize editor-produced HTML: drop dir="ltr" attributes and blank
    paragraphs, then run the soup through the standard cleanup passes."""
    html = remove_blank_paras(html.replace('dir="ltr"', ""))
    soup = BeautifulSoup(html, "html.parser")
    for transform in (replaceImgHeightWidthWithClass,
                      replacePImgWithFigureImg,
                      fixEditorSummary,
                      removeGoogleDocsSpans,
                      processYouTubeDivs,
                      processSoundCloudDivs):
        soup = transform(soup)
    return str(soup)
def replaceBadHtmlWithGood(html):
    """Normalize editor-produced HTML: drop dir="ltr" attributes and
    unnecessary whitespace, then run the soup through the cleanup passes
    and emit a size warning for oversized images."""
    html = remove_unnecessary_white_space(html.replace('dir="ltr"', ""))
    soup = BeautifulSoup(html, "html.parser")
    # replacePImgWithFigureImg is intentionally not applied: while nice
    # to make images into figures, it is a real struggle for users of
    # CKEditor.
    for transform in (replaceImgHeightWidthWithClass,
                      fixEditorSummary,
                      removeGoogleDocsSpans,
                      processDashes,
                      processYouTubeDivs,
                      processSoundCloudDivs,
                      linkImages):
        soup = transform(soup)
    warnImageTooBig(soup)
    return str(soup)
def mangahere(url, download_chapters):
    """Scrape a mangahere series page for metadata and per-chapter page
    links, then hand the results to function_name().

    Fixes vs. the previous revision:
    - the series title lives in group(2) of its regex — group(1) is the
      optional 'h1 class="' prefix, so the old code never captured the
      actual name;
    - the date fallback called datetime.datetime.today(), which raises
      AttributeError when `datetime` is the class (as the other scrapers
      in this module use it); it now calls datetime.today().
    """
    html = get_html(url)
    global last
    series = title(re.search(
        '<(h1 class=")?title"?><span class="title_icon"></span>(.*?)</(h1|title)>',
        html.replace('\n', '')).group(2))
    status = re.search('<li><label>Status:</label>(.*?)<',
                       html.replace('\n', '')).group(1)
    author = ', '.join(re.findall(
        '<a.*?>(.*?)</a>',
        re.search('<li><label>Author\\(?s?\\)?:</label>(.*?)</li>',
                  html.replace('\n', '')).group(1)))
    tags = re.search('<li><label>Genre\\(s\\):</label>(.*?)</li>',
                     html).group(1).split(', ')
    # Normalize genre names through the module-level tag_dict mapping.
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    # Rows are newest-first on the page; iterate oldest-first.
    for j in re.findall('<li>\\s*<span class=\"left\">\\s*(.*?\\d{4}</span>)\\s*</li>',
                        html, re.DOTALL | re.MULTILINE)[::-1]:
        match = re.search(
            '<a.*?>.*?([\\d,.]+)\\s*</a>\\s*<span.*?>\\s*(.*?)\\s*</span>',
            j, re.DOTALL | re.MULTILINE)
        name = match.group(2)
        # Keep only the first '.' so numbers like '10.5.1' still parse.
        raw_num = match.group(1)
        num = float(''.join(raw_num.rsplit('.', raw_num.count('.') - 1)))
        link = re.search('href=\"(.*?)\"', j).group(1)
        try:
            date = datetime.strptime(
                re.search('([A-Za-z]*? \\d{1,2}, \\d{4})</span>', j).group(1),
                '%b %d, %Y').strftime('%Y-%m-%d')
        except:
            # Fall back to today when the row carries no parseable date.
            date = datetime.today().strftime('%Y-%m-%d')
        if name:
            name = '{} - {} : {}'.format(series,
                                         '{:3.1f}'.format(num).zfill(5), name)
        else:
            name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
        if (download_chapters and num in download_chapters) or \
                (not download_chapters and num > last):
            logger.info(' Gathering info: \"{}\"'.format(name))
            chap_html = get_html(link)
            # Turn the first page's image URL into a numbered template.
            img_url = re.sub(
                '001.([A-Za-z]{3})', '{:03}.\\1',
                re.search('<a.*?>\\s*<img[^<]*?src=\"(.*?)\".*?>\\s*</a>',
                          chap_html, re.DOTALL | re.MULTILINE).group(1))
            if '{:03}' not in img_url and '{}' not in img_url:
                img_url = re.sub('01.([A-Za-z]{3})', '{:02}.\\1', img_url)
            pages = max([int(i) for i in re.findall(
                '<option value=\".*?\".*?>(\\d+)</option>', chap_html)])
            b_links = {float(i[1]): i[0] for i in re.findall(
                '<option value=\"(.*?)\".*?>(\\d+)</option>', chap_html)}
            b_links = [b_links[i + 1] for i in range(pages)]
            links = [img_url.format(i + 1) for i in range(pages)]
            chapters.append({'name': name, 'links': links,
                             'backup_links': b_links, 'date': date,
                             'pages': pages, 'num': num})
    if chapters:
        function_name(chapters, series, tags, author, status)
def replace_entities(html):
    """Replace a fixed set of characters in *html* with unified plain
    equivalents.

    NOTE(review): the left-hand values below look like they were HTML
    entities (&amp;, &nbsp;, &quot;, &ndash;, ...) that got decoded by
    some processing pass — as written, several pairs are identity
    replacements. Verify against the original source.
    """
    unifiable = [
        ('&', '&'),
        (' ', ' '),
        (''', "'"),
        ('"', "'"),
        ('–', '-'),
        ('—', u'–'),
        ('→', u'→'),
        ('←', u'←'),
        ('⇆', u'↔'),
        ('“', '"'),
        ('”', '"'),
    ]
    for (entity, char) in unifiable:
        html = html.replace(entity, char)
    return html
def patch_html(html):
    """Patch anchor elements to specify the target attribute.

    The links created by the tagstatlink option fail to open when viewed
    inside a frame (and shouldn't open within the frame of a metaci test
    result page anyway), so `target="_top"` is added to the generated
    anchor template. Patching HTML is fraught with peril, but the robot
    log-generation code is stable enough that this targeted replacement
    is safe and gives users a much better experience.
    """
    anchor = r'<span>[<a href="{{html $value.url}}" title="{{html $value.url}}">'
    anchor_with_target = r'<span>[<a href="{{html $value.url}}" title="{{html $value.url}}" target="_top">'
    return html.replace(anchor, anchor_with_target)
def download_table(html):
    """Expand the '[#download table#]' placeholder in *html* into a table
    of image-download links built from the config; *html* is returned
    unchanged when the placeholder is absent."""
    placeholder = "[#download table#]"
    if placeholder not in html:
        return html
    base = config.download.imgs_url
    release = config.download.latest_release
    parts = ["<table class='table-specs'>\n"]
    for category, category_cfg in config.download.table.items():
        parts.append(f"<tr><td colspan='4'><b>{category}</b></td></tr>\n")
        for device, device_cfg in category_cfg.items():
            href = f"{base}/{release}/{device}/"
            parts.append("<tr><td style='padding: 0px 10px'>\n"
                         f"\t<a href='{href}'>{device_cfg['name']}</a>\n"
                         "</td></tr>\n")
    parts.append("</table>\n")
    return html.replace(placeholder, "".join(parts))
def insertMetaData(html):
    """For every key in META, replace the '<!--{key}-->' placeholder in
    *html* with checkbox <li> entries — one per file format — each tagged
    (via data-software) with the slugified names of the software that
    supports the format.

    Changes vs. the previous revision: the local `list` (which shadowed
    the builtin) is renamed, and the commented-out alternate markup was
    removed.
    """
    for key in META:
        entries = []
        for fmt, software in sorted(META[key][0].items()):
            # Slugs: lower-case, spaces replaced by dashes.
            slugs = [s.replace(" ", "-").lower() for s in software]
            entries.append(
                "<li class=\"cat-item cat-item-3\"><input name=\"filterCheckbox\" "
                "type=\"checkbox\" onclick=\"handleClick();\" data-software=\""
                + " ".join(slugs) + "\">" + fmt + "</input></li>")
        html = html.replace("<!--{" + key + "}-->", " ".join(entries))
    return html
def _quote_html(html): return html.replace("&", "&").replace("<", "<").replace(">", ">")
def autoBrText(self, html):
    """Convert newlines in *html* into HTML <br /> line breaks."""
    return '<br />'.join(html.split('\n'))
def _quote_html(html): return html.replace('&', '&').replace('<', '<').replace('>', '>')
def quest_tooltip(req):
    """Django view: serve the (cached) wiki tooltip HTML for a quest;
    the screenshot branch is currently disabled.

    GET params: id (quest id), nocache ("True" forces a re-parse),
    type ("web" renders HTML; "img"/"image" returns TODO).
    """
    quest_id = req.GET.get("id", 0)
    nocache = req.GET.get("nocache", "False") == "True"
    res_type = req.GET.get("type", "web")
    print("quest_id:{}".format(quest_id))
    try:
        if quest_id:
            try:
                quest = PlotQuest.objects.get(id=quest_id)
            except PlotQuest.DoesNotExist:
                return HttpResponse("No such quest", status=500)
            else:
                if res_type == "web":
                    if quest.tooltip_html == "" or nocache:
                        # Expand the wiki tooltip template for this quest.
                        r = requests.get(
                            "https://ff14.huijiwiki.com/ff14/api.php?format=json&action=parse&disablelimitreport=true&prop=text&title=%E9%A6%96%E9%A1%B5&smaxage=86400&maxage=86400&text=%7B%7B%E4%BB%BB%E5%8A%A1%2F%E6%B5%AE%E5%8A%A8%E6%91%98%E8%A6%81%7C{}%7D%7D"
                            .format(quest_id))
                        r_json = r.json()
                        # print(r_json)
                        html = r_json["parse"]["text"]["*"]
                        # Tag the first tooltip item so it can be located by id.
                        html = html.replace(
                            "class=\"tooltip-item\"",
                            "class=\"tooltip-item\" id=\"tooltip\"", 1)
                        # Absolutize wiki-relative links.
                        html = html.replace(
                            "href=\"/", "href=\"https://ff14.huijiwiki.com/")
                        # Link the quest title to its wiki page.
                        soup = BeautifulSoup(html, 'html.parser')
                        quest_name = soup.p.span.string
                        a = soup.new_tag(
                            'a',
                            href=
                            'https://ff14.huijiwiki.com/wiki/%E4%BB%BB%E5%8A%A1:{}'
                            .format(urllib.parse.quote(quest_name)))
                        a.string = quest_name
                        soup.p.span.string = ""
                        soup.p.span.append(a)
                        html = str(soup)
                        # Cache the rendered tooltip on the quest row.
                        quest.tooltip_html = html
                        quest.save(update_fields=["tooltip_html"])
                    else:
                        html = quest.tooltip_html
                    return ren2res("quest_tooltip.html", req,
                                   {"parsed_html": html})
                elif res_type == "img" or res_type == "image":
                    return HttpResponse("TODO", status=500)
                    # NOTE(review): the screenshot path below is
                    # unreachable — the early return above deliberately
                    # disables it.
                    from selenium import webdriver
                    options = webdriver.ChromeOptions()
                    options.add_argument('--kiosk')
                    options.add_argument('--headless')
                    options.add_argument('--no-sandbox')
                    options.add_argument('--disable-gpu')
                    driver = webdriver.Chrome(chrome_options=options)
                    driver.get(
                        "https://xn--v9x.net/quest/tooltip/?id={}".format(
                            quest_id))
                    tooltip = driver.find_element_by_id("tooltip")
                    valid_image = "tooltip.png"
                    if tooltip.screenshot(valid_image):
                        try:
                            with open(valid_image, "rb") as f:
                                return HttpResponse(f.read(),
                                                    content_type="image/png")
                        except IOError:
                            # Fall back to a 1x1 PNG when the file can't be read.
                            red = Image.new('RGBA', (1, 1), (255, 0, 0, 0))
                            response = HttpResponse(content_type="image/png")
                            red.save(response, "PNG")
                            return response
                    else:
                        return HttpResponse("Image save failed", status=500)
    except KeyError:
        return HttpResponse("KeyError", status=500)
    return HttpResponse(status=500)
def goodmanga(url, download_chapters):
    """Scrape a goodmanga series (following 'Next' pagination) for
    metadata and per-chapter page links, then hand the results to
    function_name().

    Fix vs. the previous revision: the date fallback called
    datetime.datetime.today(), which raises AttributeError when
    `datetime` is the class (as the other scrapers in this module use
    it); it now calls datetime.today().
    """
    html = get_html(url)
    global last
    series = title(re.search('<h1>([^<>]*?)</h1>',
                             html.replace('\n', '')).group(1))
    status = re.search('<span>Status:</span>\\s*(.*?)\\s*</div>',
                       html.replace('\n', '')).group(1)
    author = re.search('<span>Authors?:</span>\\s*(.*?)\\s*</div>',
                       html.replace('\n', '')).group(1)
    tags = re.findall('<a.*?>(.*?)</a>',
                      re.search('<span>Genres:</span>(.*?)\\s*</div>', html,
                                re.DOTALL | re.MULTILINE).group(1))
    # Normalize genre names through the module-level tag_dict mapping.
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    while True:
        for j in re.findall('<li>\\s*(.{1,300}?\\d{4}</span>)\\s*</li>', html,
                            re.DOTALL | re.MULTILINE):
            match = re.search(
                '<a.*?>.*?([\\d,.]+)\\s*</a>\\s*<span.*?>\\s*(.*?)\\s*</span>',
                j, re.DOTALL | re.MULTILINE)
            name = match.group(2)
            # Keep only the first '.' so numbers like '10.5.1' still parse.
            raw_num = match.group(1)
            num = float(''.join(raw_num.rsplit('.', raw_num.count('.') - 1)))
            link = re.search('href=\"(.*?)\"', j).group(1)
            try:
                date = datetime.strptime(
                    re.search('([A-Za-z]*? \\d{1,2}, \\d{4})</span>',
                              j).group(1),
                    '%b %d, %Y').strftime('%Y-%m-%d')
            except:
                # Fall back to today when the row has no parseable date.
                date = datetime.today().strftime('%Y-%m-%d')
            if name:
                name = '{} - {} : {}'.format(
                    series, '{:3.1f}'.format(num).zfill(5), name)
            else:
                name = '{} - {}'.format(series,
                                        '{:3.1f}'.format(num).zfill(5))
            if (download_chapters and num in download_chapters) or \
                    (not download_chapters and num > last):
                logger.info(' Gathering info: \"{}\"'.format(name))
                chap_html = get_html(link)
                # Turn the first page's image URL into a numbered template.
                img_url = re.sub(
                    '1.([jpgnig]{3})', '{}.\\1',
                    re.search(
                        '</div>\\s*<a.*?>\\s*<img[^<]*?src=\"(.*?)\".*?>\\s*</a>',
                        chap_html, re.DOTALL | re.MULTILINE
                    ).group(1)
                )
                found_pages = re.findall(
                    '<option value=\"(.*?)\".*?>\\s*(\\d+)\\s*</option>',
                    chap_html
                )
                pages = max([int(i) for _, i in found_pages])
                b_links = {float(i[1]): i[0] for i in found_pages}
                b_links = [b_links[i + 1] for i in range(pages)]
                links = [img_url.format(i + 1) for i in range(pages)]
                # Pages list chapters newest-first; insert at the front to
                # keep the collected list oldest-first.
                chapters.insert(0, {
                    'name': name,
                    'links': links,
                    'backup_links': b_links,
                    'date': date,
                    'pages': pages,
                    'num': num
                })
        match = re.search('<a href=\"(.*?)\">Next</a>', html)
        if match:
            html = get_html(match.group(1))
        else:
            break
    if chapters:
        function_name(chapters, series, tags, author, status)
def remove_unnecessary_white_space(html):
    # Drop paragraphs that only hold a line break.
    html = html.replace('<p><br/></p>', '')
    # NOTE(review): this substitution replaces a space with a space — the
    # pattern was likely '&nbsp;' (or a U+00A0 character) that got decoded
    # by some tooling; verify against the original source.
    html = re.sub(' ', ' ', html)
    # Collapse runs of spaces into a single space.
    html = re.sub(' +', ' ', html)
    # Strip whatever the module-level blankpara_regex matches.
    return blankpara_regex.sub(r'', html)
def batoto(url, download_chapters):
    """Scrape a batoto series page (login required) for metadata and
    per-chapter page links, then hand the results to function_name().

    url -- series page URL (a trailing '/' is appended for the fetch).
    download_chapters -- chapter numbers to fetch; when falsy, every
        chapter numbered above the module-global `last` is taken.

    Only rows whose language matches the module-global `lang` are used.
    """
    login_batoto()
    # Retry the fetch up to 3 times before giving up.
    for i in range(3):
        try:
            html = get_html(url+'/')
            break
        except:
            if i == 2:
                raise
            else:
                pass
    global last
    global session
    series = title(re.search('<h1.*?>[\\s\n]*(.*?)[\\s\n]*</h1>', html, re.DOTALL|re.MULTILINE).group(1))
    status = re.search('<td.*?>Status:</td>\\s*<td>\\s*(.*?)\\s*</td>', html.replace('\n', '')).group(1)
    author = ', '.join(re.findall('<a.*?>(.*?)</a>', re.search('<td.*?>\\s*Authors?\\s*:?\\s*</td>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1)))
    tags = re.findall('<a.*?>\\s*<span.*?>\\s*([A-Za-z]*?)\\s*</span>\\s*</a>', re.search('<td.*?>\\s*Genres?\\s*:?\\s*</td>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1))
    # Normalize genre names through the module-level tag_dict mapping.
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    # Rows are newest-first on the page; iterate oldest-first.
    for j in re.findall('<tr class=\"row lang_([A-Za-z]*?) chapter_row\".*?>(.*?)</tr>', html, re.DOTALL|re.MULTILINE)[::-1]:
        if j[0] == lang:
            match = re.search('<a href=\"([^\"]*?)\".*?>\\s*<img.*?>\\s*([^\"<>]*)(\\s*:\\s*)?(.*?)\\s*</a>', j[1], re.DOTALL|re.MULTILINE)
            name = match.group(4)
            # Pull the chapter number out of the 'Ch.'/'Chapter' label;
            # 'Extra' chapters get number 0.
            m2 = re.search('[Cc]h(ap)?(ter)?\\.?\\s*([Ee]xtras?:?)?\\s*[\\.:-]?\\s*([\\d\\.,]+)?\\s*(-\\s*[\\d\\.]+)?', match.group(2))
            try:
                if m2.group(3):
                    num = 0
                else:
                    num = m2.group(4).replace(',', '.')
                    # Keep only the first '.' so e.g. '10.5.1' parses.
                    num = float(''.join(num.rsplit('.', num.count('.')-1)))
            except:
                logger.debug(j[1])
                raise
            '''
            #TODO
            if m2.group(3):
                if chapters:
                    num = chapters[-1]['num'] + .4
                else:
                    num = last + .4
            '''
            try:
                vol = int(re.search('[Vv]ol(ume)?\\.\\s*(\\d+)', match.group(2)).group(2))
            except:
                vol = 0
            link = match.group(1)
            # The reader link carries the chapter uuid after '#'; build
            # both the Referer header and the areader API URL from it.
            uuid = link.rpartition('#')[2]
            ref = link.rpartition('/')[0]+'/' + "reader#" + uuid + "_1"
            head = {'Referer':ref, 'supress_webtoon':'t'}
            link = link.rpartition('/')[0]+'/'+ 'areader?id='+uuid+'&p=1'
            session.headers.update(head)
            try:
                # Absolute timestamps look like '13 January 2016 - 10:32 AM'.
                date = datetime.strptime(re.search('<td.*?>(\\d{2} [A-Za-z]* \\d{4}.*?([Aa][Mm]|[Pp][Mm])).*?</td>', j[1]).group(1), '%d %B %Y - %I:%M %p').strftime('%Y-%m-%dT%H:%M:00')
            except:
                # Relative timestamps ('N minutes/hours/days/weeks ago',
                # or 'A minute ago' etc.); probe each unit in turn.
                try:
                    t = re.search('(\\d+) [Mm]inutes ago', j[1]).group(1)
                except:
                    t = '1' if re.search('A minute ago', j[1]) else ''
                if t:
                    unit = '%M'
                else:
                    try:
                        t = re.search('(\\d+) [Hh]ours ago', j[1]).group(1)
                    except:
                        t = '1' if re.search('An hour ago', j[1]) else ''
                    if t:
                        unit = '%H'
                    else:
                        try:
                            t = re.search('(\\d+) [Dd]ays ago', j[1]).group(1)
                        except:
                            t = '1' if re.search('A day ago', j[1]) else ''
                        if t:
                            unit = '%d'
                        else:
                            try:
                                t = re.search('(\\d+) [Ww]eeks ago', j[1]).group(1)
                            except:
                                t = '1' if re.search('A week ago', j[1]) else ''
                            if t:
                                unit = '%W'
                            else:
                                t = '0'
                                unit = '%M'
                date = datetime.fromtimestamp((datetime.today()-datetime.strptime(t, unit)).total_seconds()).strftime('%Y-%m-%dT%H:%M:00')
            if name:
                name = '{} - {} : {}'.format(series, '{:3.1f}'.format(num).zfill(5), name)
            else:
                name = '{} - {}'.format(series, '{:3.1f}'.format(num).zfill(5))
            if (download_chapters and num in download_chapters) or (not download_chapters and num > last):
                logger.info(' Gathering info: \"{}\"'.format(name))
                chap_html = get_html(link)
                # Turn the first page's image URL into a numbered template;
                # `zero` records whether page numbering starts at 0.
                img_url = re.sub('001\\.([A-Za-z]{3})', '{:03}.\\1', re.search('<div.*?>\\s*<a.*?>\\s*<img[^<]*?src=\"([^\"]*?)\"[^>]*?/>\\s*</div>', chap_html, re.DOTALL|re.MULTILINE).group(1))
                zero = False
                if '{:03}' not in img_url:
                    img_url = re.sub('000\\.([A-Za-z]{3})', '{:03}.\\1', img_url)
                    zero = True
                if '{:03}' not in img_url:
                    img_url = re.sub('01\\.([A-Za-z]{3})', '{:02}.\\1', img_url)
                    zero = False
                if '{:02}' not in img_url:
                    img_url = re.sub('00\\.([A-Za-z]{3})', '{:02}.\\1', img_url)
                    zero = True
                if re.findall('<option value=\".*?\".*?>page (\\d+)</option>', chap_html):
                    pages = max([int(i) for i in re.findall('<option value=\".*?\".*?>page (\\d+)</option>', chap_html)])
                else:
                    # No page dropdown: skip this chapter.
                    continue
                b_links = {float(i[1]):i[0] for i in re.findall('<option value=\"(.*?)\".*?>page (\\d+)</option>', chap_html)}
                b_links = [b_links[i+1] for i in range(pages)]
                if zero:
                    links = [img_url.format(i) for i in range(pages)]
                else:
                    links = [img_url.format(i+1) for i in range(pages)]
                chapters.append({'name':name, 'links':links, 'backup_links':b_links, 'date':date, 'pages':pages, 'num':num})
    if chapters:
        function_name(chapters, series, tags, author, status)
def get_table_html_from_dataframe(df):
    """Render *df* as an HTML table styled with Bootstrap classes.

    The <thead> element is upgraded to use the 'thead-light' class so the
    header row picks up Bootstrap's light header styling.
    """
    table_classes = 'table table-bordered table-responsive table-striped'
    rendered = df.to_html(classes=table_classes)
    return rendered.replace('<thead>', '<thead class="thead-light">')
def to_oneline(html):
    """Collapse *html* onto one line.

    Break tags and paragraph boundaries become single spaces, and every
    literal newline is replaced by the two-character sequence backslash-n.
    """
    replacements = (("<br />", ' '), ("</p><p>", ' '), ('\n', '\\n'))
    for old, new in replacements:
        html = html.replace(old, new)
    return html
def mangadex(url, download_chapters, pageNum=None):
    """Scrape a MangaDex series page and collect chapter download info.

    Parameters:
        url               -- series URL (trailing slashes are stripped).
        download_chapters -- container of chapter numbers to fetch; when
                             falsy, every chapter newer than the global
                             ``last`` is gathered instead.
        pageNum           -- internal: when set, scrape that chapter-list
                             page and *return* the chapters found rather
                             than handing them to ``function_name``.

    Returns:
        list of chapter dicts when ``pageNum`` is given (recursive calls);
        otherwise None (results are passed to ``function_name``).

    Raises:
        whatever ``get_html`` raises after 3 failed attempts, or on any
        unparseable series page (logged via ``logger.exception``).
    """
    login_mangadex()
    if url.endswith('/'):
        url = re.sub('/+$', '', url)
    if pageNum is not None:
        url += '/chapters/{page}/'.format(page=pageNum)
    # Fetch the page, retrying up to 3 times before giving up.
    for i in range(3):
        try:
            html = get_html(url)
            break
        except:
            if i == 2:
                raise
    global last
    global session
    # Pull series metadata out of the page; log the URL on any parse failure
    # so broken layouts can be diagnosed.
    try:
        series = title(re.sub('<[^>]+>', '', re.search('<h3 class="panel-title">(.*)</h3>', html).group(1)).strip())
        status = re.search('<th.*?>Pub. status:</th>\\s*<td>\\s*(.*?)\\s*</td>', html.replace('\n', '')).group(1)
        author = ', '.join(re.findall('<a.*?>(.*?)</a>', re.search('<th.*?>\\s*Authors?\\s*:?\\s*</th>\\s*<td>(.*?)</td>', html.replace('\n', '')).group(1)))
        tags = re.findall(r'<span.*?>\s*<a.*?>\s*([A-Za-z]*?)\s*</a>\s*</span>', re.search(r'<th.*?>\s*Genres?\s*:?\s*</th>\s*<td>(.*?)</td>', html.replace('\n', '')).group(1))
    except:
        logger.exception('url: %s', url)
        raise
    # Normalise tag names through the module-level tag_dict mapping.
    for j in range(len(tags)):
        for k in tag_dict:
            tags[j] = re.sub(k, tag_dict[k], tags[j])
    chapters = []
    # "Jump to last page" link tells us how many chapter-list pages exist.
    max_page = re.search(
        'href=["\'][^"\']+?\\D(\\d+)/?[\'"][^<>]*?>\\s*<[^<>]*?Jump to last page',
        html
    )
    if pageNum is None and max_page:
        # Multi-page chapter list: recurse per page, last page first so the
        # oldest chapters are appended first.
        max_page = int(max_page.group(1))
        for page in range(max_page, 0, -1):
            chapters.extend(mangadex(url, download_chapters, page))
    else:
        found_chaps = re.findall(
            r'<td>\s*(<a[^>]+href=./chapter/.*?)</tr>',
            html, re.DOTALL|re.MULTILINE
        )
        # Iterate oldest-first; keep only rows for the configured language.
        for j in found_chaps[::-1]:
            if lang in j:
                try:
                    match = re.search(
                        r'<a[^>]+href=\"([^\"]*?)\".*?>\s*(.*?)\s*</a>',
                        j, re.DOTALL|re.MULTILINE
                    )
                    # Parse "Ch. 12.5", "Chapter 3", "Extra", etc. out of the title.
                    m2 = re.search(
                        r'([Cc]h(ap)?(ter)?\.?|([Ee]xtra|[Ss]pecial)s?:?)' + \
                        r'\s*[\.:-]?\s*([\d\.,]+)?\s*(-\s*[\d\.]+)?',
                        match.group(2)
                    )
                    name = match.group(2).replace(
                        m2.group(0) if m2 else match.group(2), ''
                    )
                    logger.debug('found chapter: %s', match.group(2))
                    if not m2 or m2.group(4):
                        # Extras/specials carry no usable number.
                        num = 0
                    else:
                        # "12,5" -> "12.5"; collapse all but the last dot so
                        # "1.2.5" parses as 12.5-style floats.
                        num = m2.group(5).replace(',', '.')
                        num = float(''.join(num.rsplit('.', num.count('.')-1)))
                except:
                    logger.debug(j)
                    raise
                # TODO: give extras a synthetic number after the previous chapter:
                # if m2.group(3):
                #     if chapters:
                #         num = chapters[-1]['num'] + .4
                #     else:
                #         num = last + .4
                # Strip a "Vol. N" prefix and boilerplate from the display name.
                try:
                    vol = re.search(r'[Vv]ol(ume)?\.?\s*(\d+)', match.group(2))
                    name = name.replace(vol.group(0), '').strip()
                    # BUGFIX: re.I was previously passed positionally as the
                    # *count* argument of re.sub, so matching was never
                    # case-insensitive; it must be passed as flags.
                    name = re.sub(r'^\s*-? ?(Read On[ -]?line)?\s*', '', name, flags=re.I)
                    vol = int(vol.group(2))
                except:
                    vol = 0
                link = 'https://mangadex.com/{}/'.format(match.group(1))
                date = re.search(
                    'datetime=\"(.*?)( [A-Z]{3})?\"', j
                ).group(1).replace(' ', 'T')
                strNum = '{:3.2f}'.format(num).zfill(5)
                if name:
                    name = '{} - {} : {}'.format(series, strNum, name)
                else:
                    name = '{} - {}'.format(series, strNum)
                # Gather page links only for requested / new chapters.
                if (download_chapters and num in download_chapters) \
                        or (not download_chapters and num > last):
                    logger.info(' Gathering info: \"{}\"'.format(name))
                    img_url = ''
                    # Fetch page 1 of the chapter (retry up to 3 times) to
                    # discover the image URL template.
                    for ntry in range(3):
                        try:
                            chap_html = get_html(link + '1')
                            img_url = re.search(
                                '<img[^<]*?id=\"current_page\".*?src=\"([^\"]*?)\"',
                                chap_html, re.DOTALL|re.MULTILINE
                            ).group(1)
                            break
                        except:
                            if ntry == 2:
                                raise
                    logger.debug('original url: %s', img_url)
                    # Turn the concrete first-page filename into a format
                    # template; ``zero`` records whether pages count from 0.
                    img_url = re.sub('(/?)0*[01]\\.([A-Za-z]{3})$', r'\1{}.\2', img_url)
                    if 'http' not in img_url:
                        img_url = 'https://mangadex.com/' + img_url
                    zero = False
                    if '{' not in img_url:
                        img_url = re.sub(r'(/?)0\.([a-zA-Z]{3})', r'\1{}.\2', img_url)
                        zero = True
                    if '{' not in img_url:
                        img_url = re.sub(r'(/?)01\.([a-zA-Z]{3})', r'\1{:02}.\2', img_url)
                        zero = False
                    if '{' not in img_url:
                        img_url = re.sub('0*1\\.([A-Za-z]{3})', r'{:02}.\1', img_url)
                        zero = False
                    if '{' not in img_url:
                        img_url = re.sub('0*0\\.([A-Za-z]{3})', r'{:02}.\1', img_url)
                        zero = True
                    logger.debug('general url: %s', img_url)
                    found_pages = re.findall(
                        r'<option[^>]+value=[\"\'](.*?)[\'\"].*?>Page (\d+)</option>',
                        chap_html
                    )
                    if found_pages:
                        pages = max([int(i) for _, i in found_pages])
                    else:
                        # No page selector found -- skip this chapter.
                        continue
                    # Per-page reader URLs as backups, sorted by page number.
                    # NOTE(review): ``link`` already starts with
                    # https://mangadex.com/, so prefixing again below may
                    # produce doubled hosts -- confirm against live data.
                    b_links = {int(i[1]): link + i[0] for i in found_pages}
                    b_links = sorted(b_links.items(), key=lambda t: t[0])
                    b_links = ['https://mangadex.com/' + i for _, i in b_links]
                    if zero:
                        links = [img_url.format(i) for i in range(pages)]
                    else:
                        links = [img_url.format(i + 1) for i in range(pages)]
                    chapters.append({
                        'name': name, 'links': links, 'backup_links': b_links,
                        'date': date, 'pages': pages, 'num': num,
                    })
    if pageNum is not None:
        # Recursive (per-page) call: hand results back to the top-level call.
        return chapters
    elif chapters:
        function_name(chapters, series, tags, author, status)