Пример #1
0
def dump_doc(url, out_dir, index, dry_run=False):
    """Scrape one Vivekananda document page and import it as a markdown file.

    Args:
        url: Page path relative to the site root, ending in ``.htm``.
        out_dir: Directory under which the markdown file is written.
        index: Sequence number used as a zero-padded filename prefix.
        dry_run: Passed through to the pandoc import; nothing is written when True.
    """
    # Derive e.g. dir/page.htm -> dir/007_page.md
    out_file_path = regex.sub("/([^/]+).htm", "/%03d_\\1.md" % index, url)
    out_file_path = os.path.join(out_dir, out_file_path)
    if os.path.exists(out_file_path):
        logging.info("Skipping Dumping %s to %s", url, out_file_path)
        return
    logging.info("Dumping %s to %s", url, out_file_path)
    full_url = "http://www.ramakrishnavivekananda.info/vivekananda/" + url
    soup = scraping.get_soup(full_url)
    metadata = {}
    title_elements = soup.select("h2")
    if title_elements:
        metadata["title"] = title_elements[0].text
    else:
        # No h2 heading: fall back to a title derived from the file name.
        metadata["title"] = regex.sub("/([^/]+).htm", "\\1",
                                      url).replace("_", " ")
    body_element = soup.select("body")
    if not body_element:
        # Soup found no body element; fetch the raw page bytes instead.
        # (Fixed typo: "form" -> "from".)
        logging.warning("Could not get text from %s with soup", full_url)
        # Context manager closes the connection even if read/decode raises.
        with urllib.request.urlopen(full_url) as filehandle:
            content = filehandle.read().decode("utf8")
    else:
        content = body_element[0].decode_contents()
    md_file = md_helper.MdFile(file_path=out_file_path)
    md_file.import_content_with_pandoc(content=content,
                                       source_format="html",
                                       dry_run=dry_run,
                                       metadata=metadata)
Пример #2
0
def dump_text_from_element(url,
                           outfile_path,
                           text_css_selector,
                           title_maker,
                           title_prefix="",
                           html_fixer=None,
                           index=None,
                           dry_run=False):
    """Fetch *url*, extract the selected element, and import it as markdown.

    Always returns the pristine parsed soup (before ``html_fixer`` ran), even
    when the output file already exists, so callers can keep navigating the
    page — e.g. to locate the "next" link.
    """
    logging.info("Dumping: %s to %s", url, outfile_path)
    page_html = get_html(url=url)
    pristine_soup = BeautifulSoup(page_html, 'html.parser')
    working_soup = BeautifulSoup(page_html, 'html.parser')

    if html_fixer is not None:
        html_fixer(working_soup)

    page_metadata = {"title": title_maker(working_soup, title_prefix, index)}

    # We definitely want to return the original html even if the file exists - we may need to navigate to the next element.
    if os.path.exists(outfile_path):
        logging.info("Skipping dumping: %s to %s", url, outfile_path)
        return pristine_soup

    extracted = content_from_element(soup=working_soup,
                                     text_css_selector=text_css_selector,
                                     url=url)

    target_file = md_helper.MdFile(file_path=outfile_path)
    target_file.import_content_with_pandoc(content=extracted,
                                           source_format="html",
                                           dry_run=dry_run,
                                           metadata=page_metadata)

    logging.info("Done: %s to %s", url, outfile_path)
    return pristine_soup
Пример #3
0
def dump_text_from_element(url,
                           outfile_path,
                           text_css_selector,
                           title_css_selector=None,
                           heading_class=None):
    """Dump the text of elements matching *text_css_selector* to a markdown file.

    Elements whose ``class`` attribute equals *heading_class* are emitted as
    ``##`` headings; all other text gets markdown hard line breaks. If
    *title_css_selector* is given, the matched element's text (or
    ``UNKNOWN_TITLE`` if absent) is set as the file's title.

    Skips work (and returns early) when *outfile_path* already exists.
    Relies on a module-level selenium ``browser``.
    """
    if os.path.exists(outfile_path):
        logging.info("Skipping dumping: %s to %s", url, outfile_path)
        return
    logging.info("Dumping: %s to %s", url, outfile_path)
    browser.get(url)
    os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
    text_elements = browser.find_elements_by_css_selector(text_css_selector)
    with open(outfile_path, "w") as outfile:
        for text_element in text_elements:
            text = text_element.text + "\n"
            if heading_class is not None and text_element.get_attribute(
                    "class") == heading_class:
                # write(), not writelines(): these are single strings, and
                # writelines would iterate them character by character.
                outfile.write("\n\n## %s\n" % text)
            else:
                # Trailing double-space forces a markdown line break.
                outfile.write(text.replace("\n", "  \n"))

    if title_css_selector is not None:
        try:
            title_element = browser.find_element_by_css_selector(
                title_css_selector)
            title = title_element.text.strip()
        except NoSuchElementException:
            title = "UNKNOWN_TITLE"
        md_file = md_helper.MdFile(file_path=outfile_path)
        md_file.set_title(title=title, dry_run=False)

    logging.info("Done: %s to %s", url, outfile_path)
Пример #4
0
def get_item(id, dir_path):
    """Scrape one Narayaneeyam dashaka from sa.wikisource and dump it as markdown.

    Fixes up the scraped Devanagari text, splits it into shlokas, writes each
    shloka preceded by an audio-embed div, and sets a transliterated title.
    Relies on module-level ``browser``, ``sanscript`` and ``md_helper``.
    """
    import urllib.parse
    dashaka_id = "नारायणीयम्/दशकम्_%s" % sanscript.transliterate(
        str(id), sanscript.SLP1, sanscript.DEVANAGARI)
    logging.info(dashaka_id)
    item_url = "https://sa.wikisource.org/wiki/" + urllib.parse.quote(
        dashaka_id)
    logging.info(item_url)
    browser.get(item_url)
    text = browser.find_element_by_css_selector("div.poem").text
    # Ordered fix-ups for the scraped text. Order matters: the bare "c"
    # fallback must run only after every "c<vowel>" combination is handled.
    substitutions = (
        ("cअ", "च"), ("cइ", "चि"), ("cई", "ची"), ("cउ", "चु"),
        ("cऊ", "चू"), ("cऋ", "चृ"), ("cॠ", "चॄ"), ("cऌ", "चॢ"),
        ("cॡ", "चॣ"), ("cए", "चे"), ("cऐ", "चै"), ("cओ", "चो"),
        ("cऔ", "चौ"), ("c", "च्"), ("ळ", "ल"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    shlokas = text.split("\n\n")
    outfile_path = os.path.join(dir_path, "%03d.md" % id)
    os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
    with open(outfile_path, "w") as outfile:
        for shloka_id, shloka in enumerate(shlokas, start=1):
            outfile.write(
                "<div class=\"audioEmbed\"  caption=\"सीतालक्ष्मी-वाचनम्\" src=\"https://sanskritdocuments.org/sites/completenarayaneeyam/SoundFiles/%03d/%03d_%02d.mp3\"></div>  \n"
                % (id, id, shloka_id))
            # Trailing double-space forces markdown line breaks.
            outfile.write(shloka.replace("\n", "  \n") + "\n\n")
    md_file = md_helper.MdFile(file_path=outfile_path)
    md_file.set_title(sanscript.transliterate("%03d" % id, sanscript.SLP1,
                                              sanscript.DEVANAGARI),
                      dry_run=False)
Пример #5
0
def dump_text(browser, outdir):
    """Collect all result-grid spans from the current page and dump them as markdown.

    The text name is deduced from the page itself and also used as the title.
    """
    text_name = deduce_text_name(browser)
    out_file_path = get_output_path(text_name=text_name, outdir=outdir)
    result_spans = browser.find_elements_by_css_selector(
        "#gvResults tr[valign=\"top\"] td span")
    # Double trailing spaces preserve line breaks in markdown.
    body = "\n\n".join(
        span.text.strip().replace("\n", "  \n") for span in result_spans)
    md_helper.MdFile(file_path=out_file_path).dump_to_file(
        metadata={"title": text_name}, md=body, dry_run=False)
Пример #6
0
def dump_content(soup, out_file_path, metadata, dry_run):
    """Extract the page's main content cell and import it as markdown.

    Prefers a 60%-wide table cell, then an 80%-wide one, then the whole body.
    If *metadata* is empty, the title is derived from the output filename
    after the import.

    Args:
        soup: Parsed BeautifulSoup document.
        out_file_path: Destination markdown path.
        metadata: Metadata dict for the markdown file; may be empty.
        dry_run: When True, nothing is written.
    """
    # Selector order encodes preference; body is the catch-all fallback.
    content_elements = soup.select("td[width=\"60%\"]") + soup.select(
        "td[width=\"80%\"]") + soup.select("body")
    content = fix_text(content_elements[0].decode_contents(formatter="html"))
    md_file = md_helper.MdFile(file_path=out_file_path)
    md_file.import_content_with_pandoc(content=content,
                                       source_format="html",
                                       dry_run=dry_run,
                                       metadata=metadata)
    # Truthiness instead of `== {}`: empty metadata means no title was set.
    if not metadata:
        md_file.set_title_from_filename(transliteration_target=None,
                                        dry_run=dry_run)
Пример #7
0
def dump_text(browser, title, out_file_path):
    """Walk "Next" pagination, concatenating page text until it stops changing.

    Stops when clicking "Next" yields the same text as the previous page
    (i.e. we are on the last page), then dumps the accumulated text with the
    given *title* to *out_file_path*.
    """
    collected = ""
    last_page = None
    while True:
        current_page = get_page_text(browser=browser)
        if current_page == last_page:
            # "Next" no longer advanced us: final page reached.
            break
        logging.debug(current_page)
        collected = collected + current_page
        time.sleep(2)  # give the page time to settle before paging on
        selenium.click_link_by_text(browser=browser, element_text="Next")
        last_page = current_page

    md_helper.MdFile(file_path=out_file_path).dump_to_file(
        metadata={"title": title}, md=collected, dry_run=False)
Пример #8
0
def dump_item(title, item_url, outfile_path, get_collapsible_content):
    """Fetch one wiki item and dump its text to a titled markdown file.

    Args:
        title: Title to set on the resulting markdown file.
        item_url: Page URL to fetch with the module-level ``browser``.
        outfile_path: Destination path; skipped if it already exists.
        get_collapsible_content: When True, read ``.mw-collapsible-content``;
            otherwise prefer ``div.poem`` and fall back to the parser output's
            paragraphs.
    """
    if os.path.exists(outfile_path):
        logging.info("skipping: %s - it exists already", outfile_path)
        return
    logging.info(item_url)
    browser.get(item_url)
    text = ""
    if not get_collapsible_content:
        try:
            text = browser.find_element_by_css_selector("div.poem").text
        except NoSuchElementException:
            # No poem div: join all paragraph texts from the parser output.
            content_element = browser.find_element_by_css_selector(
                ".mw-parser-output")
            para_elements = content_element.find_elements_by_tag_name("p")
            text = "\n\n".join(map(lambda x: x.text, para_elements))
    else:
        text = browser.find_element_by_css_selector(
            ".mw-collapsible-content").text
    os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
    with open(outfile_path, "w") as outfile:
        # write(), not writelines(): this is a single string, and writelines
        # would iterate it character by character. Trailing double spaces
        # force markdown line breaks.
        outfile.write(text.replace("\n", "  \n"))
    md_file = md_helper.MdFile(file_path=outfile_path)
    md_file.set_title(title=title, dry_run=False)