def dump_doc(url, out_dir, index, dry_run=False):
    """Fetch a page from ramakrishnavivekananda.info and dump it as a numbered markdown file.

    The output name is derived from `url` (foo/bar.htm -> foo/<index>_bar.md under
    out_dir). Does nothing if the output file already exists.

    :param url: site-relative path ending in .htm
    :param out_dir: root directory for the generated markdown
    :param index: sequence number used as a zero-padded filename prefix
    :param dry_run: forwarded to the pandoc import step
    """
    out_file_path = regex.sub("/([^/]+).htm", "/%03d_\\1.md" % index, url)
    out_file_path = os.path.join(out_dir, out_file_path)
    if os.path.exists(out_file_path):
        logging.info("Skipping Dumping %s to %s", url, out_file_path)
        return
    logging.info("Dumping %s to %s", url, out_file_path)
    full_url = "http://www.ramakrishnavivekananda.info/vivekananda/" + url
    soup = scraping.get_soup(full_url)
    metadata = {}
    title_elements = soup.select("h2")
    if title_elements:
        metadata["title"] = title_elements[0].text
    else:
        # No <h2>: fall back to a title derived from the file name.
        metadata["title"] = regex.sub("/([^/]+).htm", "\\1", url).replace("_", " ")
    body_element = soup.select("body")
    if not body_element:
        # Fixed typo in the log message ("text form" -> "text from"); the raw
        # fetch now uses a context manager so the handle is closed on error too.
        logging.warning("Could not get text from %s with soup", full_url)
        with urllib.request.urlopen(full_url) as filehandle:
            content = filehandle.read().decode("utf8")
    else:
        content = body_element[0].decode_contents()
    md_file = md_helper.MdFile(file_path=out_file_path)
    md_file.import_content_with_pandoc(content=content, source_format="html", dry_run=dry_run, metadata=metadata)
def dump_text_from_element(url, outfile_path, text_css_selector, title_maker, title_prefix="", html_fixer=None, index=None, dry_run=False):
    """Scrape `url`, pull the content matching text_css_selector, and pandoc-import it to outfile_path.

    Always returns an unmodified soup of the page - callers rely on it for
    navigation (e.g. finding a "next" link) even when dumping is skipped.
    """
    logging.info("Dumping: %s to %s", url, outfile_path)
    page_html = get_html(url=url)
    # Two independent parses: one stays pristine for the caller, one may be
    # mutated by html_fixer below.
    pristine_soup = BeautifulSoup(page_html, 'html.parser')
    working_soup = BeautifulSoup(page_html, 'html.parser')
    if html_fixer is not None:
        html_fixer(working_soup)
    metadata = {"title": title_maker(working_soup, title_prefix, index)}
    if os.path.exists(outfile_path):
        logging.info("Skipping dumping: %s to %s", url, outfile_path)
        return pristine_soup
    content = content_from_element(soup=working_soup, text_css_selector=text_css_selector, url=url)
    md_file = md_helper.MdFile(file_path=outfile_path)
    md_file.import_content_with_pandoc(content=content, source_format="html", dry_run=dry_run, metadata=metadata)
    logging.info("Done: %s to %s", url, outfile_path)
    return pristine_soup
def dump_text_from_element(url, outfile_path, text_css_selector, title_css_selector=None, heading_class=None):
    """Dump the text of all elements matching text_css_selector on `url` to outfile_path.

    Elements whose class attribute equals heading_class are emitted as "## "
    markdown headings. If title_css_selector is given, the matched element's
    text is set as the md file title ("UNKNOWN_TITLE" when absent). Does
    nothing if outfile_path already exists.
    """
    if os.path.exists(outfile_path):
        logging.info("Skipping dumping: %s to %s", url, outfile_path)
        return
    logging.info("Dumping: %s to %s", url, outfile_path)
    browser.get(url)
    os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
    text_elements = browser.find_elements_by_css_selector(text_css_selector)
    with open(outfile_path, "w") as outfile:
        for text_element in text_elements:
            text = text_element.text + "\n"
            if heading_class is not None and text_element.get_attribute(
                    "class") == heading_class:
                # write(), not writelines() - writelines on a str iterates it
                # character by character (same output, wrong tool).
                outfile.write("\n\n## %s\n" % text)
            else:
                outfile.write(text.replace("\n", " \n"))
    if title_css_selector is not None:
        try:
            title_element = browser.find_element_by_css_selector(title_css_selector)
            title = title_element.text.strip()
        except NoSuchElementException:
            title = "UNKNOWN_TITLE"
        md_file = md_helper.MdFile(file_path=outfile_path)
        md_file.set_title(title=title, dry_run=False)
    logging.info("Done: %s to %s", url, outfile_path)
def get_item(id, dir_path):
    """Fetch nArAyaNIyam dashakam `id` from sa.wikisource and dump it shloka-wise, with per-shloka audio embeds."""
    import urllib.parse
    dashaka_id = "नारायणीयम्/दशकम्_%s" % sanscript.transliterate(
        str(id), sanscript.SLP1, sanscript.DEVANAGARI)
    logging.info(dashaka_id)
    item_url = "https://sa.wikisource.org/wiki/" + urllib.parse.quote(dashaka_id)
    logging.info(item_url)
    browser.get(item_url)
    text = browser.find_element_by_css_selector("div.poem").text
    # Ordered substitution table. Order matters: the bare "c" rule must run
    # only after every "c"+vowel combination has been handled.
    substitutions = (
        ("cअ", "च"), ("cइ", "चि"), ("cई", "ची"), ("cउ", "चु"), ("cऊ", "चू"),
        ("cऋ", "चृ"), ("cॠ", "चॄ"), ("cऌ", "चॢ"), ("cॡ", "चॣ"), ("cए", "चे"),
        ("cऐ", "चै"), ("cओ", "चो"), ("cऔ", "चौ"), ("c", "च्"), ("ळ", "ल"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    shlokas = text.split("\n\n")
    outfile_path = os.path.join(dir_path, "%03d.md" % id)
    os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
    with open(outfile_path, "w") as outfile:
        for shloka_id, shloka in enumerate(shlokas, start=1):
            outfile.write(
                "<div class=\"audioEmbed\" caption=\"सीतालक्ष्मी-वाचनम्\" src=\"https://sanskritdocuments.org/sites/completenarayaneeyam/SoundFiles/%03d/%03d_%02d.mp3\"></div> \n"
                % (id, id, shloka_id))
            outfile.write(shloka.replace("\n", " \n") + "\n\n")
    md_file = md_helper.MdFile(file_path=outfile_path)
    md_file.set_title(sanscript.transliterate("%03d" % id, sanscript.SLP1, sanscript.DEVANAGARI), dry_run=False)
def dump_text(browser, outdir):
    """Collect the result-grid text currently shown in `browser` and dump it as a titled markdown file under outdir."""
    text_name = deduce_text_name(browser)
    out_file_path = get_output_path(text_name=text_name, outdir=outdir)
    spans = browser.find_elements_by_css_selector(
        "#gvResults tr[valign=\"top\"] td span")
    segments = []
    for span in spans:
        segments.append(span.text.strip().replace("\n", " \n"))
    body = "\n\n".join(segments)
    md_file = md_helper.MdFile(file_path=out_file_path)
    md_file.dump_to_file(metadata={"title": text_name}, md=body, dry_run=False)
def dump_content(soup, out_file_path, metadata, dry_run):
    """Pick the main content cell from `soup` (falling back to <body>), clean it, and pandoc-import it to out_file_path."""
    # Preference order: 60%-wide cell, then 80%-wide cell, then the whole body.
    candidates = soup.select("td[width=\"60%\"]")
    candidates += soup.select("td[width=\"80%\"]")
    candidates += soup.select("body")
    raw_html = candidates[0].decode_contents(formatter="html")
    content = fix_text(raw_html)
    md_file = md_helper.MdFile(file_path=out_file_path)
    md_file.import_content_with_pandoc(content=content, source_format="html", dry_run=dry_run, metadata=metadata)
    # With no caller-supplied metadata, derive a title from the file name.
    if metadata == {}:
        md_file.set_title_from_filename(transliteration_target=None, dry_run=dry_run)
def dump_text(browser, title, out_file_path):
    """Accumulate text across paginated results by clicking "Next" until the page stops changing, then dump to markdown."""
    pages = []
    last_seen = None
    while True:
        current = get_page_text(browser=browser)
        # After the final page, clicking "Next" leaves us on the same page:
        # a repeat is the stop signal.
        if current == last_seen:
            break
        logging.debug(current)
        pages.append(current)
        time.sleep(2)
        selenium.click_link_by_text(browser=browser, element_text="Next")
        last_seen = current
    md_file = md_helper.MdFile(file_path=out_file_path)
    md_file.dump_to_file(metadata={"title": title}, md="".join(pages), dry_run=False)
def dump_item(title, item_url, outfile_path, get_collapsible_content):
    """Dump one wiki item's text to outfile_path and set `title` on the resulting md file.

    When get_collapsible_content is true, text is taken from the
    .mw-collapsible-content element; otherwise from div.poem, falling back to
    joining the <p> elements under .mw-parser-output. Does nothing if
    outfile_path already exists.
    """
    if os.path.exists(outfile_path):
        logging.info("skipping: %s - it exists already", outfile_path)
        return
    logging.info(item_url)
    browser.get(item_url)
    text = ""
    if not get_collapsible_content:
        try:
            text = browser.find_element_by_css_selector("div.poem").text
        except NoSuchElementException:
            # No poem div on this page; stitch the paragraphs together instead.
            content_element = browser.find_element_by_css_selector(".mw-parser-output")
            para_elements = content_element.find_elements_by_tag_name("p")
            text = "\n\n".join(p.text for p in para_elements)
    else:
        text = browser.find_element_by_css_selector(".mw-collapsible-content").text
    os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
    with open(outfile_path, "w") as outfile:
        # write(), not writelines() - writelines on a str iterates it
        # character by character (same output, wrong tool).
        outfile.write(text.replace("\n", " \n"))
    md_file = md_helper.MdFile(file_path=outfile_path)
    md_file.set_title(title=title, dry_run=False)