def dump_doc(url, out_dir, index, dry_run=False):
    """Scrape a single Complete Works page and dump it as an indexed markdown file.

    Args:
        url: Page path relative to the site's ``vivekananda/`` root.
        out_dir: Directory under which the markdown file is written.
        index: Sequence number used to prefix the output file name.
        dry_run: Forwarded to the pandoc import; if True nothing is written.
    """
    # Map ".../name.htm" to ".../NNN_name.md".
    # BUG FIX: the dot before "htm" was unescaped and could match any character.
    out_file_path = regex.sub(r"/([^/]+)\.htm", "/%03d_\\1.md" % index, url)
    out_file_path = os.path.join(out_dir, out_file_path)
    if os.path.exists(out_file_path):
        logging.info("Skipping Dumping %s to %s", url, out_file_path)
        return
    logging.info("Dumping %s to %s", url, out_file_path)
    full_url = "http://www.ramakrishnavivekananda.info/vivekananda/" + url
    soup = scraping.get_soup(full_url)
    metadata = {}
    title_elements = soup.select("h2")
    if title_elements:
        metadata["title"] = title_elements[0].text
    else:
        # No <h2>: fall back to a title derived from the file name.
        metadata["title"] = regex.sub(r"/([^/]+)\.htm", "\\1", url).replace("_", " ")
    body_element = soup.select("body")
    if not body_element:
        # BUG FIX: message read "text form"; also use a context manager so the
        # connection is closed even if read()/decode() raises.
        logging.warning("Could not get text from %s with soup", full_url)
        with urllib.request.urlopen(full_url) as filehandle:
            content = filehandle.read().decode("utf8")
    else:
        content = body_element[0].decode_contents()
    md_file = md_helper.MdFile(file_path=out_file_path)
    md_file.import_content_with_pandoc(content=content, source_format="html", dry_run=dry_run, metadata=metadata)
def get_text(url):
    """Fetch *url* and return its first ``<pre>`` element's text, transliterated IAST → Devanagari."""
    logging.info("Processing %s", url)
    page = scraping.get_soup(url=url)
    iast_text = page.select("pre")[0].text
    return sanscript.transliterate(data=iast_text, _from=sanscript.IAST, _to=sanscript.DEVANAGARI)
def dump_docs(out_dir, dry_run=False):
    """Walk the Complete Works master index and dump every linked page into *out_dir*."""
    index_url = "https://www.ramakrishnavivekananda.info/vivekananda/master_index.htm"
    index_soup = scraping.get_soup(index_url)
    anchors = index_soup.select("a")
    # The first three anchors are navigation chrome, not content pages.
    for position, anchor in enumerate(anchors[3:]):
        dump_doc(url=anchor["href"], out_dir=out_dir, index=position, dry_run=dry_run)
def get_docs(out_dir):
    """Iterate over the Muktabodha catalog listing and process each linked catalog record."""
    catalog_url = (
        "https://etexts.muktabodha.org/DL_CATALOG_USER_INTERFACE/"
        "dl_user_interface_list_catalog_records.php?sort_key=title"
    )
    listing = scraping.get_soup(catalog_url)
    for anchor in listing.select("a"):
        # hrefs are relative to the catalog interface directory.
        record_url = "https://etexts.muktabodha.org/DL_CATALOG_USER_INTERFACE/%s" % anchor["href"]
        process_catalog_page_selenium(url=record_url, out_dir=out_dir)
def get_html(url):
    """Return the inner HTML of *url*'s ``<body>``.

    Falls back to the raw page fetched with urllib when soup yields no body
    element (e.g. malformed markup).

    Args:
        url: Page to fetch.

    Returns:
        HTML string: the body's inner markup, or the full raw page on fallback.
    """
    soup = scraping.get_soup(url)
    body_element = soup.select("body")
    if not body_element:
        # BUG FIX: message read "text form"; use a context manager so the
        # connection is closed even if read()/decode() raises.
        logging.warning("Could not get text from %s with soup", url)
        with urllib.request.urlopen(url) as filehandle:
            content = filehandle.read().decode("utf8")
    else:
        content = body_element[0].decode_contents()
    return content
def dump_all_texts(dest_dir, overwrite=False):
    """Download every text linked from the adishila unicode index into *dest_dir* as markdown.

    Existing files are skipped unless *overwrite* is True.
    """
    index_soup = scraping.get_soup(url="https://adishila.com/unicodetxt-htm/")
    for anchor in index_soup.select("div.wp-block-group a"):
        href = anchor["href"]
        # The title (and hence the destination name) only becomes known after
        # fetching the page, so the existence check has to come second.
        (title, text) = get_text(href)
        dest_path = os.path.join(dest_dir, file_helper.clean_file_path("%s.md" % title))
        if not overwrite and os.path.exists(dest_path):
            logging.warning("Skipping %s since it exists", dest_path)
            continue
        logging.info("Getting %s", href)
        out_file = MdFile(file_path=dest_path, frontmatter_type=MdFile.TOML)
        out_file.dump_to_file(metadata={"title": title}, md=text, dry_run=False)
def dump_doc(url, out_dir, index, dry_run=False):
    """Scrape one page and write it to *out_dir* as ``NN_<name>.md``.

    Skips pages whose output file already exists, and pages whose content
    contains "Not Found".

    Args:
        url: Absolute URL of the page to dump.
        out_dir: Destination directory.
        index: Sequence number prefixed to the output file name.
        dry_run: Forwarded to dump_content; if True nothing is written.
    """
    # Map ".../name.htm(l)" to "NN_name.md".
    # BUG FIX: the dot before "html?" was unescaped and could match any character.
    out_file_path = regex.sub(r".+/([^/]+)\.html?", "%02d_\\1.md" % index, url)
    out_file_path = os.path.join(out_dir, out_file_path)
    if os.path.exists(out_file_path):
        logging.info("Skipping Dumping %s to %s", url, out_file_path)
        return
    logging.info("Dumping %s to %s", url, out_file_path)
    soup = scraping.get_soup(url)
    if "Not Found" in soup.text:
        logging.warning("%s not found!", url)
        return
    metadata = get_metadata(soup=soup, index=index, url=url)
    dump_content(soup=soup, out_file_path=out_file_path, metadata=metadata, dry_run=dry_run)
def dump_docs(index_url, out_dir, dry_run=False):
    """Dump the index page itself (as ``_index.md``) plus every page it links to.

    Links without an href, "Back to" navigation links, and the site root link
    are skipped.

    Args:
        index_url: URL of the index page whose links are followed.
        out_dir: Destination directory.
        dry_run: Forwarded to dump_content / dump_doc.
    """
    from urllib.parse import urljoin

    soup = scraping.get_soup(index_url)
    out_file_path = os.path.join(out_dir, "_index.md")
    if not os.path.exists(out_file_path):
        dump_content(soup=soup, out_file_path=out_file_path, metadata={}, dry_run=dry_run)
    links = soup.select("a")
    for index, link in enumerate(links):
        href = link.get("href", None)
        text = fix_text(link.text)
        if href and "Back to" not in text and href not in ["http://voiceofdharma.org"]:
            if href.startswith("http"):
                url = href
            else:
                # BUG FIX: naive concatenation produced URLs like
                # ".../index.htm<href>" when index_url names a file; urljoin
                # resolves relative hrefs correctly in both cases.
                url = urljoin(index_url, href)
            dump_doc(url=url, out_dir=out_dir, index=index, dry_run=dry_run)
def get_text(url):
    """Return ``(title, markdown_text)`` for an adishila entry page."""
    page = scraping.get_soup(url=url)
    raw_text = page.select_one("div.entry-content").text
    markdown_text = md_helper.markdownify_plain_text(raw_text)
    # Strip the site-name suffix from the browser title to get the text title.
    title = regex.sub("[ -]*आदिशिला", "", page.title.string).strip()
    return (title, markdown_text)
def process_catalog_page_soup(url):
    """Does not work - get template content which is different from actual view in browser."""
    # NOTE(review): fetches the page but never uses the result — apparently an
    # abandoned stub kept as a record of the failed soup-based approach; the
    # working path is process_catalog_page_selenium. Confirm before deleting.
    soup = scraping.get_soup(url=url)