def dump_text(base_dir):
  """Scrape vAjasaneyi-kANva-saMhitA chapters from vedicheritage.gov.in into markdown files.

  One file ``NN.md`` per adhyAya is written under base_dir; files that already
  exist are skipped, so the scrape is resumable.

  :param base_dir: Output directory for the per-chapter markdown files.
  """
  unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "vedaH/vAjasaneyi/samhitA.json")
  for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
    logging.info("adhyAya %d", kaanda_index)
    outfile_path = os.path.join(base_dir, "%02d.md" % (kaanda_index))
    if os.path.exists(outfile_path):
      # Lazy %-style logging arguments (was eager string concatenation).
      logging.info("Skipping %s", outfile_path)
      continue
    url = "http://vedicheritage.gov.in/samhitas/yajurveda/shukla-yajurveda/vajasaneyi-kanva-samhita-chapter-%02d/" % (
      kaanda_index)
    logging.info("url %s to %s", url, outfile_path)
    browser.get(url=url)
    try:
      text = browser.find_element_by_id("videotext").text
      # Trailing double-space forces a markdown line break on every line.
      text = text.replace("\n", "  \n".replace("  ", " "))  # i.e. "\n" -> " \n"
      title = "%02d" % kaanda_index
      title = sanscript.transliterate(title, sanscript.HK, sanscript.DEVANAGARI)
      md_file = MdFile(file_path=outfile_path)
      md_file.dump_to_file(metadata={"title": title}, md=text, dry_run=False)
    except NoSuchElementException:
      # Some chapter pages are absent on the site; log and move on.
      logging.warning("Page missing! %s ", url)
def dump_devanaagarii(source_html, dest_file):
  """Convert a GRETIL IAST html file into a devanAgarI markdown file.

  Everything before the ``gretil.htm`` marker line becomes an "Intro" section;
  the remainder is transliterated IAST -> devanAgarI and placed under a
  "पाठः" heading. An existing dest_file is left untouched.

  :param source_html: Path to the downloaded GRETIL html page.
  :param dest_file: Path of the markdown file to be written (toml frontmatter).
  """
  if os.path.exists(dest_file):
    logging.warning("Skipping %s as it exists", dest_file)
    return
  logging.info("Processing %s to %s", source_html, dest_file)
  with codecs.open(source_html, "r", 'utf-8') as file_in:
    page_content = file_in.read()
  soup = BeautifulSoup(page_content, 'lxml')
  metadata = {"title": soup.title.text.strip()}
  all_lines = soup.text.split("\n")

  def _is_before_marker(line):
    # The GRETIL footer link separates the English preamble from the text proper.
    return line.strip() != "http://gretil.sub.uni-goettingen.de/gretil.htm"

  preamble_lines = itertools.takewhile(_is_before_marker, all_lines)
  intro = "\n\n## Intro\n%s" % (" \n".join(preamble_lines))
  # Drop everything up to and including the marker line itself.
  body_lines = list(itertools.dropwhile(_is_before_marker, all_lines))[1:]
  body = " \n".join(body_lines)
  body = regex.sub("( \n){3,}", "\n\n", body)
  body = sanscript.transliterate(data=body, _from=sanscript.IAST, _to=sanscript.DEVANAGARI)
  final_md = "%s\n\n## पाठः\n%s" % (intro, body)
  MdFile(file_path=dest_file, frontmatter_type="toml").dump_to_file(metadata=metadata, md=final_md, dry_run=False)
def dump_ics_md_pair(panchaanga, period_str):
  """Write an ics calendar plus markdown renderings (yearly, monthly, split bits) for a panchaanga.

  :param panchaanga: The computed panchaanga to render.
  :param period_str: "year_type/year" string, e.g. "CE/2020"; the year part must be an int.
  """
  ics_calendar = ics.compute_calendar(panchaanga)
  (year_type, year) = period_str.split("/")
  year = int(year)
  out_path = get_canonical_path(city=panchaanga.city.name, computation_system_str=str(panchaanga.computation_system),
                                year=year, year_type=year_type)
  # Fixed: os.path.join() with a single argument was a no-op wrapper around plain concatenation.
  output_file_ics = out_path + ".ics"
  ics.write_to_file(ics_calendar, output_file_ics)
  md_file = MdFile(file_path=output_file_ics.replace(".ics", ".md"), frontmatter_type=MdFile.YAML)
  intro = "## 00 Intro\n### Related files\n- [ics](../%s)\n" % str(os.path.basename(output_file_ics))
  md_content = "%s\n%s" % (intro, md.make_md(panchaanga=panchaanga))
  md_file.dump_to_file(metadata={"title": year}, md=md_content, dry_run=False)

  # Build the month-wise view: copy the yearly md aside, then split it into per-month files.
  monthly_file_path = md_file.file_path.replace(".md", "_monthly.md")
  monthly_dir = monthly_file_path.replace(".md", "/")
  shutil.rmtree(path=monthly_dir, ignore_errors=True)
  logging.info("%s exists? %s", monthly_dir, os.path.exists(monthly_dir))
  logging.info("Copying to %s", monthly_file_path)
  shutil.copy(md_file.file_path, monthly_file_path)
  monthly_md_file = MdFile(file_path=monthly_file_path)
  monthly_md_file.set_title_from_filename(dry_run=False, transliteration_target=None)
  monthly_md_file.split_to_bits(source_script=None, dry_run=False, indexed_title_pattern=None)
  MdFile.apply_function(fn=MdFile.split_to_bits, dir_path=monthly_dir, frontmatter_type=MdFile.TOML,
                        source_script=None, dry_run=False, indexed_title_pattern=None)
  logging.info("%s exists? %s", monthly_dir, os.path.exists(monthly_dir))
  # NOTE(review): output_dir is not defined in this function — presumably a module-level
  # constant; confirm it is set before this is called.
  MdFile.fix_index_files(dir_path=output_dir, transliteration_target=None, dry_run=False)
def dump_text(base_dir):
  """Scrape shaunaka-saMhitA sUkta-s from vedicheritage.gov.in into markdown files.

  One file ``KK/SSS.md`` per (kANDa, sUkta) pair is written under base_dir;
  existing files are skipped, so the scrape is resumable.

  :param base_dir: Output directory; per-kANDa subdirectories are used.
  """
  unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "vedaH/shaunaka/samhitA.json")
  for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
    subunit_list = text_data.get_subunit_list(
      json_file=unit_info_file, unit_path_list=[kaanda_index])
    for subunit_index in subunit_list:
      logging.info("kaanDa %d adhyaaya %d", kaanda_index, subunit_index)
      outfile_path = os.path.join(base_dir, "%02d" % (kaanda_index), "%03d.md" % subunit_index)
      if os.path.exists(outfile_path):
        # Lazy %-style logging arguments (was eager string concatenation).
        logging.info("Skipping %s", outfile_path)
        continue
      url = "http://vedicheritage.gov.in/samhitas/atharvaveda-samhitas/shaunaka-samhita/kanda-%02d-sukta-%03d/" % (
        kaanda_index, subunit_index)
      logging.info("url %s to %s", url, outfile_path)
      browser.get(url=url)
      try:
        text = browser.find_element_by_id("videotext").text
        text = text.replace("\n", " \n")
        title_tags = browser.find_elements_by_css_selector(
          "#videotext strong")
        title = "%03d" % subunit_index
        if len(title_tags) > 0:
          title = "%03d %s" % (subunit_index, title_tags[0].text)
        title = sanscript.transliterate(title, sanscript.HK, sanscript.DEVANAGARI)
        md_file = MdFile(file_path=outfile_path)
        md_file.dump_to_file(metadata={"title": title}, md=text, dry_run=False)
      except NoSuchElementException:
        # Robustness/consistency fix: the sibling vAjasaneyi scraper tolerates missing
        # pages; previously a single absent sUkta page aborted the whole run here.
        logging.warning("Page missing! %s ", url)
def dump_markdown(src_file, dest_file):
  """Convert an itx source file into a markdown file with toml frontmatter.

  The devanAgarI title is derived by transliterating the metadata's
  ``itxtitle`` field from OPTITRANS.

  :param src_file: Path of the itx source file.
  :param dest_file: Path of the markdown file to write.
  """
  logging.info("Processing %s to %s", src_file, dest_file)
  metadata = get_metadata(src_file=src_file)
  body = get_text(src_file=src_file)
  devanaagarii_title = sanscript.transliterate(data=metadata["itxtitle"], _from=sanscript.OPTITRANS,
                                               _to=sanscript.DEVANAGARI)
  metadata["title"] = devanaagarii_title
  out_file = MdFile(file_path=dest_file, frontmatter_type=MdFile.TOML)
  out_file.dump_to_file(metadata=metadata, md=body, dry_run=False)
def dump_all_texts(dest_dir, overwrite=False):
  """Scrape every text linked from adishila.com's unicode listing into dest_dir.

  :param dest_dir: Output directory for the markdown files.
  :param overwrite: When False, files already present are left untouched. Note that
    each page must be fetched regardless, since the target filename is derived
    from the page's title.
  """
  soup = scraping.get_soup(url="https://adishila.com/unicodetxt-htm/")
  links = soup.select("div.wp-block-group a")
  for link in links:
    # Fixed: log *before* the fetch — previously this was emitted after get_text()
    # had already downloaded the page, which was misleading when debugging hangs.
    logging.info("Getting %s", link["href"])
    (title, text) = get_text(link["href"])
    filename = file_helper.clean_file_path("%s.md" % title)
    dest_path = os.path.join(dest_dir, filename)
    if not overwrite and os.path.exists(dest_path):
      logging.warning("Skipping %s since it exists", dest_path)
      continue
    md_file = MdFile(file_path=dest_path, frontmatter_type=MdFile.TOML)
    md_file.dump_to_file(metadata={"title": title}, md=text, dry_run=False)
def dump_summary(year, city, script=xsanscript.DEVANAGARI, computation_system=ComputationSystem.MULTI_NEW_MOON_SIDEREAL_MONTH_ADHIKA__CHITRA_180):
  """Compute the annual panchaanga for a city and dump a toml summary table plus a _summary.md wrapper.

  :param year: Gregorian year to compute.
  :param city: City object whose coordinates drive the computation.
  :param script: Script used for the computation-parameters section.
  :param computation_system: The ComputationSystem variant to use.
  """
  year_type = era.ERA_GREGORIAN
  logging.info("Generating summary panchaanga for %s year %d (%s), with computation system %s ",
               city.name, year, year_type, str(computation_system))
  panchaanga = annual.get_panchaanga_for_year(city=city, year=year, computation_system=computation_system,
                                              year_type=year_type, allow_precomputed=True)
  year_table = to_table_dict(panchaanga=panchaanga)
  out_path = get_canonical_path(city=panchaanga.city.name, computation_system_str=str(panchaanga.computation_system),
                                year=year, year_type=year_type)
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
  # Fixed: write explicitly as utf-8 (table values contain devanAgarI); previously the
  # platform default encoding was used, which can raise UnicodeEncodeError on some systems.
  with codecs.open(out_path + ".toml", "w", 'utf-8') as fp:
    toml.dump(year_table, fp)
  # NOTE(review): output_dir is not defined in this function — presumably a module-level
  # constant; confirm it is set before this is called.
  MdFile.fix_index_files(dir_path=output_dir, transliteration_target=None, dry_run=False)
  computation_params = get_computation_parameters_md(panchaanga=panchaanga, scripts=[script])
  out_path_md = out_path + "_summary.md"
  # Renamed local from `md` to avoid shadowing the `md` rendering module used elsewhere in this file.
  md_content = """## Intro\n%s\n\n## Table
<div class="spreadsheet" src="../%s.toml" fullHeightWithRowsPerScreen=4> </div>""" % (computation_params, str(year))
  md_file = MdFile(file_path=out_path_md)
  md_file.dump_to_file(metadata={"title": "%d Summary" % (year)}, md=md_content, dry_run=False)
def process_catalog_page_selenium(url, out_dir):
  """Scrape one catalog page: extract metadata, then dump the linked Unicode text as markdown.

  Pages without a "View in Unicode transliteration" link, and destinations that
  already exist, are skipped.

  :param url: Catalog-record page url.
  :param out_dir: Directory under which the markdown file is written.
  """
  logging.info("Processing catalog %s", url)
  browser.get(url=url)
  text_links = browser.find_elements_by_link_text("View in Unicode transliteration")
  if len(text_links) == 0:
    logging.warning("%s does not have text", url)
    return
  catalog_body = browser.find_element_by_css_selector(".catalog_record_body")
  metadata = get_front_matter(catalog_body.get_attribute('innerHTML'))
  logging.info(metadata)
  dest_file_path = get_file_path(out_dir=out_dir, title_iast=metadata["title_iast"],
                                 author_iast=metadata.get("author_iast", ""),
                                 catalog_number=metadata.get("Catalog number", ""))
  if os.path.exists(dest_file_path):
    logging.warning("Skipping %s - already exists.", dest_file_path)
    # BUGFIX: the missing return meant the file was re-downloaded and overwritten
    # despite the "Skipping" log message.
    return
  text_url = text_links[0].get_attribute("href")
  # Renamed from `file` to avoid shadowing the builtin.
  md_file = MdFile(file_path=dest_file_path, frontmatter_type="toml")
  text = get_text(url=text_url)
  text = text.replace("\n", " \n")
  md_file.dump_to_file(metadata=metadata, md=text, dry_run=False)
def transform():
  """Assemble per-sUkta markdown (sAyaNa-bhAShya + samhitA + padapATha) from per-Rk json files.

  Reads every Rk json under the hard-coded source tree, groups them by
  (maNDala, sUkta), and writes one markdown file per sUkta under the
  module-level dest_dir.
  """
  json_paths = glob.glob(
    "/home/vvasuki/sanskrit/raw_etexts/veda/Rg/shakala/saMhitA/sAyaNabhAShyam/*/*/*.json", recursive=True)
  suukta_id_to_md = {}
  # Sorted so that Rk-s are appended to each sUkta in order.
  for json_path in sorted(json_paths):
    # Fixed: read explicitly as utf-8 (files contain devanAgarI); previously the
    # locale default encoding was used, which is not portable.
    with codecs.open(json_path, "r", 'utf-8') as fp:
      rk = json.load(fp)
    suukta_id = "%02d/%03d" % (int(rk["classification"]["mandala"]), int(rk["classification"]["sukta"]))
    suukta_md = suukta_id_to_md.get(suukta_id, "")
    # Strip html tags from the bhAShya.
    bhaashya = regex.sub("<.+?>", "", rk["sayanaBhashya"])
    rk_number = sanscript.transliterate(
      "%02d" % int(rk["classification"]["rik"]), sanscript.IAST, sanscript.DEVANAGARI)
    attribute_str = "%s। %s। %s।" % (rk["attribute"]["devata"], rk["attribute"]["rishi"], rk["attribute"]["chandas"])
    # Single-line entries come as plain strings; normalize to lists.
    padapaatha_lines = rk["padapaatha"]["lines"]
    if isinstance(padapaatha_lines, str):
      padapaatha_lines = [padapaatha_lines]
    samhita_lines = rk["samhitaAux"]["lines"]
    if isinstance(samhita_lines, str):
      samhita_lines = [samhita_lines]
    rk_md = "%s\n\n%s %s॥\n\n%s\n\n%s" % (
      attribute_str, " \n".join(samhita_lines), rk_number, " \n".join(padapaatha_lines), bhaashya)
    suukta_md += "\n\n%s" % rk_md
    if bhaashya == "":
      logging.warning("No bhAShya for %s", rk["id"])
    suukta_id_to_md[suukta_id] = suukta_md
  # Iterate items directly rather than .keys() + lookup.
  for suukta_id, suukta_md in suukta_id_to_md.items():
    # NOTE(review): dest_dir is not defined in this function — presumably a module-level
    # constant; confirm it is set before this is called.
    dest_path = os.path.join(dest_dir, suukta_id + ".md")
    md_file = MdFile(file_path=dest_path)
    title = sanscript.transliterate(
      suukta_id.split("/")[-1], sanscript.IAST, sanscript.DEVANAGARI)
    md_file.dump_to_file(metadata={"title": title}, md=suukta_md, dry_run=False)
def test_panchanga_chennai_2019():
  """Regression test: the md rendering of the Chennai-2019 panchaanga must match the checked-in file.

  If the golden file is absent, it is regenerated (assumed to be a deliberate
  test-data refresh) and the test then passes trivially.
  """
  panchaanga_2019 = Panchaanga.read_from_file(
    filename=os.path.join(TEST_DATA_PATH, 'Chennai-2019.json'))
  # We dump to md.txt rather than md to avoid slow checks on intellij ide.
  orig_md_file = os.path.join(TEST_DATA_PATH, 'Chennai-2019-devanagari.md.txt')
  current_md_output = os.path.join(TEST_DATA_PATH, 'Chennai-2019-devanagari.md.txt.local')
  md_file = MdFile(file_path=current_md_output)
  md_file.dump_to_file(metadata={"title": str(2019)}, md=md.make_md(panchaanga=panchaanga_2019), dry_run=False)
  if not os.path.exists(orig_md_file):
    logging.warning(
      "%s not present. Assuming that it was deliberately deleted to update test files.", orig_md_file)
    md_file = MdFile(file_path=orig_md_file)
    md_file.dump_to_file(metadata={"title": str(2019)}, md=md.make_md(panchaanga=panchaanga_2019), dry_run=False)
  # Fixed: read both files explicitly as utf-8 — the content is devanAgarI, and the
  # previous default-locale open could raise or mis-decode on non-UTF-8 platforms.
  with open(orig_md_file, encoding='utf-8') as orig_fp:
    with open(current_md_output, encoding='utf-8') as current_fp:
      assert current_fp.read() == orig_fp.read()