def dump_text(base_dir):
    """Scrape each adhyAya of the vAjasaneyi (kANva) saMhitA from
    vedicheritage.gov.in and dump it as a markdown file under base_dir.

    Files that already exist are skipped, so the crawl is resumable.
    NOTE(review): relies on module-level `browser` (selenium), `text_data`,
    `sanscript` and `MdFile` being set up elsewhere in this file.
    """
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__),
                                  "vedaH/vAjasaneyi/samhitA.json")

    for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file,
                                                   unit_path_list=[]):
        logging.info("adhyAya %d", kaanda_index)

        outfile_path = os.path.join(base_dir, "%02d.md" % (kaanda_index))
        if os.path.exists(outfile_path):
            # Lazy %-style args (not string concat) keep logging calls
            # consistent with the rest of this module.
            logging.info("Skipping %s", outfile_path)
            continue

        url = "http://vedicheritage.gov.in/samhitas/yajurveda/shukla-yajurveda/vajasaneyi-kanva-samhita-chapter-%02d/" % (
            kaanda_index)
        logging.info("url %s to %s", url, outfile_path)
        browser.get(url=url)
        try:
            text = browser.find_element_by_id("videotext").text
            # Two trailing spaces force markdown hard line breaks.
            text = text.replace("\n", "  \n")
            title = "%02d" % kaanda_index
            title = sanscript.transliterate(title, sanscript.HK,
                                            sanscript.DEVANAGARI)
            md_file = MdFile(file_path=outfile_path)
            md_file.dump_to_file(metadata={"title": title},
                                 md=text,
                                 dry_run=False)
        except NoSuchElementException:
            # Some chapter pages are simply absent on the site; log and move on.
            logging.warning("Page missing! %s ", url)
# Example 2
def dump_devanaagarii(source_html, dest_file):
    """Convert a GRETIL IAST html file into a Devanagari markdown file.

    The html body is split at the GRETIL home-page URL line: everything
    before it is English front matter, everything after is the IAST text,
    which gets transliterated to Devanagari. Existing dest_file is kept.
    """
    if os.path.exists(dest_file):
        logging.warning("Skipping %s as it exists", dest_file)
        return
    logging.info("Processing %s to %s", source_html, dest_file)
    gretil_marker = "http://gretil.sub.uni-goettingen.de/gretil.htm"
    with codecs.open(source_html, "r", 'utf-8') as file_in:
        soup = BeautifulSoup(file_in.read(), 'lxml')
        metadata = {"title": soup.title.text.strip()}
        all_lines = soup.text.split("\n")
        # English intro runs up to (excluding) the GRETIL marker line.
        preamble = itertools.takewhile(
            lambda line: line.strip() != gretil_marker, all_lines)
        intro = "\n\n## Intro\n%s" % ("  \n".join(preamble))
        # The actual text starts right after that marker line.
        body_lines = itertools.dropwhile(
            lambda line: line.strip() != gretil_marker, all_lines)
        body = "  \n".join(list(body_lines)[1:])
        # Collapse runs of 3+ hard-break lines into a paragraph break.
        body = regex.sub("(  \n){3,}", "\n\n", body)
        body = sanscript.transliterate(data=body,
                                       _from=sanscript.IAST,
                                       _to=sanscript.DEVANAGARI)
        final_md = "%s\n\n## पाठः\n%s" % (intro, body)
        out_file = MdFile(file_path=dest_file, frontmatter_type="toml")
        out_file.dump_to_file(metadata=metadata, md=final_md, dry_run=False)
# Example 3
def dump_ics_md_pair(panchaanga, period_str):
  """Write an .ics calendar plus a matching markdown file for panchaanga,
  then explode the markdown into per-month files under a sibling directory.

  period_str is "<year_type>/<year>", e.g. "CE/2021".
  NOTE(review): `output_dir` is a module-level global defined elsewhere in
  this file — confirm it is set before this is called.
  """
  ics_calendar = ics.compute_calendar(panchaanga)
  (year_type, year) = period_str.split("/")
  year = int(year)
  out_path = get_canonical_path(city=panchaanga.city.name, computation_system_str=str(panchaanga.computation_system), year=year, year_type=year_type)
  # Plain concatenation — the original wrapped this in a pointless
  # single-argument os.path.join.
  output_file_ics = out_path + ".ics"
  ics.write_to_file(ics_calendar, output_file_ics)

  md_file = MdFile(file_path=output_file_ics.replace(".ics", ".md"), frontmatter_type=MdFile.YAML)
  intro = "## 00 Intro\n### Related files\n- [ics](../%s)\n" % str(os.path.basename(output_file_ics))
  md_content = "%s\n%s" % (intro, md.make_md(panchaanga=panchaanga))
  md_file.dump_to_file(metadata={"title": year}, md=md_content, dry_run=False)

  monthly_file_path = md_file.file_path.replace(".md", "_monthly.md")
  monthly_dir = monthly_file_path.replace(".md", "/")
  # Start from a clean monthly directory; ignore_errors covers "not there yet".
  shutil.rmtree(path=monthly_dir, ignore_errors=True)
  logging.info("%s exists? %s", monthly_dir, os.path.exists(monthly_dir))
  logging.info("Copying to %s", monthly_file_path)
  shutil.copy(md_file.file_path, monthly_file_path)
  monthly_md_file = MdFile(file_path=monthly_file_path)
  monthly_md_file.set_title_from_filename(dry_run=False, transliteration_target=None)
  monthly_md_file.split_to_bits(source_script=None, dry_run=False, indexed_title_pattern=None)
  MdFile.apply_function(fn=MdFile.split_to_bits, dir_path=monthly_dir, frontmatter_type=MdFile.TOML, source_script=None, dry_run=False, indexed_title_pattern=None)
  logging.info("%s exists? %s", monthly_dir, os.path.exists(monthly_dir))

  MdFile.fix_index_files(dir_path=output_dir, transliteration_target=None, dry_run=False)
# Example 4
def dump_text(base_dir):
    """Scrape every sUkta of the shaunaka saMhitA (atharvaveda) from
    vedicheritage.gov.in into base_dir/<kANDa>/<sUkta>.md.

    Existing output files are skipped so the crawl is resumable.
    NOTE(review): relies on module-level `browser` (selenium), `text_data`,
    `sanscript` and `MdFile` being set up elsewhere in this file.
    """
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__),
                                  "vedaH/shaunaka/samhitA.json")

    for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file,
                                                   unit_path_list=[]):
        subunit_list = text_data.get_subunit_list(
            json_file=unit_info_file, unit_path_list=[kaanda_index])
        for subunit_index in subunit_list:
            logging.info("kaanDa %d adhyaaya %d", kaanda_index, subunit_index)

            outfile_path = os.path.join(base_dir, "%02d" % (kaanda_index),
                                        "%03d.md" % subunit_index)
            if os.path.exists(outfile_path):
                # Lazy %-args, consistent with the module's logging style.
                logging.info("Skipping %s", outfile_path)
                continue

            url = "http://vedicheritage.gov.in/samhitas/atharvaveda-samhitas/shaunaka-samhita/kanda-%02d-sukta-%03d/" % (
                kaanda_index, subunit_index)
            logging.info("url %s to %s", url, outfile_path)
            browser.get(url=url)
            try:
                text = browser.find_element_by_id("videotext").text
                # Two trailing spaces force markdown hard line breaks.
                text = text.replace("\n", "  \n")
                title_tags = browser.find_elements_by_css_selector(
                    "#videotext  strong")
                title = "%03d" % subunit_index
                if len(title_tags) > 0:
                    title = "%03d %s" % (subunit_index, title_tags[0].text)
                title = sanscript.transliterate(title, sanscript.HK,
                                                sanscript.DEVANAGARI)
                md_file = MdFile(file_path=outfile_path)
                md_file.dump_to_file(metadata={"title": title},
                                     md=text,
                                     dry_run=False)
            except NoSuchElementException:
                # Consistent with the vAjasaneyi dumper above: a missing page
                # should not abort the whole crawl.
                logging.warning("Page missing! %s ", url)
# Example 5
def dump_markdown(src_file, dest_file):
    """Convert one itx source file to a TOML-frontmatter markdown file.

    The Devanagari title is derived by transliterating the itx metadata
    title (OPTITRANS) before writing.
    """
    logging.info("Processing %s to %s", src_file, dest_file)
    front_matter = get_metadata(src_file=src_file)
    body = get_text(src_file=src_file)
    front_matter["title"] = sanscript.transliterate(
        data=front_matter["itxtitle"],
        _from=sanscript.OPTITRANS,
        _to=sanscript.DEVANAGARI)
    MdFile(file_path=dest_file, frontmatter_type=MdFile.TOML).dump_to_file(
        metadata=front_matter, md=body, dry_run=False)
# Example 6
def dump_all_texts(dest_dir, overwrite=False):
    """Download every text linked from the adishila unicode index page and
    save each as a markdown file named after its title under dest_dir.

    When overwrite is False, files that already exist are kept.
    """
    soup = scraping.get_soup(url="https://adishila.com/unicodetxt-htm/")
    links = soup.select("div.wp-block-group a")
    for link in links:
        # Log before the fetch — the original logged "Getting" only after
        # get_text() had already downloaded the page.
        logging.info("Getting %s", link["href"])
        (title, text) = get_text(link["href"])
        filename = file_helper.clean_file_path("%s.md" % title)
        dest_path = os.path.join(dest_dir, filename)
        if not overwrite and os.path.exists(dest_path):
            logging.warning("Skipping %s since it exists", dest_path)
            continue
        md_file = MdFile(file_path=dest_path, frontmatter_type=MdFile.TOML)
        md_file.dump_to_file(metadata={"title": title}, md=text, dry_run=False)
# Example 7
def dump_summary(year, city, script=xsanscript.DEVANAGARI, computation_system=ComputationSystem.MULTI_NEW_MOON_SIDEREAL_MONTH_ADHIKA__CHITRA_180):
  """Compute an annual panchaanga for city/year and dump a TOML summary
  table plus a companion *_summary.md that embeds it as a spreadsheet.

  NOTE(review): `output_dir` is a module-level global defined elsewhere in
  this file — confirm it is set before this is called.
  """
  year_type = era.ERA_GREGORIAN
  logging.info("Generating summary panchaanga for %s year %d (%s), with computation system %s ", city.name, year, year_type, str(computation_system))
  panchaanga = annual.get_panchaanga_for_year(city=city, year=year, computation_system=computation_system, year_type=year_type, allow_precomputed=True)
  year_table = to_table_dict(panchaanga=panchaanga)
  out_path = get_canonical_path(city=panchaanga.city.name, computation_system_str=str(panchaanga.computation_system), year=year, year_type=year_type)
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
  # NOTE(review): no explicit encoding passed to codecs.open here — confirm
  # toml.dump output is ASCII-safe or add 'utf-8'.
  with codecs.open(out_path + ".toml", "w") as fp:
    toml.dump(year_table, fp)
  MdFile.fix_index_files(dir_path=output_dir, transliteration_target=None, dry_run=False)

  computation_params = get_computation_parameters_md(panchaanga=panchaanga, scripts=[script])
  out_path_md = out_path + "_summary.md"
  # Renamed from `md` to avoid shadowing the `md` module this file uses
  # elsewhere (md.make_md).
  summary_md = """## Intro\n%s\n\n## Table
  <div class="spreadsheet" src="../%s.toml" fullHeightWithRowsPerScreen=4> </div>""" % (computation_params, 
    str(year))
  md_file = MdFile(file_path=out_path_md)
  md_file.dump_to_file(metadata={"title": "%d Summary" % (year)}, md=summary_md, dry_run=False)
# Example 8
def process_catalog_page_selenium(url, out_dir):
    """Scrape one catalog record page: extract its metadata, follow the
    "View in Unicode transliteration" link, and dump the text as markdown.

    Records without a transliteration link, and records whose output file
    already exists, are skipped.
    """
    logging.info("Processing catalog %s", url)
    browser.get(url=url)
    text_links = browser.find_elements_by_link_text("View in Unicode transliteration")
    if len(text_links) == 0:
        logging.warning("%s does not have text", url)
        return

    catalog_body = browser.find_element_by_css_selector(".catalog_record_body")
    metadata = get_front_matter(catalog_body.get_attribute('innerHTML'))
    logging.info(metadata)

    dest_file_path = get_file_path(out_dir=out_dir, title_iast=metadata["title_iast"], author_iast=metadata.get("author_iast", ""), catalog_number=metadata.get("Catalog number", ""))
    if os.path.exists(dest_file_path):
        logging.warning("Skipping %s - already exists.", dest_file_path)
        # BUG FIX: the original logged "Skipping" but fell through and
        # overwrote the existing file; actually skip it.
        return

    text_url = text_links[0].get_attribute("href")
    file = MdFile(file_path=dest_file_path, frontmatter_type="toml")
    text = get_text(url=text_url)
    # Two trailing spaces force markdown hard line breaks.
    text = text.replace("\n", "  \n")
    file.dump_to_file(metadata=metadata, md=text, dry_run=False)
# Example 9
def transform():
    """Aggregate per-Rk JSON files (with sAyaNa bhAShya) into one markdown
    file per sUkta under the module-level `dest_dir`.

    Each Rk contributes: devatA/RShi/chandas attribution, samhitA lines,
    Rk number, padapATha lines, and the bhAShya (with html tags stripped).
    NOTE(review): `dest_dir` is defined elsewhere in this file — confirm.
    """
    json_paths = glob.glob(
        "/home/vvasuki/sanskrit/raw_etexts/veda/Rg/shakala/saMhitA/sAyaNabhAShyam/*/*/*.json",
        recursive=True)
    suukta_id_to_md = {}
    for json_path in sorted(json_paths):
        with codecs.open(json_path, "r") as fp:
            rk = json.load(fp)
            suukta_id = "%02d/%03d" % (int(rk["classification"]["mandala"]),
                                       int(rk["classification"]["sukta"]))
            suukta_md = suukta_id_to_md.get(suukta_id, "")
            # Strip html tags from the commentary.
            bhaashya = regex.sub("<.+?>", "", rk["sayanaBhashya"])
            rk_number = sanscript.transliterate(
                "%02d" % int(rk["classification"]["rik"]), sanscript.IAST,
                sanscript.DEVANAGARI)
            attribute_str = "%s। %s। %s।" % (rk["attribute"]["devata"],
                                             rk["attribute"]["rishi"],
                                             rk["attribute"]["chandas"])
            # Single-line entries come through as plain strings; normalize.
            padapaatha_lines = rk["padapaatha"]["lines"]
            if isinstance(padapaatha_lines, str):
                padapaatha_lines = [padapaatha_lines]
            samhita_lines = rk["samhitaAux"]["lines"]
            if isinstance(samhita_lines, str):
                samhita_lines = [samhita_lines]
            rk_md = "%s\n\n%s %s॥\n\n%s\n\n%s" % (
                attribute_str, "  \n".join(samhita_lines), rk_number,
                "  \n".join(padapaatha_lines), bhaashya)
            suukta_md += "\n\n%s" % rk_md
            if bhaashya == "":
                logging.warning("No bhAShya for %s", rk["id"])
            suukta_id_to_md[suukta_id] = suukta_md

    # Iterate items() rather than keys() plus a second dict lookup.
    for suukta_id, suukta_md in suukta_id_to_md.items():
        dest_path = os.path.join(dest_dir, suukta_id + ".md")
        md_file = MdFile(file_path=dest_path)
        title = sanscript.transliterate(
            suukta_id.split("/")[-1], sanscript.IAST, sanscript.DEVANAGARI)
        md_file.dump_to_file(metadata={"title": title},
                             md=suukta_md,
                             dry_run=False)
# Example 10
def test_panchanga_chennai_2019():
    """Golden-file test: markdown rendered from the Chennai-2019 panchaanga
    must match the checked-in reference output.

    If the reference file is absent it is regenerated (the assumed workflow
    for deliberately updating golden files).
    """
    panchaanga_2019 = Panchaanga.read_from_file(
        filename=os.path.join(TEST_DATA_PATH, 'Chennai-2019.json'))
    # We dump to md.txt rather than md to avoid slow checks on intellij ide.
    orig_md_file = os.path.join(TEST_DATA_PATH,
                                'Chennai-2019-devanagari.md.txt')
    current_md_output = os.path.join(TEST_DATA_PATH,
                                     'Chennai-2019-devanagari.md.txt.local')
    md_file = MdFile(file_path=current_md_output)
    md_file.dump_to_file(metadata={"title": str(2019)},
                         md=md.make_md(panchaanga=panchaanga_2019),
                         dry_run=False)
    if not os.path.exists(orig_md_file):
        logging.warning(
            "%s not present. Assuming that it was deliberately deleted to update test files.",
            orig_md_file)
        md_file = MdFile(file_path=orig_md_file)
        md_file.dump_to_file(metadata={"title": str(2019)},
                             md=md.make_md(panchaanga=panchaanga_2019),
                             dry_run=False)

    # Single with-statement instead of the original nested pair.
    with open(orig_md_file) as orig_tex, open(current_md_output) as current_tex:
        assert current_tex.read() == orig_tex.read()