def set_titles_from_spreadsheet(dir_path, dry_run=False):
  """Pull final adhyAya titles from the kumbhakona worksheet, then render titles in devanAgarI.

  :param dir_path: directory containing the adhyAya md files.
  :param dry_run: passed through to the MdFile operations; no files are written when True.
  """
  # NOTE: "spreadhsheet_id" (sic) is the keyword the callee expects; do not "fix" the spelling here.
  MdFile.fix_field_values(
    md_files=mahaabhaarata.get_adhyaaya_md_files(dir_path),
    spreadhsheet_id="1sNH1AWhhoa5VATqMdLbF652s7srTG0Raa6K-sCwDR-8",
    worksheet_name="कुम्भकोणाध्यायाः",
    id_column="क्रमाङ्कम्",
    value_column="अन्तिमशीर्षिका",
    md_file_to_id=mahaabhaarata.get_adhyaaya_id,
    dry_run=dry_run)
  # Re-list the files rather than reusing the list above (matches original behavior).
  MdFile.devanaagarify_titles(
    md_files=mahaabhaarata.get_adhyaaya_md_files(dir_path), dry_run=dry_run)
def dump_text(base_dir):
  """Scrape vAjasaneyi-kANva saMhitA chapters from vedicheritage.gov.in into per-chapter md files.

  Existing output files are skipped; missing pages are logged and skipped.
  """
  unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "vedaH/vAjasaneyi/samhitA.json")
  for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
    logging.info("adhyAya %d", kaanda_index)
    outfile_path = os.path.join(base_dir, "%02d.md" % (kaanda_index))
    if os.path.exists(outfile_path):
      logging.info("Skipping " + outfile_path)
      continue
    url = "http://vedicheritage.gov.in/samhitas/yajurveda/shukla-yajurveda/vajasaneyi-kanva-samhita-chapter-%02d/" % (
      kaanda_index)
    logging.info("url %s to %s", url, outfile_path)
    browser.get(url=url)
    try:
      page_text = browser.find_element_by_id("videotext").text
      # Trailing space before \n forces a markdown hard line-break.
      page_text = page_text.replace("\n", " \n")
      chapter_title = sanscript.transliterate("%02d" % kaanda_index, sanscript.HK, sanscript.DEVANAGARI)
      out_md = MdFile(file_path=outfile_path)
      out_md.dump_to_file(metadata={"title": chapter_title}, content=page_text, dry_run=False)
    except NoSuchElementException:
      logging.warning("Page missing! %s ", url)
def process_catalog_page_selenium(url, out_dir):
  """Scrape one catalog page: extract front-matter and the Unicode-transliteration text into an md file.

  :param url: catalog record page URL.
  :param out_dir: base directory for the dumped md file.
  """
  logging.info("Processing catalog %s", url)
  browser.get(url=url)
  text_links = browser.find_elements_by_link_text(
    "View in Unicode transliteration")
  if len(text_links) == 0:
    logging.warning("%s does not have text", url)
    return
  catalog_body = browser.find_element_by_css_selector(".catalog_record_body")
  metadata = get_front_matter(catalog_body.get_attribute('innerHTML'))
  logging.info(metadata)
  dest_file_path = get_file_path(out_dir=out_dir, title_iast=metadata["title_iast"],
                                 author_iast=metadata.get("author_iast", ""),
                                 catalog_number=metadata.get("Catalog number", ""))
  if os.path.exists(dest_file_path):
    logging.warning("Skipping %s - already exists.", dest_file_path)
    # BUGFIX: previously fell through here and overwrote the existing file despite the log message.
    return
  text_url = text_links[0].get_attribute("href")
  # Renamed from `file` to avoid shadowing the builtin.
  md_file = MdFile(file_path=dest_file_path, frontmatter_type="toml")
  text = get_text(url=text_url)
  text = text.replace("\n", " \n")
  md_file.dump_to_file(metadata=metadata, content=text, dry_run=False)
def dump_devanaagarii(source_html, dest_file):
  """Convert a GRETIL IAST html file into an md file: English intro + devanAgarI body.

  The GRETIL home URL line separates the English preamble from the IAST text.
  """
  if os.path.exists(dest_file):
    logging.warning("Skipping %s as it exists", dest_file)
    return
  logging.info("Processing %s to %s", source_html, dest_file)
  with codecs.open(source_html, "r", 'utf-8') as file_in:
    soup = BeautifulSoup(file_in.read(), 'lxml')
  metadata = {"title": soup.title.text.strip()}
  lines = soup.text.split("\n")
  marker = "http://gretil.sub.uni-goettingen.de/gretil.htm"
  english_lines = itertools.takewhile(lambda x: x.strip() != marker, lines)
  intro = "\n\n## Intro\n%s" % (" \n".join(english_lines))
  iast_lines = itertools.dropwhile(lambda x: x.strip() != marker, lines)
  body = " \n".join(list(iast_lines)[1:])
  # Collapse runs of 3+ blank (hard-break) lines into a paragraph break.
  body = regex.sub("( \n){3,}", "\n\n", body)
  body = sanscript.transliterate(data=body, _from=sanscript.IAST, _to=sanscript.DEVANAGARI)
  final_text = "%s\n\n## पाठः\n%s" % (intro, body)
  out_file = MdFile(file_path=dest_file, frontmatter_type="toml")
  out_file.dump_to_file(metadata=metadata, content=final_text, dry_run=False)
def dump_text(base_dir):
  """Scrape shaunaka saMhitA (kANDa/sUkta) pages from vedicheritage.gov.in into md files.

  Spins up its own Chrome instance and closes it at the end.
  """
  opts = options.Options()
  opts.headless = False
  browser = webdriver.Chrome(options=opts)
  browser.implicitly_wait(6)
  unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "vedaH/shaunaka/samhitA.json")
  for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
    subunit_list = text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[kaanda_index])
    for subunit_index in subunit_list:
      logging.info("kaanDa %d adhyaaya %d", kaanda_index, subunit_index)
      outfile_path = os.path.join(base_dir, "%02d" % (kaanda_index), "%03d.md" % subunit_index)
      if os.path.exists(outfile_path):
        logging.info("Skipping " + outfile_path)
        continue
      url = "http://vedicheritage.gov.in/samhitas/atharvaveda-samhitas/shaunaka-samhita/kanda-%02d-sukta-%03d/" % (
        kaanda_index, subunit_index)
      logging.info("url %s to %s", url, outfile_path)
      browser.get(url=url)
      page_text = browser.find_element_by_id("videotext").text.replace("\n", " \n")
      # Prefix the sUkta number; append the page's own heading when present.
      title_tags = browser.find_elements_by_css_selector("#videotext strong")
      if len(title_tags) > 0:
        title = "%03d %s" % (subunit_index, title_tags[0].text)
      else:
        title = "%03d" % subunit_index
      title = sanscript.transliterate(title, sanscript.HK, sanscript.DEVANAGARI)
      MdFile(file_path=outfile_path).dump_to_file(metadata={"title": title}, content=page_text, dry_run=False)
  browser.close()
def dump_text(browser, outdir):
  """Dump the currently displayed result spans into <outdir>/<deduced-name>.md."""
  text_name = deduce_text_name(browser)
  out_file_path = get_output_path(text_name=text_name, outdir=outdir)
  text_spans = browser.find_elements_by_css_selector("#gvResults tr[valign=\"top\"] td span")
  segments = []
  for span in text_spans:
    # Trailing space before \n is a markdown hard line-break.
    segments.append(span.text.strip().replace("\n", " \n"))
  MdFile(file_path=out_file_path).dump_to_file(
    metadata={"title": text_name}, content="\n\n".join(segments), dry_run=False)
def dump_markdown(src_file, dest_file):
  """Convert one source file to an md file, deriving a devanAgarI title from its itxtitle metadata."""
  logging.info("Processing %s to %s", src_file, dest_file)
  metadata = get_metadata(src_file=src_file)
  content = get_text(src_file=src_file)
  metadata["title"] = sanscript.transliterate(
    data=metadata["itxtitle"], _from=sanscript.OPTITRANS, _to=sanscript.DEVANAGARI)
  out_md = MdFile(file_path=dest_file, frontmatter_type=MdFile.TOML)
  out_md.dump_to_file(metadata=metadata, content=content, dry_run=False)
def dump_all_texts(dest_dir, overwrite=False):
  """Download every linked text from the adishila unicode index into dest_dir.

  The destination filename is derived from the fetched title, so each link must be
  fetched before the skip check can run.

  :param dest_dir: output directory.
  :param overwrite: when False, existing files are left untouched.
  """
  soup = scraping.get_soup(url="https://adishila.com/unicodetxt-htm/")
  links = soup.select("div.wp-block-group a")
  for link in links:
    # BUGFIX: log before fetching; previously "Getting" was logged after the download
    # (and after a potential skip decision), which made the log misleading.
    logging.info("Getting %s", link["href"])
    (title, text) = get_text(link["href"])
    filename = file_helper.clean_file_path("%s.md" % title)
    dest_path = os.path.join(dest_dir, filename)
    if not overwrite and os.path.exists(dest_path):
      logging.warning("Skipping %s since it exists", dest_path)
      continue
    md_file = MdFile(file_path=dest_path, frontmatter_type=MdFile.TOML)
    md_file.dump_to_file(metadata={"title": title}, content=text, dry_run=False)
def apply_function(fn, dir_path, file_pattern="**/*.md", file_name_filter=None, frontmatter_type="yaml", start_file=None, *args, **kwargs):
  """Apply fn(md_file, *args, **kwargs) to every md file under dir_path.

  :param fn: callable taking an MdFile as its first argument.
  :param dir_path: directory to scan; a single file is also accepted.
  :param start_file: when given, files before this path (in sorted order) are skipped.
  """
  if os.path.isfile(dir_path):
    logging.warning("Got a file actually. processing it!")
    md_files = [MdFile(file_path=dir_path)]
  else:
    md_files = get_md_files_from_path(dir_path=dir_path, file_pattern=file_pattern,
                                      file_name_filter=file_name_filter, frontmatter_type=frontmatter_type)
  logging.info("Processing %d files.", len(md_files))
  from tqdm import tqdm
  # When no start_file is given, we are "past" it from the beginning.
  start_file_reached = start_file is None
  for md_file in tqdm(md_files):
    if not start_file_reached:
      if str(md_file.file_path) != start_file:
        continue
      start_file_reached = True
    logging.info("Processing %s", md_file)
    fn(md_file, *args, **kwargs)
def migrate_and_include(files, location_computer, new_url_computer, dry_run=False):
  """Move each file to its computed location, leaving a js_include stub at the old path.

  :param location_computer: old path (str) -> new filesystem path.
  :param new_url_computer: old path (str) -> URL for the js_include stub.
  """
  logging.info("Processing %d files", len(files))
  for f in files:
    old_path_str = str(f)
    new_path = location_computer(old_path_str)
    logging.info("Moving %s to %s", old_path_str, new_path)
    md_file = MdFile(file_path=f)
    (metadata, _) = md_file.read_md_file()
    if not dry_run:
      os.makedirs(os.path.dirname(new_path), exist_ok=True)
      os.rename(src=f, dst=new_path)
    stub = """<div class="js_include" url="%s" newLevelForH1="1" includeTitle="true"> </div>""" % new_url_computer(
      old_path_str)
    logging.info("Inclusion in old file : %s", stub)
    # md_file still points at the old path: the stub is written there.
    md_file.dump_to_file(metadata=metadata, content=stub, dry_run=dry_run)
def get_titles():
  """Log adhyAya titles with any leading numeral prefix (devanAgarI or ASCII) stripped.

  Relies on the module-level md_file_path.
  """
  titles = MdFile.get_metadata_field_values(
    md_files=raamaayana.get_adhyaaya_md_files(md_file_path), field_name="title")
  unnumbered = [regex.sub("^[०-९0-9]+ ", "", title) for title in titles]
  logging.info("\n".join(unnumbered))
def dump_ics_md_pair(panchaanga, period_str):
  """Write the panchaanga as an .ics/.md pair, then build monthly split files and fix indices.

  :param panchaanga: panchaanga object (provides city, computation_system; rendered via md.make_md).
  :param period_str: "<year_type>/<year>", e.g. "CE/2021".
  """
  ics_calendar = ics.compute_calendar(panchaanga)
  (year_type, year) = period_str.split("/")
  year = int(year)
  out_path = get_canonical_path(city=panchaanga.city.name,
                                computation_system_str=str(panchaanga.computation_system),
                                year=year, year_type=year_type)
  # BUGFIX(idiom): os.path.join() with a single argument was a no-op wrapper.
  output_file_ics = out_path + ".ics"
  ics.write_to_file(ics_calendar, output_file_ics)
  md_file = MdFile(file_path=output_file_ics.replace(".ics", ".md"), frontmatter_type=MdFile.YAML)
  intro = "## 00 Intro\n### Related files\n- [ics](../%s)\n" % str(os.path.basename(output_file_ics))
  md_content = "%s\n%s" % (intro, md.make_md(panchaanga=panchaanga))
  md_file.dump_to_file(metadata={"title": year}, content=md_content, dry_run=False)
  # Build the per-month variant: copy, retitle, then split into bits (twice: file, then dir).
  monthly_file_path = md_file.file_path.replace(".md", "_monthly.md")
  monthly_dir = monthly_file_path.replace(".md", "/")
  shutil.rmtree(path=monthly_dir, ignore_errors=True)
  logging.info("%s exists? %s", monthly_dir, os.path.exists(monthly_dir))
  logging.info("Copying to %s", monthly_file_path)
  shutil.copy(md_file.file_path, monthly_file_path)
  monthly_md_file = MdFile(file_path=monthly_file_path)
  monthly_md_file.set_title_from_filename(dry_run=False, transliteration_target=None)
  monthly_md_file.split_to_bits(source_script=None, dry_run=False, indexed_title_pattern=None)
  library.apply_function(fn=MdFile.split_to_bits, dir_path=monthly_dir, frontmatter_type=MdFile.TOML,
                         source_script=None, dry_run=False, indexed_title_pattern=None)
  logging.info("%s exists? %s", monthly_dir, os.path.exists(monthly_dir))
  # NOTE(review): output_dir is a module-level name, not derived from out_path — confirm intended.
  library.fix_index_files(dir_path=output_dir, transliteration_target=None, dry_run=False)
def import_md_recursive(source_dir, file_extension, source_format=None, dry_run=False):
  """Convert every *.<file_extension> file under source_dir to a sibling .md file via pandoc.

  :param file_extension: extension without the dot, e.g. "docx".
  :param source_format: pandoc source format; defaults to file_extension.
  """
  from pathlib import Path
  source_paths = sorted(Path(source_dir).glob("**/*." + file_extension))
  if source_format is None:
    source_format = file_extension
  for source_path in source_paths:
    # BUGFIX: str.replace() replaced EVERY occurrence of ".<ext>" (e.g. a directory
    # named "x.docx_files" would be mangled). The glob guarantees the suffix, so
    # strip exactly the trailing ".<ext>".
    md_path = str(source_path)[:-(len(file_extension) + 1)] + ".md"
    md_path = file_helper.clean_file_path(md_path)
    if os.path.exists(md_path):
      logging.info("Skipping %s", md_path)
      continue
    logging.info("Processing %s to %s", source_path, md_path)
    md_file = MdFile(file_path=md_path, frontmatter_type=MdFile.TOML)
    md_file.import_with_pandoc(source_file=source_path, source_format=source_format, dry_run=dry_run)
def dump_item(title, item_url, outfile_path, get_collapsible_content):
  """Scrape one wiki item page to outfile_path and set its md title.

  :param get_collapsible_content: when True, read .mw-collapsible-content;
      otherwise prefer div.poem, falling back to the article paragraphs.
  """
  if os.path.exists(outfile_path):
    logging.info("skipping: %s - it exists already", outfile_path)
    return
  logging.info(item_url)
  browser.get(item_url)
  text = ""
  if not get_collapsible_content:
    try:
      text = browser.find_element_by_css_selector("div.poem").text
    except NoSuchElementException:
      content_element = browser.find_element_by_css_selector(".mw-parser-output")
      para_elements = content_element.find_elements_by_tag_name("p")
      text = "\n\n".join(map(lambda x: x.text, para_elements))
  else:
    text = browser.find_element_by_css_selector(".mw-collapsible-content").text
  os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
  # BUGFIX: the scraped text is Devanagari; writing without an explicit encoding
  # fails on platforms whose locale encoding cannot represent it. Also use write()
  # rather than writelines() on a plain string (writelines iterates characters).
  with open(outfile_path, "w", encoding="utf-8") as outfile:
    outfile.write(text.replace("\n", " \n"))
  md_file = MdFile(file_path=outfile_path)
  md_file.set_title(title=title, dry_run=False)
def scrape_post_markdown(url, dir_path, dry_run):
  """Scrape one blog post into a dated md file under dir_path.

  The file name is derived from the post URL's path, e.g.
  /2020/06/08/slug/ -> 2020/06/2020-06-08_slug.md.
  """
  parsed_url = urlsplit(url=url)
  raw_name = (parsed_url.path).strip()
  # remove slashes, replace with dashes when dealing with urls like https://manasataramgini.wordpress.com/2020/06/08/pandemic-days-the-fizz-is-out-of-the-bottle/
  raw_name = regex.sub("/(....)/(..)/(..)/(.+)/", r"\1/\2/\1-\2-\3_\4.md", raw_name)
  file_path = file_helper.clean_file_path(file_path=os.path.join(dir_path, raw_name))
  if os.path.exists(file_path):
    logging.warning("Skipping %s : exists", raw_name)
    return
  (title, post_html) = get_post_html(url=url)
  logging.info("Dumping %s to %s with title %s.", url, file_path, title)
  out_md = MdFile(file_path=file_path, frontmatter_type=MdFile.TOML)
  out_md.import_content_with_pandoc(metadata={"title": title}, content=post_html,
                                    source_format="html", dry_run=dry_run)
def dump_content(soup, out_file_path, metadata, dry_run):
  """Extract the main content cell (or body) from soup and import it as md.

  Falls back to a filename-derived title when no metadata was supplied.
  """
  # Prefer the 60%-wide cell, then the 80%-wide cell, then the whole body.
  candidates = soup.select("td[width=\"60%\"]") + soup.select("td[width=\"80%\"]") + soup.select("body")
  content = fix_text(candidates[0].decode_contents(formatter="html"))
  out_md = MdFile(file_path=out_file_path)
  out_md.import_content_with_pandoc(content=content, source_format="html",
                                    dry_run=dry_run, metadata=metadata)
  if metadata == {}:
    out_md.set_title_from_filename(transliteration_target=None, dry_run=dry_run)
def get_structured_text(browser, start_nodes, base_dir, unit_info_file):
  """Walk every subunit path on the site, dumping each unit's spans into an md file tree."""

  def open_path(subunit_path, unit_data):
    # Click down the hierarchy, one "<unitName><index>" link per level.
    logging.debug(list(zip(subunit_path, unit_data["unitNameListInSite"])))
    for (subunit, unitNameInSite) in zip(subunit_path, unit_data["unitNameListInSite"]):
      element_text = "%s%d" % (unitNameInSite, subunit)
      click_link_by_text(browser=browser, element_text=element_text)

  def close_path(subunit_path, unit_data):
    # Click back up the hierarchy in reverse order.
    logging.info(list(zip(reversed(subunit_path), reversed(unit_data["unitNameListInSite"]))))
    for (subunit, unitNameInSite) in list(zip(reversed(subunit_path), reversed(unit_data["unitNameListInSite"]))):
      element_text = "%s%d" % (unitNameInSite, subunit)
      logging.info(element_text)
      click_link_by_text(browser=browser, element_text=element_text)

  browse_nodes(browser=browser, start_nodes=start_nodes)
  os.makedirs(name=base_dir, exist_ok=True)
  unit_data = text_data.get_subunit_data(unit_info_file, [])
  for subunit_path in text_data.get_subunit_path_list(json_file=unit_info_file, unit_path_list=[]):
    try:
      open_path(subunit_path=subunit_path, unit_data=unit_data)
    except NoSuchElementException as e:
      close_path(subunit_path=subunit_path, unit_data=unit_data)
      exit()
      # NOTE(review): the two statements below are unreachable because of exit() above —
      # either the exit() is debug leftover or the skip logic is dead. Confirm intent.
      logging.warning("Skipping as Could not find element " + str(traceback.format_exc()))
      continue
    outfile_path = os.path.join(base_dir, "/".join(map(str, subunit_path)) + ".md")
    if os.path.exists(outfile_path):
      logging.info("Skipping " + outfile_path)
    else:
      text_spans = browser.find_element_by_id("divResults").find_elements_by_tag_name("span")
      lines = ["\n", "\n"]
      for span in text_spans:
        lines.append(span.text + " \n")
      os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
      with open(outfile_path, "w") as outfile:
        outfile.writelines(lines)
    # Close the kANDa - else the driver may pick sarga from this kANDa when it is to pick the sarga from the next kANDa?!
    close_path(subunit_path=subunit_path, unit_data=unit_data)
  MdFile.fix_index_files(dir_path=base_dir)
def dump_summary(year, city, script=sanscript.DEVANAGARI,
                 computation_system=ComputationSystem.MULTI_NEW_MOON_SIDEREAL_MONTH_ADHIKA__CHITRA_180,
                 allow_precomputed=False):
  """Generate a year's panchaanga summary: a toml table plus a companion _summary.md."""
  year_type = era.ERA_GREGORIAN
  logging.info(
    "Generating summary panchaanga for %s year %d (%s), with computation system %s ",
    city.name, year, year_type, str(computation_system))
  panchaanga = annual.get_panchaanga_for_year(
    city=city, year=year, computation_system=computation_system,
    year_type=year_type, allow_precomputed=allow_precomputed)
  year_table = to_table_dict(panchaanga=panchaanga)
  out_path = get_canonical_path(city=panchaanga.city.name,
                                computation_system_str=str(panchaanga.computation_system),
                                year=year, year_type=year_type)
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
  with codecs.open(out_path + ".toml", "w") as fp:
    toml.dump(year_table, fp)
  library.fix_index_files(dir_path=output_dir, transliteration_target=None, dry_run=False)
  computation_params = get_computation_parameters_md(panchaanga=panchaanga, scripts=[script])
  out_path_md = out_path + "_summary.md"
  summary_md = """## Intro\n%s\n\n## Table
<div class="spreadsheet" src="../%s.toml" fullHeightWithRowsPerScreen=4> </div>""" % (
    computation_params, str(year))
  out_md = MdFile(file_path=out_path_md)
  out_md.dump_to_file(metadata={"title": "%d Summary" % (year)}, content=summary_md, dry_run=False)
def get_md_files_from_path(dir_path, file_pattern, file_name_filter=None, frontmatter_type="yaml"):
  """Return MdFile objects for files under dir_path matching file_pattern, sorted by path.

  :param file_name_filter: optional predicate applied to each Path before wrapping.
  """
  from pathlib import Path
  matched_paths = sorted(filter(file_name_filter, Path(dir_path).glob(file_pattern)))
  md_files = []
  for path in matched_paths:
    md_files.append(MdFile(path, frontmatter_type=frontmatter_type))
  return md_files
def make_full_text_md(source_dir, dry_run=False):
  """Recursively build a full.md per directory that js_include-s _index.md and every child md.

  Directories named "images" and the files full.md/_index.md themselves are skipped
  as inclusion targets (the _index.md contributes the title and an untitled include).
  """
  md = ""
  title = "पूर्णपाठः"
  rel_url = "../"
  num_md_files = 0
  index_md_path = os.path.join(source_dir, "_index.md")
  if os.path.exists(index_md_path):
    index_md = MdFile(file_path=index_md_path)
    (index_yml, _) = index_md.read_md_file()
    title = "%s (%s)" % (index_yml["title"], title)
    md = "%s\n%s" % (
      md,
      """<div class="js_include" url="%s" newLevelForH1="1" includeTitle="false"> </div>""" % (rel_url).strip())
    num_md_files = num_md_files + 1
  for subfile in sorted(os.listdir(source_dir)):
    subfile_path = os.path.join(source_dir, subfile)
    if os.path.isdir(subfile_path):
      if subfile not in ["images"]:
        # BUGFIX: propagate dry_run into the recursion (it previously defaulted to
        # False, writing files even on a dry run).
        make_full_text_md(source_dir=subfile_path, dry_run=dry_run)
        sub_md_file_path = os.path.join(subfile, "full.md")
      else:
        continue
    else:
      if subfile in ("full.md", "_index.md") or not str(subfile).endswith(".md"):
        continue
      sub_md_file_path = subfile
    num_md_files = num_md_files + 1
    # BUGFIX: anchor the pattern so only the trailing ".md" becomes "/" (a directory
    # named "x.md" earlier in the path is left alone).
    rel_url = os.path.join("..", regex.sub(r"\.md$", "/", sub_md_file_path))
    md = "%s\n%s" % (
      md,
      """<div class="js_include" url="%s" newLevelForH1="1" includeTitle="true"> </div>""" % (rel_url).strip())
  if num_md_files > 0:
    full_md_path = os.path.join(source_dir, "full.md")
    full_md = MdFile(file_path=full_md_path)
    full_md.dump_to_file(content=md, metadata={"title": title}, dry_run=dry_run)
  else:
    logging.info("No md files found in %s. Skipping.", source_dir)
def fix_index_files(dir_path, frontmatter_type=MdFile.TOML, transliteration_target=sanscript.DEVANAGARI, overwrite=False, dry_run=False):
  """Ensure every directory under dir_path has an _index.md titled after the directory name.

  :param overwrite: when True, re-derive the title even for pre-existing index files.
  """
  logging.info("Fixing index files")
  # All non-hidden directories (any path containing a "/." component is skipped).
  dir_list = [x[0] for x in os.walk(dir_path) if "/." not in x[0]]
  for subdir in dir_list:  # renamed from `dir` to avoid shadowing the builtin
    index_file = MdFile(file_path=os.path.join(subdir, "_index.md"), frontmatter_type=frontmatter_type)
    index_missing = not os.path.exists(index_file.file_path)
    if index_missing:
      index_file.dump_to_file(metadata={}, content="", dry_run=dry_run)
    # Consolidated: both the "just created" and the overwrite branches set the title.
    if index_missing or overwrite:
      index_file.set_title_from_filename(
        transliteration_target=transliteration_target, dry_run=dry_run)
def dump_text_from_element(url, outfile_path, text_css_selector, title_maker, title_prefix="", html_fixer=None, md_fixer=None, dry_run=False):
  """Fetch url, dump the selected element as md to outfile_path, and return the unaltered soup.

  Always fetches and returns the page's original BeautifulSoup (callers may need it
  to navigate onward) — even when outfile_path already exists and dumping is skipped.
  """
  logging.info("Dumping: %s to %s", url, outfile_path)
  html = get_html(url=url)
  unaltered_soup = BeautifulSoup(html, 'html.parser')
  soup = BeautifulSoup(html, 'html.parser')
  if html_fixer is not None:
    html_fixer(soup)
  metadata = {"title": title_maker(soup, title_prefix)}
  # We definitely want to return the original html even if the file exists - we may need to navigate to the next element.
  # BUGFIX: a duplicate existence check at the top of the function returned None before
  # fetching, contradicting the contract above; only this check remains.
  if os.path.exists(outfile_path):
    logging.info("Skipping dumping: %s to %s", url, outfile_path)
    return unaltered_soup
  content = content_from_element(soup=soup, text_css_selector=text_css_selector, url=url)
  md_file = MdFile(file_path=outfile_path)
  md_file.import_content_with_pandoc(content=content, source_format="html", dry_run=dry_run, metadata=metadata)
  if md_fixer is not None:
    [_, md] = md_file.read_md_file()
    md = md_fixer(md)
    md_file.replace_content(new_content=md, dry_run=dry_run)
  logging.info("Done: %s to %s", url, outfile_path)
  return unaltered_soup
def test_panchanga_chennai_2019():
  """Golden-file test: md rendering of the Chennai 2019 panchaanga must match the stored output.

  Deleting the golden file regenerates it (useful for intentional updates).
  """
  panchaanga_2019 = Panchaanga.read_from_file(
    filename=os.path.join(TEST_DATA_PATH, 'Chennai-2019.json'))
  # We dump to md.txt rather than md to avoid slow checks on intellij ide.
  orig_md_file = os.path.join(TEST_DATA_PATH, 'Chennai-2019-devanagari.md.txt')
  current_md_output = os.path.join(TEST_DATA_PATH, 'Chennai-2019-devanagari.md.txt.local')
  rendered = md.make_md(panchaanga=panchaanga_2019)
  MdFile(file_path=current_md_output).dump_to_file(
    metadata={"title": str(2019)}, content=rendered, dry_run=False)
  if not os.path.exists(orig_md_file):
    logging.warning(
      "%s not present. Assuming that it was deliberately deleted to update test files.",
      orig_md_file)
    MdFile(file_path=orig_md_file).dump_to_file(
      metadata={"title": str(2019)}, content=rendered, dry_run=False)
  with open(orig_md_file) as orig_tex:
    with open(current_md_output) as current_tex:
      assert current_tex.read() == orig_tex.read()
def transcribe(audio_path, output_path, model_id="vosk-model-en-in-0.4"):
  """Transcribe an audio file with a local vosk model and dump the text to output_path.

  :param audio_path: any ffmpeg-readable audio file.
  :param output_path: destination md file.
  :param model_id: (Get models from https://alphacephei.com/vosk/models and extract in vosk_models folder.)
  :return: None
  """
  model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "vosk_models", model_id)
  sample_rate = 16000
  model = Model(model_path)
  rec = KaldiRecognizer(model, sample_rate)
  # Decode to 16kHz mono signed-16-bit PCM on stdout.
  process = subprocess.Popen([
    'ffmpeg', '-loglevel', 'quiet', '-i', audio_path, '-ar',
    str(sample_rate), '-ac', '1', '-f', 's16le', '-'
  ], stdout=subprocess.PIPE)
  text = ""
  while True:
    data = process.stdout.read(4000)
    if len(data) == 0:
      break
    if rec.AcceptWaveform(data):
      result = json.loads(rec.Result())
      text = "%s %s" % (text, result["text"])
  # BUGFIX: FinalResult() holds the trailing utterance; it was printed but never
  # appended, silently dropping the end of the transcription.
  final_result_json = rec.FinalResult()
  print(final_result_json)
  text = "%s %s" % (text, json.loads(final_result_json)["text"])
  # BUGFIX: reap the ffmpeg child instead of leaking the pipe and a zombie process.
  process.stdout.close()
  process.wait()
  MdFile(file_path=output_path).dump_to_file(metadata={}, content=text, dry_run=False)
def get_numbers():
  """Log the leading numeral (devanAgarI or ASCII) of each adhyAya's title_english field.

  Relies on the module-level md_file_path.
  """
  titles = MdFile.get_metadata_field_values(
    md_files=raamaayana.get_adhyaaya_md_files(md_file_path), field_name="title_english")
  leading_numbers = [regex.sub("^([०-९0-9]+) .+", "\\1", title) for title in titles]
  logging.info("\n".join(leading_numbers))
def transform(dry_run=False):
  """Convert per-Rk sAyaNa-bhAShya json files into per-Rk and per-sUkta md files.

  Pass 1 builds {suukta_id: {rk_number: rk_md}}; pass 2 writes each Rk file and a
  sUkta file that js_include-s its Rks. Uses module-level dest_dir_suuktas/dest_dir_Rks.
  """
  json_paths = glob.glob(
    "/home/vvasuki/sanskrit/raw_etexts/vedaH/Rg/shakala/saMhitA/sAyaNabhAShyam/*/*/*.json",
    recursive=True)
  suukta_id_to_rk_map = {}
  for json_path in sorted(json_paths):
    with codecs.open(json_path, "r") as fp:
      rk = json.load(fp)
    suukta_id = "%02d/%03d" % (int(rk["classification"]["mandala"]), int(rk["classification"]["sukta"]))
    suukta_rk_map = suukta_id_to_rk_map.get(suukta_id, {})
    # Strip html tags from the commentary.
    bhaashya = regex.sub("<.+?>", "", rk["sayanaBhashya"])
    rk_number = sanscript.transliterate(
      "%02d" % int(rk["classification"]["rik"]), sanscript.IAST, sanscript.DEVANAGARI)
    attribute_str = "%s। %s। %s।" % (
      rk["attribute"]["devata"], rk["attribute"]["rishi"], rk["attribute"]["chandas"])
    padapaatha_lines = rk["padapaatha"]["lines"]
    if isinstance(padapaatha_lines, str):
      padapaatha_lines = [padapaatha_lines]
    samhita_lines = rk["samhitaAux"]["lines"]
    if isinstance(samhita_lines, str):
      samhita_lines = [samhita_lines]
    rk_md = "## अधिमन्त्रम्\n%s\n\n## मन्त्रः\n%s\n\n## पदपाठः\n%s\n\n## भाष्यम्\n%s" % (
      attribute_str, " \n".join(samhita_lines), " \n".join(padapaatha_lines), bhaashya)
    suukta_rk_map[rk_number] = rk_md
    if bhaashya == "":
      logging.warning("No bhAShya for %s", rk["id"])
    suukta_id_to_rk_map[suukta_id] = suukta_rk_map
  for suukta_id in suukta_id_to_rk_map.keys():
    dest_path_suukta = os.path.join(dest_dir_suuktas, suukta_id + ".md")
    md_file_suukta = MdFile(file_path=dest_path_suukta)
    title = sanscript.transliterate(
      suukta_id.split("/")[-1], sanscript.IAST, sanscript.DEVANAGARI)
    rk_map = suukta_id_to_rk_map[suukta_id]
    suukta_md = ""
    for rk_id in sorted(rk_map.keys()):
      rk_md = rk_map[rk_id]
      dest_path_Rk = os.path.join(
        dest_dir_Rks, suukta_id,
        sanscript.transliterate(rk_id, sanscript.DEVANAGARI, sanscript.IAST) + ".md")
      md_file_Rk = MdFile(file_path=dest_path_Rk)
      rk_text = " ".join(
        doc_curation.md.section.get_section_lines(
          lines_in=rk_md.split("\n"), section_title="मन्त्रः"))
      from doc_curation import text_data
      title_Rk = text_data.get_rk_title(rk_id=rk_id, rk_text=rk_text)
      md_file_Rk.dump_to_file(metadata={"title": title_Rk}, content=rk_md, dry_run=dry_run)
      md_file_Rk.set_filename_from_title(
        transliteration_source=sanscript.DEVANAGARI, dry_run=dry_run)
      # The title-based rename may have changed the path; use the updated one in includes.
      dest_path_Rk = md_file_Rk.file_path
      suukta_md = suukta_md + """
      <div class="js_include" url="%s" newLevelForH1="2" includeTitle="false"> </div>
      """ % dest_path_Rk.replace("/home/vvasuki/vvasuki-git", "").replace("static/", "")
    import inspect
    md_file_suukta.dump_to_file(
      metadata={"title": title}, content=inspect.cleandoc(suukta_md), dry_run=dry_run)
def separate_rks(dry_run=False):
  """Split per-sUkta md files into per-Rk files and rebuild each sUkta from js_includes.

  Lines bearing vedic accent marks (॒/॑) delimit the mantra body from the preamble;
  a danda pair (॥) ends each Rk. Uses the module-level dest_dir_suuktas.
  """
  dest_dir_Rks = "/home/vvasuki/vvasuki-git/vedAH/static/atharva/shaunakam/rUDha-saMhitA/mUlam/"
  suukta_paths = glob.glob(
    "/home/vvasuki/vvasuki-git/vedAH/content/atharva/shaunakam/rUDha-saMhitA_alt/*/*.md",
    recursive=True)
  for suukta_path in suukta_paths:
    md_file = MdFile(file_path=suukta_path)
    [metadata, md] = md_file.read_md_file()
    lines = md.split("\n")

    def _lacks_accents(line):
      # Accent-free lines at the top are preamble ("meta") lines.
      return "॒" not in line and "॑" not in line

    meta_lines = list(itertools.takewhile(_lacks_accents, lines))
    lines = list(itertools.dropwhile(_lacks_accents, lines))
    lines = [line for line in lines if line != ""]
    rk_id = 0
    chapter_id = suukta_path.split("/")[-2]
    suukta_id = metadata["title"].split()[0]
    suukta_id_roman = sanscript.transliterate(suukta_id, sanscript.DEVANAGARI, sanscript.IAST)
    suukta_title = " ".join(metadata["title"].split()[1:]).replace("।", "").strip()
    dest_path_suukta = os.path.join(dest_dir_suuktas, chapter_id, suukta_id_roman + ".md")
    rk_map = {}
    while (len(lines) > 0):
      # One Rk: everything up to and including the next line containing ॥.
      lines_rk = list(itertools.takewhile(lambda line: "॥" not in line, lines))
      lines_rk.append(lines[len(lines_rk)])
      if len(lines) == len(lines_rk):
        lines = []
      else:
        lines = lines[len(lines_rk):]
      rk_id = rk_id + 1
      rk_md = "\n".join(lines_rk)
      rk_id_str = sanscript.transliterate("%02d" % rk_id, sanscript.IAST, sanscript.DEVANAGARI)
      from doc_curation import text_data
      title_Rk = text_data.get_rk_title(rk_id=rk_id_str, rk_text=rk_md)
      dest_path_Rk = os.path.join(
        dest_dir_Rks, chapter_id, suukta_id_roman,
        sanscript.transliterate(rk_id_str, sanscript.DEVANAGARI, sanscript.IAST) + ".md")
      md_file_Rk = MdFile(file_path=dest_path_Rk)
      md_file_Rk.dump_to_file(metadata={"title": title_Rk}, content=rk_md, dry_run=dry_run)
      md_file_Rk.set_filename_from_title(transliteration_source=sanscript.DEVANAGARI, dry_run=dry_run)
      rk_map[rk_id_str] = md_file_Rk.file_path
    suukta_md = ""
    for rk_id in sorted(rk_map.keys()):
      dest_path_Rk = rk_map[rk_id]
      suukta_md = suukta_md + """
<div class="js_include" url="%s" newLevelForH1="2" includeTitle="false"> </div>
""" % dest_path_Rk.replace("/home/vvasuki/vvasuki-git", "").replace("static/", "")
    import textwrap
    suukta_md = """
## परिचयः
%s

## पाठः
%s
""" % ("\n ".join(meta_lines), suukta_md)
    md_file_suukta = MdFile(file_path=dest_path_suukta)
    md_file_suukta.dump_to_file(
      metadata={"title": "%s %s" % (suukta_id, suukta_title)},
      content=textwrap.dedent(suukta_md), dry_run=dry_run)
    md_file_suukta.set_filename_from_title(transliteration_source=sanscript.DEVANAGARI, dry_run=dry_run)
from doc_curation.md.file import MdFile

# Remove all handlers associated with the root logger object.
for handler in logging.root.handlers[:]:
  logging.root.removeHandler(handler)
logging.basicConfig(
  level=logging.DEBUG,
  format="%(levelname)s:%(asctime)s:%(module)s:%(lineno)d %(message)s")


def set_titles_from_spreadsheet(dir_path, dry_run=False):
  """Pull final adhyAya titles from the kumbhakona worksheet, then devanAgarI-ify them."""
  # NOTE: "spreadhsheet_id" (sic) is the keyword the callee expects.
  MdFile.fix_field_values(
    md_files=mahaabhaarata.get_adhyaaya_md_files(dir_path),
    spreadhsheet_id="1sNH1AWhhoa5VATqMdLbF652s7srTG0Raa6K-sCwDR-8",
    worksheet_name="कुम्भकोणाध्यायाः",
    id_column="क्रमाङ्कम्",
    value_column="अन्तिमशीर्षिका",
    md_file_to_id=mahaabhaarata.get_adhyaaya_id,
    dry_run=dry_run)
  MdFile.devanaagarify_titles(
    md_files=mahaabhaarata.get_adhyaaya_md_files(dir_path), dry_run=dry_run)


def get_upaakhyaana_and_titles_from_path(dir_path, file_pattern="**/*.md"):
  """Print tab-separated (upaakhyaana, title) rows for every md file under dir_path."""
  md_files = MdFile.get_md_files_from_path(dir_path=dir_path, file_pattern=file_pattern)
  titles = [md_file.get_title() for md_file in md_files]
  upaakhyaanas = [md_file.get_upaakhyaana() for md_file in md_files]
  for row in zip(upaakhyaanas, titles):
    print("\t".join([str(i) for i in row]))


dir_path = "/home/vvasuki/vvasuki-git/kAvya/content/TIkA/padyam/purANam/mahAbhAratam/03-vana-parva/"
# set_titles_from_filenames(dir_path=dir_path, dry_run=True)
# get_upaakhyaana_and_titles_from_path(dir_path=dir_path)
MdFile.fix_index_files(dir_path=dir_path, dry_run=False)
# set_titles_from_spreadsheet(dir_path=dir_path, dry_run=False)
def get_upaakhyaana_and_titles_from_path(dir_path, file_pattern="**/*.md"):
  """Print tab-separated (upaakhyaana, title) rows for every md file under dir_path."""
  md_files = MdFile.get_md_files_from_path(dir_path=dir_path, file_pattern=file_pattern)
  # Read titles first, then upaakhyaanas (matches original evaluation order).
  titles = [md_file.get_title() for md_file in md_files]
  upaakhyaanas = [md_file.get_upaakhyaana() for md_file in md_files]
  for upaakhyaana, title in zip(upaakhyaanas, titles):
    print("\t".join(str(field) for field in (upaakhyaana, title)))