示例#1
0
def set_titles_from_spreadsheet(dir_path, dry_run=False):
    """Set adhyAya titles from a Google spreadsheet, then devanAgarIfy them."""
    # NOTE: "spreadhsheet_id" matches the (misspelt) parameter name of
    # MdFile.fix_field_values - do not "fix" it here.
    MdFile.fix_field_values(
        md_files=mahaabhaarata.get_adhyaaya_md_files(dir_path),
        spreadhsheet_id="1sNH1AWhhoa5VATqMdLbF652s7srTG0Raa6K-sCwDR-8",
        worksheet_name="कुम्भकोणाध्यायाः",
        id_column="क्रमाङ्कम्",
        value_column="अन्तिमशीर्षिका",
        md_file_to_id=mahaabhaarata.get_adhyaaya_id,
        dry_run=dry_run)
    MdFile.devanaagarify_titles(
        md_files=mahaabhaarata.get_adhyaaya_md_files(dir_path),
        dry_run=dry_run)
def dump_text(base_dir):
    """Scrape vAjasaneyi (kANva) saMhitA chapters into markdown files under base_dir.

    Relies on module-level `browser` (selenium) and `text_data` unit metadata.
    Chapters whose output file already exists are skipped; pages missing the
    "videotext" element are logged and skipped.
    """
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__),
                                  "vedaH/vAjasaneyi/samhitA.json")

    for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file,
                                                   unit_path_list=[]):
        logging.info("adhyAya %d", kaanda_index)

        outfile_path = os.path.join(base_dir, "%02d.md" % (kaanda_index))
        if os.path.exists(outfile_path):
            # Lazy %-style logging args (was eager string concatenation).
            logging.info("Skipping %s", outfile_path)
            continue

        url = "http://vedicheritage.gov.in/samhitas/yajurveda/shukla-yajurveda/vajasaneyi-kanva-samhita-chapter-%02d/" % (
            kaanda_index)
        logging.info("url %s to %s", url, outfile_path)
        browser.get(url=url)
        try:
            text = browser.find_element_by_id("videotext").text
            # Two trailing spaces force markdown line breaks.
            text = text.replace("\n", "  \n")
            title = "%02d" % kaanda_index
            title = sanscript.transliterate(title, sanscript.HK,
                                            sanscript.DEVANAGARI)
            md_file = MdFile(file_path=outfile_path)
            md_file.dump_to_file(metadata={"title": title},
                                 content=text,
                                 dry_run=False)
        except NoSuchElementException:
            logging.warning("Page missing! %s ", url)
def process_catalog_page_selenium(url, out_dir):
    """Scrape one catalog page and dump its unicode-transliteration text as md.

    Uses the module-level selenium `browser`. Does nothing if the page has no
    "View in Unicode transliteration" link, or if the destination file exists.
    """
    logging.info("Processing catalog %s", url)
    browser.get(url=url)
    text_links = browser.find_elements_by_link_text(
        "View in Unicode transliteration")
    if len(text_links) == 0:
        logging.warning("%s does not have text", url)
        return

    catalog_body = browser.find_element_by_css_selector(".catalog_record_body")
    metadata = get_front_matter(catalog_body.get_attribute('innerHTML'))
    logging.info(metadata)

    dest_file_path = get_file_path(out_dir=out_dir,
                                   title_iast=metadata["title_iast"],
                                   author_iast=metadata.get("author_iast", ""),
                                   catalog_number=metadata.get(
                                       "Catalog number", ""))
    if os.path.exists(dest_file_path):
        logging.warning("Skipping %s - already exists.", dest_file_path)
        # BUG FIX: without this return the existing file was re-fetched and
        # overwritten despite the "Skipping" message above.
        return

    text_url = text_links[0].get_attribute("href")
    md_file = MdFile(file_path=dest_file_path, frontmatter_type="toml")
    text = get_text(url=text_url)
    # Two trailing spaces force markdown line breaks.
    text = text.replace("\n", "  \n")
    md_file.dump_to_file(metadata=metadata, content=text, dry_run=False)
示例#4
0
def dump_devanaagarii(source_html, dest_file):
    """Convert a GRETIL IAST html file into a devanAgarI markdown file.

    Everything before the gretil.htm marker line becomes an "Intro" section;
    the remainder is transliterated IAST -> devanAgarI. Does nothing if
    dest_file already exists.
    """
    if os.path.exists(dest_file):
        logging.warning("Skipping %s as it exists", dest_file)
        return
    logging.info("Processing %s to %s", source_html, dest_file)
    with codecs.open(source_html, "r", 'utf-8') as file_in:
        soup = BeautifulSoup(file_in.read(), 'lxml')
    marker = "http://gretil.sub.uni-goettingen.de/gretil.htm"

    def is_not_marker(line):
        return line.strip() != marker

    all_lines = soup.text.split("\n")
    preamble = itertools.takewhile(is_not_marker, all_lines)
    intro = "\n\n## Intro\n%s" % ("  \n".join(preamble))
    # Drop the marker line itself, keep everything after it.
    remainder = itertools.dropwhile(is_not_marker, all_lines)
    body = "  \n".join(list(remainder)[1:])
    body = regex.sub("(  \n){3,}", "\n\n", body)
    body = sanscript.transliterate(data=body,
                                   _from=sanscript.IAST,
                                   _to=sanscript.DEVANAGARI)
    full_text = "%s\n\n## पाठः\n%s" % (intro, body)
    metadata = {"title": soup.title.text.strip()}
    out_file = MdFile(file_path=dest_file, frontmatter_type="toml")
    out_file.dump_to_file(metadata=metadata, content=full_text, dry_run=False)
示例#5
0
def dump_text(base_dir):
  """Scrape shaunaka-saMhitA sUkta-s from vedicheritage.gov.in into md files.

  Drives its own Chrome instance; skips sUkta-s already dumped under base_dir.
  """
  opts = options.Options()
  opts.headless = False
  browser = webdriver.Chrome(options=opts)
  browser.implicitly_wait(6)
  unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "vedaH/shaunaka/samhitA.json")

  for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
    subunit_list = text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[kaanda_index])
    for subunit_index in subunit_list:
      logging.info("kaanDa %d adhyaaya %d", kaanda_index, subunit_index)

      outfile_path = os.path.join(base_dir, "%02d" % (kaanda_index), "%03d.md" % subunit_index)
      if os.path.exists(outfile_path):
        # Lazy %-style logging args (was eager string concatenation).
        logging.info("Skipping %s", outfile_path)
        continue

      url = "http://vedicheritage.gov.in/samhitas/atharvaveda-samhitas/shaunaka-samhita/kanda-%02d-sukta-%03d/" % (
      kaanda_index, subunit_index)
      logging.info("url %s to %s", url, outfile_path)
      browser.get(url=url)
      text = browser.find_element_by_id("videotext").text
      # Two trailing spaces force markdown line breaks.
      text = text.replace("\n", "  \n")
      title_tags = browser.find_elements_by_css_selector("#videotext  strong")
      title = "%03d" % subunit_index
      if len(title_tags) > 0:
        title = "%03d %s" % (subunit_index, title_tags[0].text)
      title = sanscript.transliterate(title, sanscript.HK, sanscript.DEVANAGARI)
      md_file = MdFile(file_path=outfile_path)
      md_file.dump_to_file(metadata={"title": title}, content=text, dry_run=False)

  browser.close()
def dump_text(browser, outdir):
    """Dump the text of the currently browsed page into a md file under outdir."""
    text_name = deduce_text_name(browser)
    out_file_path = get_output_path(text_name=text_name, outdir=outdir)
    spans = browser.find_elements_by_css_selector("#gvResults tr[valign=\"top\"] td span")
    # Two trailing spaces force markdown line breaks within a segment.
    content = "\n\n".join(
        span.text.strip().replace("\n", "  \n") for span in spans)
    out_md = MdFile(file_path=out_file_path)
    out_md.dump_to_file(metadata={"title": text_name}, content=content, dry_run=False)
def dump_markdown(src_file, dest_file):
    """Convert src_file into a TOML-frontmatter md file at dest_file."""
    logging.info("Processing %s to %s", src_file, dest_file)
    metadata = get_metadata(src_file=src_file)
    body = get_text(src_file=src_file)
    # The itx title is OPTITRANS-encoded; store a devanAgarI title alongside.
    devanaagarii_title = sanscript.transliterate(data=metadata["itxtitle"],
                                                 _from=sanscript.OPTITRANS,
                                                 _to=sanscript.DEVANAGARI)
    metadata["title"] = devanaagarii_title
    out_file = MdFile(file_path=dest_file, frontmatter_type=MdFile.TOML)
    out_file.dump_to_file(metadata=metadata, content=body, dry_run=False)
示例#8
0
def dump_all_texts(dest_dir, overwrite=False):
    """Dump every text linked from the adishila unicode listing into dest_dir.

    Existing files are kept unless overwrite is True.
    """
    soup = scraping.get_soup(url="https://adishila.com/unicodetxt-htm/")
    for anchor in soup.select("div.wp-block-group a"):
        href = anchor["href"]
        (title, text) = get_text(href)
        dest_path = os.path.join(dest_dir,
                                 file_helper.clean_file_path("%s.md" % title))
        if not overwrite and os.path.exists(dest_path):
            logging.warning("Skipping %s since it exists", dest_path)
            continue
        logging.info("Getting %s", href)
        out_md = MdFile(file_path=dest_path, frontmatter_type=MdFile.TOML)
        out_md.dump_to_file(metadata={"title": title},
                            content=text,
                            dry_run=False)
示例#9
0
def apply_function(fn,
                   dir_path,
                   file_pattern="**/*.md",
                   file_name_filter=None,
                   frontmatter_type="yaml",
                   start_file=None,
                   *args,
                   **kwargs):
    """Apply fn to each md file under dir_path (or to dir_path itself if a file).

    If start_file is given, files are skipped until one whose path equals
    start_file is seen; processing starts from that file. Extra positional and
    keyword arguments are forwarded to fn.
    """
    if os.path.isfile(dir_path):
        logging.warning("Got a file actually. processing it!")
        md_files = [MdFile(file_path=dir_path)]
    else:
        md_files = get_md_files_from_path(dir_path=dir_path,
                                          file_pattern=file_pattern,
                                          file_name_filter=file_name_filter,
                                          frontmatter_type=frontmatter_type)
    logging.info("Processing %d files.", len(md_files))
    from tqdm import tqdm
    reached_start = start_file is None
    for md_file in tqdm(md_files):
        if not reached_start:
            if str(md_file.file_path) == start_file:
                reached_start = True
            else:
                continue
        logging.info("Processing %s", md_file)
        fn(md_file, *args, **kwargs)
示例#10
0
def migrate_and_include(files,
                        location_computer,
                        new_url_computer,
                        dry_run=False):
    """Move each md file to its computed location, leaving a js_include stub.

    location_computer maps an old path to the new file location;
    new_url_computer maps an old path to the url the stub should include.
    """
    logging.info("Processing %d files", len(files))
    for old_file in files:
        old_path = str(old_file)
        new_path = location_computer(old_path)
        logging.info("Moving %s to %s", old_path, new_path)
        md_file = MdFile(file_path=old_file)
        (metadata, _) = md_file.read_md_file()
        if not dry_run:
            os.makedirs(os.path.dirname(new_path), exist_ok=True)
            os.rename(src=old_file, dst=new_path)
        include_md = """<div class="js_include" url="%s"  newLevelForH1="1" includeTitle="true"> </div>""" % new_url_computer(old_path)
        logging.info("Inclusion in old file : %s", include_md)
        # Rewrite the old location as a stub pointing at the new url.
        md_file.dump_to_file(metadata=metadata, content=include_md, dry_run=dry_run)
示例#11
0
def get_titles():
    """Log adhyaaya titles with any leading numeric prefix stripped."""
    titles = MdFile.get_metadata_field_values(
        md_files=raamaayana.get_adhyaaya_md_files(md_file_path),
        field_name="title")
    stripped = list(
        map(lambda title: regex.sub("^[०-९0-9]+ ", "", title), titles))
    logging.info("\n".join(stripped))
示例#12
0
def dump_ics_md_pair(panchaanga, period_str):
    """Dump a panchaanga as an .ics + .md pair, plus per-month split files.

    :param panchaanga: the panchaanga to dump.
    :param period_str: of the form "<year_type>/<year>", e.g. "CE/2020".
    Relies on module-level `output_dir` for the final index fix-up.
    """
    ics_calendar = ics.compute_calendar(panchaanga)
    (year_type, year) = period_str.split("/")
    year = int(year)
    out_path = get_canonical_path(city=panchaanga.city.name,
                                  computation_system_str=str(
                                      panchaanga.computation_system),
                                  year=year,
                                  year_type=year_type)
    # FIX: was a pointless single-argument os.path.join(out_path + ".ics").
    output_file_ics = out_path + ".ics"
    ics.write_to_file(ics_calendar, output_file_ics)

    md_file = MdFile(file_path=output_file_ics.replace(".ics", ".md"),
                     frontmatter_type=MdFile.YAML)
    intro = "## 00 Intro\n### Related files\n- [ics](../%s)\n" % str(
        os.path.basename(output_file_ics))
    md_content = "%s\n%s" % (intro, md.make_md(panchaanga=panchaanga))
    md_file.dump_to_file(metadata={"title": year},
                         content=md_content,
                         dry_run=False)

    # Build a parallel "_monthly" copy, split into one file per month.
    monthly_file_path = md_file.file_path.replace(".md", "_monthly.md")
    monthly_dir = monthly_file_path.replace(".md", "/")
    shutil.rmtree(path=monthly_dir, ignore_errors=True)
    logging.info("%s exists? %s", monthly_dir, os.path.exists(monthly_dir))
    logging.info("Copying to %s", monthly_file_path)
    shutil.copy(md_file.file_path, monthly_file_path)
    monthly_md_file = MdFile(file_path=monthly_file_path)
    monthly_md_file.set_title_from_filename(dry_run=False,
                                            transliteration_target=None)
    monthly_md_file.split_to_bits(source_script=None,
                                  dry_run=False,
                                  indexed_title_pattern=None)
    library.apply_function(fn=MdFile.split_to_bits,
                           dir_path=monthly_dir,
                           frontmatter_type=MdFile.TOML,
                           source_script=None,
                           dry_run=False,
                           indexed_title_pattern=None)
    logging.info("%s exists? %s", monthly_dir, os.path.exists(monthly_dir))

    library.fix_index_files(dir_path=output_dir,
                            transliteration_target=None,
                            dry_run=False)
示例#13
0
def import_md_recursive(source_dir,
                        file_extension,
                        source_format=None,
                        dry_run=False):
    """Convert every *.<file_extension> file under source_dir to md via pandoc.

    source_format defaults to file_extension; existing md files are skipped.
    """
    from pathlib import Path
    source_paths = sorted(Path(source_dir).glob("**/*." + file_extension))
    if source_format is None:
        source_format = file_extension
    for source_path in source_paths:
        # BUG FIX: replace only the trailing extension. str.replace() also
        # rewrote a matching ".ext" occurring earlier in the path (e.g. a
        # directory named "x.html.d"). The glob guarantees the suffix exists.
        md_path = str(source_path)[:-(len(file_extension) + 1)] + ".md"
        md_path = file_helper.clean_file_path(md_path)
        if os.path.exists(md_path):
            logging.info("Skipping %s", md_path)
            continue
        logging.info("Processing %s to %s", source_path, md_path)
        md_file = MdFile(file_path=md_path, frontmatter_type=MdFile.TOML)
        md_file.import_with_pandoc(source_file=source_path,
                                   source_format=source_format,
                                   dry_run=dry_run)
示例#14
0
def dump_item(title, item_url, outfile_path, get_collapsible_content):
    """Dump one wiki item's text to outfile_path and set its title.

    Uses the module-level selenium `browser`. Skips if the file exists.
    When get_collapsible_content is False, prefers the div.poem text, falling
    back to the page's paragraph text.
    """
    if os.path.exists(outfile_path):
        logging.info("skipping: %s - it exists already", outfile_path)
        return
    logging.info(item_url)
    browser.get(item_url)
    text = ""
    if not get_collapsible_content:
        try:
            text = browser.find_element_by_css_selector("div.poem").text
        except NoSuchElementException:
            content_element = browser.find_element_by_css_selector(".mw-parser-output")
            para_elements = content_element.find_elements_by_tag_name("p")
            text = "\n\n".join(map(lambda x: x.text, para_elements))
    else:
        text = browser.find_element_by_css_selector(".mw-collapsible-content").text
    os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
    # FIX: explicit utf-8 - the text is devanAgarI and the platform default
    # encoding may not handle it. write() replaces writelines() on a str,
    # which iterated character by character.
    with open(outfile_path, "w", encoding="utf-8") as outfile:
        outfile.write(text.replace("\n", "  \n"))
    md_file = MdFile(file_path=outfile_path)
    md_file.set_title(title=title, dry_run=False)
示例#15
0
def scrape_post_markdown(url, dir_path, dry_run):
    """Scrape a wordpress post at url and store it as a md file under dir_path."""
    # Construct the file name from the post's url path, e.g.
    # https://host/2020/06/08/some-slug/ -> 2020/06/2020-06-08_some-slug.md
    path_part = urlsplit(url=url).path.strip()
    file_name = regex.sub("/(....)/(..)/(..)/(.+)/", r"\1/\2/\1-\2-\3_\4.md",
                          path_part)
    file_path = file_helper.clean_file_path(
        file_path=os.path.join(dir_path, file_name))

    if os.path.exists(file_path):
        logging.warning("Skipping %s : exists", file_name)
        return
    (title, post_html) = get_post_html(url=url)
    logging.info("Dumping %s to %s with title %s.", url, file_path, title)

    out_md = MdFile(file_path=file_path, frontmatter_type=MdFile.TOML)
    out_md.import_content_with_pandoc(metadata={"title": title},
                                      content=post_html,
                                      source_format="html",
                                      dry_run=dry_run)
示例#16
0
def dump_content(soup, out_file_path, metadata, dry_run):
    """Extract the main content element of soup and import it as md via pandoc.

    Prefers the 60%/80%-width table cells, falling back to the whole body.
    """
    candidates = soup.select("td[width=\"60%\"]") + soup.select("td[width=\"80%\"]") + soup.select("body")
    html_content = fix_text(candidates[0].decode_contents(formatter="html"))
    out_md = MdFile(file_path=out_file_path)
    out_md.import_content_with_pandoc(content=html_content, source_format="html", dry_run=dry_run, metadata=metadata)
    if metadata == {}:
        # No title supplied - derive one from the file name.
        out_md.set_title_from_filename(transliteration_target=None, dry_run=dry_run)
示例#17
0
def get_structured_text(browser, start_nodes, base_dir, unit_info_file):
    """Walk the site's unit hierarchy and dump each subunit's text to a md file."""
    def open_path(subunit_path, unit_data):
        # Drill down by clicking one link per hierarchy level.
        logging.debug(list(zip(subunit_path, unit_data["unitNameListInSite"])))
        for (subunit, unitNameInSite) in zip(subunit_path, unit_data["unitNameListInSite"]):
            element_text = "%s%d" % (unitNameInSite, subunit)
            click_link_by_text(browser=browser, element_text=element_text)

    def close_path(subunit_path, unit_data):
        # Collapse the opened levels in reverse order.
        logging.info(list(zip(reversed(subunit_path), reversed(unit_data["unitNameListInSite"]))))
        for (subunit, unitNameInSite) in list(zip(reversed(subunit_path), reversed(unit_data["unitNameListInSite"]))):
            element_text = "%s%d" % (unitNameInSite, subunit)
            logging.info(element_text)
            click_link_by_text(browser=browser, element_text=element_text)

    browse_nodes(browser=browser, start_nodes=start_nodes)
    os.makedirs(name=base_dir, exist_ok=True)
    unit_data = text_data.get_subunit_data(unit_info_file, [])

    for subunit_path in text_data.get_subunit_path_list(json_file=unit_info_file, unit_path_list=[]):
        try:
            open_path(subunit_path=subunit_path, unit_data=unit_data)
        except NoSuchElementException:
            close_path(subunit_path=subunit_path, unit_data=unit_data)
            # BUG FIX: a stray exit() here killed the whole process and made
            # the warning and continue below unreachable; now we skip and move on.
            logging.warning("Skipping as Could not find element " + str(traceback.format_exc()))
            continue
        outfile_path = os.path.join(base_dir, "/".join(map(str, subunit_path)) + ".md")
        if os.path.exists(outfile_path):
            logging.info("Skipping " + outfile_path)
        else:
            text_spans = browser.find_element_by_id("divResults").find_elements_by_tag_name("span")
            lines = ["\n", "\n"]
            for span in text_spans:
                lines.append(span.text + "  \n")
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            with open(outfile_path, "w") as outfile:
                outfile.writelines(lines)
        # Close the kANDa - else the driver may pick sarga from this kANDa when it is to pick the sarga from the next kANDa?!
        close_path(subunit_path=subunit_path, unit_data=unit_data)
    MdFile.fix_index_files(dir_path=base_dir)
示例#18
0
def dump_summary(year,
                 city,
                 script=sanscript.DEVANAGARI,
                 computation_system=ComputationSystem.
                 MULTI_NEW_MOON_SIDEREAL_MONTH_ADHIKA__CHITRA_180,
                 allow_precomputed=False):
    """Dump a year's panchaanga summary as a toml table plus a _summary.md.

    Relies on module-level `output_dir` for the index fix-up.
    """
    year_type = era.ERA_GREGORIAN
    logging.info(
        "Generating summary panchaanga for %s year %d (%s), with computation system %s ",
        city.name, year, year_type, str(computation_system))
    panchaanga = annual.get_panchaanga_for_year(
        city=city,
        year=year,
        computation_system=computation_system,
        year_type=year_type,
        allow_precomputed=allow_precomputed)
    year_table = to_table_dict(panchaanga=panchaanga)
    out_path = get_canonical_path(city=panchaanga.city.name,
                                  computation_system_str=str(
                                      panchaanga.computation_system),
                                  year=year,
                                  year_type=year_type)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # FIX: explicit utf-8 - the table contains devanAgarI text, and
    # codecs.open without an encoding falls back to the platform default.
    with codecs.open(out_path + ".toml", "w", encoding="utf-8") as fp:
        toml.dump(year_table, fp)
    library.fix_index_files(dir_path=output_dir,
                            transliteration_target=None,
                            dry_run=False)

    computation_params = get_computation_parameters_md(panchaanga=panchaanga,
                                                       scripts=[script])
    out_path_md = out_path + "_summary.md"
    md = """## Intro\n%s\n\n## Table
  <div class="spreadsheet" src="../%s.toml" fullHeightWithRowsPerScreen=4> </div>""" % (
        computation_params, str(year))
    md_file = MdFile(file_path=out_path_md)
    md_file.dump_to_file(metadata={"title": "%d Summary" % (year)},
                         content=md,
                         dry_run=False)
示例#19
0
def get_md_files_from_path(dir_path,
                           file_pattern,
                           file_name_filter=None,
                           frontmatter_type="yaml"):
    """Return MdFile objects (sorted by path) for files matching file_pattern.

    file_name_filter, if given, is a predicate applied to each candidate Path.
    """
    from pathlib import Path
    matches = Path(dir_path).glob(file_pattern)
    selected_paths = sorted(filter(file_name_filter, matches))
    return [MdFile(p, frontmatter_type=frontmatter_type) for p in selected_paths]
示例#20
0
def make_full_text_md(source_dir, dry_run=False):
    """Recursively build a full.md in source_dir which js_include-s every md
    file (and each subdirectory's own full.md) under it.

    The title is taken from _index.md when present. Directories named "images"
    and the full.md/_index.md files themselves are excluded.
    """
    md = ""
    title = "पूर्णपाठः"
    rel_url = "../"

    num_md_files = 0

    index_md_path = os.path.join(source_dir, "_index.md")
    if os.path.exists(index_md_path):
        index_md = MdFile(file_path=index_md_path)
        (index_yml, _) = index_md.read_md_file()
        title = "%s (%s)" % (index_yml["title"], title)
        md = "%s\n%s" % (
            md,
            """<div class="js_include" url="%s"  newLevelForH1="1" includeTitle="false"> </div>"""
            % (rel_url).strip())
        num_md_files = num_md_files + 1

    for subfile in sorted(os.listdir(source_dir)):
        subfile_path = os.path.join(source_dir, subfile)
        if os.path.isdir(subfile_path):
            if subfile not in ["images"]:
                make_full_text_md(source_dir=subfile_path)
                sub_md_file_path = os.path.join(subfile, "full.md")
            else:
                continue
        else:
            if subfile in ("full.md",
                           "_index.md") or not str(subfile).endswith(".md"):
                continue
            sub_md_file_path = subfile

        num_md_files = num_md_files + 1
        # BUG FIX: anchor the pattern to the end of the name (raw string, $).
        # The unanchored "\.md" also rewrote ".md" occurring mid-path, e.g.
        # for a directory named "x.md" containing full.md.
        rel_url = os.path.join("..", regex.sub(r"\.md$", "/", sub_md_file_path))
        md = "%s\n%s" % (
            md,
            """<div class="js_include" url="%s"  newLevelForH1="1" includeTitle="true"> </div>"""
            % (rel_url).strip())

    if num_md_files > 0:
        full_md_path = os.path.join(source_dir, "full.md")
        full_md = MdFile(file_path=full_md_path)
        full_md.dump_to_file(content=md,
                             metadata={"title": title},
                             dry_run=dry_run)
    else:
        logging.info("No md files found in %s. Skipping.", source_dir)
示例#21
0
def fix_index_files(dir_path,
                    frontmatter_type=MdFile.TOML,
                    transliteration_target=sanscript.DEVANAGARI,
                    overwrite=False,
                    dry_run=False):
    """Ensure every non-hidden directory under dir_path has a titled _index.md.

    Missing index files are created; existing ones get their title refreshed
    only when overwrite is True.
    """
    logging.info("Fixing index files")
    # Every directory in the tree, excluding hidden ones (path contains "/.").
    directories = [walk_triple[0] for walk_triple in os.walk(dir_path)
                   if "/." not in walk_triple[0]]
    for directory in directories:
        index_file = MdFile(file_path=os.path.join(directory, "_index.md"),
                            frontmatter_type=frontmatter_type)
        index_exists = os.path.exists(index_file.file_path)
        if not index_exists:
            index_file.dump_to_file(metadata={}, content="", dry_run=dry_run)
        if not index_exists or overwrite:
            index_file.set_title_from_filename(
                transliteration_target=transliteration_target, dry_run=dry_run)
示例#22
0
def dump_text_from_element(url,
                           outfile_path,
                           text_css_selector,
                           title_maker,
                           title_prefix="",
                           html_fixer=None,
                           md_fixer=None,
                           dry_run=False):
    """Fetch url, extract the element matching text_css_selector, dump to md.

    Always returns the unaltered soup of the fetched page - even when the
    output file already exists - so that callers can navigate onward from it.
    """
    logging.info("Dumping: %s to %s", url, outfile_path)
    html = get_html(url=url)
    unaltered_soup = BeautifulSoup(html, 'html.parser')
    soup = BeautifulSoup(html, 'html.parser')

    if html_fixer is not None:
        html_fixer(soup)

    metadata = {"title": title_maker(soup, title_prefix)}

    # We definitely want to return the original html even if the file exists - we may need to navigate to the next element.
    # (BUG FIX: a duplicate exists-check at the top of the function returned
    # None before the page was fetched, making this branch unreachable.)
    if os.path.exists(outfile_path):
        logging.info("Skipping dumping: %s to %s", url, outfile_path)
        return unaltered_soup

    content = content_from_element(soup=soup,
                                   text_css_selector=text_css_selector,
                                   url=url)

    md_file = MdFile(file_path=outfile_path)
    md_file.import_content_with_pandoc(content=content,
                                       source_format="html",
                                       dry_run=dry_run,
                                       metadata=metadata)
    if md_fixer is not None:
        [_, md] = md_file.read_md_file()
        md = md_fixer(md)
        md_file.replace_content(new_content=md, dry_run=dry_run)

    logging.info("Done: %s to %s", url, outfile_path)
    return unaltered_soup
示例#23
0
def test_panchanga_chennai_2019():
    """Regression-test Chennai-2019 md output against a checked-in reference."""
    panchaanga_2019 = Panchaanga.read_from_file(
        filename=os.path.join(TEST_DATA_PATH, 'Chennai-2019.json'))
    # We dump to md.txt rather than md to avoid slow checks on intellij ide.
    orig_md_file = os.path.join(TEST_DATA_PATH,
                                'Chennai-2019-devanagari.md.txt')
    current_md_output = os.path.join(TEST_DATA_PATH,
                                     'Chennai-2019-devanagari.md.txt.local')
    MdFile(file_path=current_md_output).dump_to_file(
        metadata={"title": str(2019)},
        content=md.make_md(panchaanga=panchaanga_2019),
        dry_run=False)
    if not os.path.exists(orig_md_file):
        # Regenerate the reference when it has been deleted on purpose.
        logging.warning(
            "%s not present. Assuming that it was deliberately deleted to update test files.",
            orig_md_file)
        MdFile(file_path=orig_md_file).dump_to_file(
            metadata={"title": str(2019)},
            content=md.make_md(panchaanga=panchaanga_2019),
            dry_run=False)

    with open(orig_md_file) as orig_tex, open(current_md_output) as current_tex:
        assert current_tex.read() == orig_tex.read()
示例#24
0
def transcribe(audio_path, output_path, model_id="vosk-model-en-in-0.4"):
    """Transcribe an audio file with vosk and dump the text to output_path.

    :param audio_path: path of the audio file (any format ffmpeg can decode).
    :param output_path: md file to dump the transcription into.
    :param model_id: vosk model directory name.
    (Get models from https://alphacephei.com/vosk/models and extract in vosk_models folder.)
    :return:
    """
    model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              "vosk_models", model_id)
    sample_rate = 16000
    model = Model(model_path)
    rec = KaldiRecognizer(model, sample_rate)

    # ffmpeg re-encodes to 16kHz mono raw pcm streamed to stdout.
    process = subprocess.Popen([
        'ffmpeg', '-loglevel', 'quiet', '-i', audio_path, '-ar',
        str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ],
                               stdout=subprocess.PIPE)

    text = ""
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            result = json.loads(rec.Result())
            text = "%s %s" % (text, result["text"])

    final_json = rec.FinalResult()
    print(final_json)
    # BUG FIX: the final buffered segment was printed but dropped, so the tail
    # of the audio was missing from the dumped transcription.
    text = "%s %s" % (text, json.loads(final_json)["text"])
    # Reap the ffmpeg child process.
    process.wait()
    MdFile(file_path=output_path).dump_to_file(metadata={},
                                               content=text,
                                               dry_run=False)
示例#25
0
def get_numbers():
    """Log the leading number of each adhyaaya's English title."""
    titles = MdFile.get_metadata_field_values(
        md_files=raamaayana.get_adhyaaya_md_files(md_file_path),
        field_name="title_english")
    leading_numbers = list(
        map(lambda title: regex.sub("^([०-९0-9]+) .+", "\\1", title), titles))
    logging.info("\n".join(leading_numbers))
示例#26
0
def transform(dry_run=False):
    """Build per-Rk and per-sUkta md files from per-Rk sAyaNa-bhAShya json files.

    Pass 1 reads every json file into suukta_id_to_rk_map, keyed by
    "mandala/sukta" -> {devanAgarI rk number -> rk markdown}.
    Pass 2 dumps one md file per Rk (under dest_dir_Rks) and one per sUkta
    (under dest_dir_suuktas) which js_include-s its Rk files.
    Relies on module-level dest_dir_suuktas / dest_dir_Rks.
    """
    json_paths = glob.glob(
        "/home/vvasuki/sanskrit/raw_etexts/vedaH/Rg/shakala/saMhitA/sAyaNabhAShyam/*/*/*.json",
        recursive=True)
    suukta_id_to_rk_map = {}
    for json_path in sorted(json_paths):
        with codecs.open(json_path, "r") as fp:
            rk = json.load(fp)
            # sUkta id like "01/001" (mandala/sukta).
            suukta_id = "%02d/%03d" % (int(rk["classification"]["mandala"]),
                                       int(rk["classification"]["sukta"]))
            suukta_rk_map = suukta_id_to_rk_map.get(suukta_id, {})
            # Strip html tags from the bhAShya.
            bhaashya = regex.sub("<.+?>", "", rk["sayanaBhashya"])
            # Rk number in devanAgarI digits.
            rk_number = sanscript.transliterate(
                "%02d" % int(rk["classification"]["rik"]), sanscript.IAST,
                sanscript.DEVANAGARI)
            attribute_str = "%s। %s। %s।" % (rk["attribute"]["devata"],
                                             rk["attribute"]["rishi"],
                                             rk["attribute"]["chandas"])
            # lines fields may be a single string or a list - normalize to list.
            padapaatha_lines = rk["padapaatha"]["lines"]
            if isinstance(padapaatha_lines, str):
                padapaatha_lines = [padapaatha_lines]
            samhita_lines = rk["samhitaAux"]["lines"]
            if isinstance(samhita_lines, str):
                samhita_lines = [samhita_lines]
            rk_md = "## अधिमन्त्रम्\n%s\n\n## मन्त्रः\n%s\n\n## पदपाठः\n%s\n\n## भाष्यम्\n%s" % (
                attribute_str, "  \n".join(samhita_lines),
                "  \n".join(padapaatha_lines), bhaashya)
            suukta_rk_map[rk_number] = rk_md
            if bhaashya == "":
                logging.warning("No bhAShya for %s", rk["id"])
            suukta_id_to_rk_map[suukta_id] = suukta_rk_map

    for suukta_id in suukta_id_to_rk_map.keys():
        dest_path_suukta = os.path.join(dest_dir_suuktas, suukta_id + ".md")
        md_file_suukta = MdFile(file_path=dest_path_suukta)
        title = sanscript.transliterate(
            suukta_id.split("/")[-1], sanscript.IAST, sanscript.DEVANAGARI)
        rk_map = suukta_id_to_rk_map[suukta_id]
        suukta_md = ""
        for rk_id in sorted(rk_map.keys()):
            rk_md = rk_map[rk_id]
            # Rk file name is the IAST form of the devanAgarI rk id.
            dest_path_Rk = os.path.join(
                dest_dir_Rks, suukta_id,
                sanscript.transliterate(rk_id, sanscript.DEVANAGARI,
                                        sanscript.IAST) + ".md")
            md_file_Rk = MdFile(file_path=dest_path_Rk)
            # Title is derived from the mantra section's text.
            rk_text = " ".join(
                doc_curation.md.section.get_section_lines(
                    lines_in=rk_md.split("\n"), section_title="मन्त्रः"))
            from doc_curation import text_data
            title_Rk = text_data.get_rk_title(rk_id=rk_id, rk_text=rk_text)
            md_file_Rk.dump_to_file(metadata={"title": title_Rk},
                                    content=rk_md,
                                    dry_run=dry_run)
            # The rename below may change file_path; re-read it afterwards.
            md_file_Rk.set_filename_from_title(
                transliteration_source=sanscript.DEVANAGARI, dry_run=dry_run)
            dest_path_Rk = md_file_Rk.file_path

            # Include path is relative to the site root (strip repo prefix and
            # the "static/" segment).
            suukta_md = suukta_md + """
      
      <div class="js_include" url="%s"  newLevelForH1="2" includeTitle="false"> </div> 
      """ % dest_path_Rk.replace("/home/vvasuki/vvasuki-git", "").replace(
                "static/", "")

        # cleandoc strips the common leading indentation of the include divs.
        import inspect
        md_file_suukta.dump_to_file(metadata={"title": title},
                                    content=inspect.cleandoc(suukta_md),
                                    dry_run=dry_run)
示例#27
0
def separate_rks(dry_run: bool = False):
  """Split each sUkta (hymn) markdown file into one file per Rk (verse).

  For every sUkta file matched by the hard-coded source glob:
  separate the un-accented introductory lines from the accented mantra
  lines, slice the mantra text into Rks (each Rk ends at the line
  containing the "॥" terminator), write one md file per Rk under
  dest_dir_Rks, and finally rewrite the sUkta file as an intro section
  plus a sequence of js_include divs referencing the per-Rk files.

  :param dry_run: passed through to MdFile write operations; when True
      they presumably only log instead of writing — TODO confirm.

  NOTE(review): dest_dir_suuktas (used below) is not defined in this
  function — presumably a module-level constant; verify before running.
  """
  dest_dir_Rks = "/home/vvasuki/vvasuki-git/vedAH/static/atharva/shaunakam/rUDha-saMhitA/mUlam/"
  suukta_paths = glob.glob("/home/vvasuki/vvasuki-git/vedAH/content/atharva/shaunakam/rUDha-saMhitA_alt/*/*.md",
                           recursive=True)

  for suukta_path in suukta_paths:
    md_file = MdFile(file_path=suukta_path)
    [metadata, md] = md_file.read_md_file()
    lines = md.split("\n")
    # Leading lines that carry no vedic accent mark (॒ anudaatta / ॑ svarita)
    # are treated as introductory material; the mantra text proper starts at
    # the first accented line.
    meta_lines = list(itertools.takewhile(lambda line: "॒" not in line and "॑" not in line, lines))
    lines = list(itertools.dropwhile(lambda line: "॒" not in line and "॑" not in line, lines))
    lines = [line for line in lines if line != ""]
    rk_id = 0
    chapter_id = suukta_path.split("/")[-2]
    # Title format assumed to be "<devanagari-number> <title-words> ।" — TODO confirm.
    suukta_id = metadata["title"].split()[0]
    suukta_id_roman = sanscript.transliterate(suukta_id, sanscript.DEVANAGARI, sanscript.IAST)
    suukta_title = " ".join(metadata["title"].split()[1:]).replace("।", "").strip()
    dest_path_suukta = os.path.join(dest_dir_suuktas, chapter_id, suukta_id_roman + ".md")
    rk_map = {}
    while(len(lines) > 0):
      # A Rk runs up to and including the line containing the "॥" terminator:
      # takewhile stops *before* that line, so append it explicitly.
      lines_rk = list(itertools.takewhile(lambda line: "॥" not in line, lines))
      lines_rk.append(lines[len(lines_rk)])
      if len(lines) == len(lines_rk):
        lines = []
      else:
        lines = lines[len(lines_rk):]
      rk_id = rk_id + 1
      rk_md = "\n".join(lines_rk)

      # Zero-padded Rk number, transliterated to devanagari digits.
      rk_id_str = sanscript.transliterate("%02d" % rk_id, sanscript.IAST, sanscript.DEVANAGARI) 
      from doc_curation import text_data
      title_Rk = text_data.get_rk_title(rk_id=rk_id_str, rk_text=rk_md)
      dest_path_Rk = os.path.join(dest_dir_Rks, chapter_id, suukta_id_roman, sanscript.transliterate(rk_id_str, sanscript.DEVANAGARI, sanscript.IAST) + ".md")
      md_file_Rk = MdFile(file_path=dest_path_Rk)
      md_file_Rk.dump_to_file(metadata={"title": title_Rk}, content=rk_md, dry_run=dry_run)
      md_file_Rk.set_filename_from_title(transliteration_source=sanscript.DEVANAGARI, dry_run=dry_run)
      # Record the final path — set_filename_from_title may have renamed the file.
      rk_map[rk_id_str] = md_file_Rk.file_path

    suukta_md = ""
    for rk_id in sorted(rk_map.keys()):
      dest_path_Rk = rk_map[rk_id]
      # Strip the local repo prefix and "static/" to get a site-relative URL.
      suukta_md = suukta_md + """
      <div class="js_include" url="%s"  newLevelForH1="2" includeTitle="false"> </div> 
      """ % dest_path_Rk.replace("/home/vvasuki/vvasuki-git", "").replace("static/", "")

    import textwrap
    # Rebuild the sUkta file: intro section followed by the included Rks.
    suukta_md = """
    ## परिचयः
    %s
    
    ## पाठः
    %s
    """ % ("\n    ".join(meta_lines), suukta_md)
    md_file_suukta = MdFile(file_path=dest_path_suukta)
    md_file_suukta.dump_to_file(metadata={"title": "%s %s" % (suukta_id, suukta_title)}, content=textwrap.dedent(suukta_md), dry_run=dry_run)
    md_file_suukta.set_filename_from_title(transliteration_source=sanscript.DEVANAGARI, dry_run=dry_run)
# ----- Example #28 (示例#28, score: 0) -----
from doc_curation.md.file import MdFile

# Remove all handlers associated with the root logger object.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s:%(asctime)s:%(module)s:%(lineno)d %(message)s")


def set_titles_from_spreadsheet(dir_path, dry_run=False):
    """Overwrite adhyAya titles under dir_path from a Google spreadsheet,
    then convert the resulting titles to devanAgarI.

    :param dir_path: directory whose adhyAya md files are updated.
    :param dry_run: passed through to the MdFile batch operations.
    """
    # Pull the final titles from the spreadsheet, keyed by adhyAya id.
    # (Note: "spreadhsheet_id" is the upstream API's keyword spelling.)
    MdFile.fix_field_values(
        md_files=mahaabhaarata.get_adhyaaya_md_files(dir_path),
        spreadhsheet_id="1sNH1AWhhoa5VATqMdLbF652s7srTG0Raa6K-sCwDR-8",
        worksheet_name="कुम्भकोणाध्यायाः",
        id_column="क्रमाङ्कम्",
        value_column="अन्तिमशीर्षिका",
        md_file_to_id=mahaabhaarata.get_adhyaaya_id,
        dry_run=dry_run)
    MdFile.devanaagarify_titles(
        md_files=mahaabhaarata.get_adhyaaya_md_files(dir_path),
        dry_run=dry_run)


def get_upaakhyaana_and_titles_from_path(dir_path, file_pattern="**/*.md"):
    """Print a tab-separated (upaakhyaana, title) line for every md file
    under dir_path matching file_pattern."""
    files = MdFile.get_md_files_from_path(dir_path=dir_path, file_pattern=file_pattern)
    title_list = [f.get_title() for f in files]
    upaakhyaana_list = [f.get_upaakhyaana() for f in files]
    for upaakhyaana, title in zip(upaakhyaana_list, title_list):
        print(str(upaakhyaana) + "\t" + str(title))

# Script entry: operate on the vana-parva commentary directory.
dir_path = "/home/vvasuki/vvasuki-git/kAvya/content/TIkA/padyam/purANam/mahAbhAratam/03-vana-parva/"
# set_titles_from_filenames(dir_path=dir_path, dry_run=True)
# get_upaakhyaana_and_titles_from_path(dir_path=dir_path)
# Rebuild index files; dry_run=False means real writes happen on import/run.
MdFile.fix_index_files(dir_path=dir_path, dry_run=False)
# set_titles_from_spreadsheet(dir_path=dir_path, dry_run=False)
# ----- Example #29 (示例#29, score: 0) -----
def get_upaakhyaana_and_titles_from_path(dir_path, file_pattern="**/*.md"):
    """For each md file under dir_path matching file_pattern, print its
    upaakhyaana and title, tab-separated, one file per line."""
    md_files = MdFile.get_md_files_from_path(dir_path=dir_path, file_pattern=file_pattern)
    titles = [md_file.get_title() for md_file in md_files]
    upaakhyaanas = [md_file.get_upaakhyaana() for md_file in md_files]
    for pair in zip(upaakhyaanas, titles):
        print("\t".join(str(field) for field in pair))