Пример #1
0
def dump_deep_text(url_text_id,
                   url_leaf_id_padding,
                   dir_path,
                   unit_info_file,
                   get_collapsible_content=False,
                   dry_run=False):
    """Dump every subunit of a sa.wikisource.org text into markdown files.

    For each subunit path listed for ``unit_info_file``, the leading path
    components become zero-padded two-digit directories under ``dir_path``
    and the last component becomes a three-digit ``.md`` file name. The
    item title is that three-digit number transliterated from SLP1 to
    Devanagari.

    Args:
        url_text_id: Page-name prefix of the text on sa.wikisource.org; it is
            percent-quoted before being placed in the URL.
        url_leaf_id_padding: Forwarded to ``get_wiki_path`` as
            ``url_id_padding``.
        dir_path: Root directory under which the output files are placed.
        unit_info_file: JSON file describing the unit structure, as consumed
            by ``text_data``.
        get_collapsible_content: Forwarded unchanged to ``dump_item``.
        dry_run: When true, only log what would be fetched; ``dump_item`` is
            not called.
    """
    # ``import urllib`` alone does not guarantee that the ``parse`` submodule
    # is loaded, so import the submodule explicitly (once, not per iteration).
    import urllib.parse

    unit_data = text_data.get_subunit_data(unit_info_file, [])
    # The quoted text id is the same for every subunit, so compute it once.
    quoted_text_id = urllib.parse.quote(url_text_id)
    for subunit_path in text_data.get_subunit_path_list(
            json_file=unit_info_file, unit_path_list=[]):
        relative_dir_path = "/".join(["%02d" % x for x in subunit_path[:-1]])
        outfile_path = os.path.join(dir_path, relative_dir_path,
                                    "%03d.md" % subunit_path[-1])
        item_url = "https://sa.wikisource.org/wiki/%s/%s" % (
            quoted_text_id,
            get_wiki_path(subunit_path=subunit_path,
                          unit_data=unit_data,
                          url_id_padding=url_leaf_id_padding))
        title = sanscript.transliterate("%03d" % subunit_path[-1],
                                        sanscript.SLP1, sanscript.DEVANAGARI)
        logging.info("Getting %s to %s with title %s", item_url, outfile_path,
                     title)
        if not dry_run:
            dump_item(title=title,
                      outfile_path=outfile_path,
                      item_url=item_url,
                      get_collapsible_content=get_collapsible_content)
Пример #2
0
def get_text(browser, text_id, base_dir, unit_info_file):
    """Save the text of every subunit of ``text_id`` as markdown files.

    After clicking the link labelled ``text_id``, each subunit path listed
    for ``unit_info_file`` is opened in the browser, the text of all
    ``span`` elements inside the ``divResults`` element is collected (one
    span per output line), and the result is written to
    ``<base_dir>/<path components joined by "/">.md``. Files that already
    exist are left untouched.

    Args:
        browser: Browser driver used for element lookups.
        text_id: Link text of the text to open.
        base_dir: Root directory for the output files.
        unit_info_file: JSON file describing the unit structure, as consumed
            by ``text_data``.

    Raises:
        SystemExit: If a subunit cannot be opened because an element is
            missing.
    """
    # ``exit`` is a helper injected by the ``site`` module and is not always
    # available; ``sys.exit`` is the programmatic equivalent.
    import sys

    parankusha.click_link_by_text(browser=browser, element_text=text_id)
    unit_data = text_data.get_subunit_data(unit_info_file, [])

    for subunit_path in text_data.get_subunit_path_list(json_file=unit_info_file, unit_path_list=[]):
        try:
            open_path(subunit_path=subunit_path, unit_data=unit_data)
        except NoSuchElementException:
            # Log before any further browser interaction so the traceback is
            # never lost (it used to sit after the exit call and never ran).
            logging.warning("Aborting as Could not find element " + str(traceback.format_exc()))
            close_path(subunit_path=subunit_path, unit_data=unit_data)
            # NOTE(review): the old, unreachable log message and ``continue``
            # suggest skipping may have been intended; the existing
            # terminate-on-failure behavior is kept - confirm with the owner.
            sys.exit()
        outfile_path = os.path.join(base_dir, "/".join(map(str, subunit_path)) + ".md")
        if os.path.exists(outfile_path):
            logging.info("Skipping " + outfile_path)
        else:
            text_spans = browser.find_element_by_id("divResults").find_elements_by_tag_name("span")
            # Two leading blank lines, then one line per span; the two
            # trailing spaces are a markdown hard line break.
            lines = ["\n", "\n"]
            lines.extend(span.text + "  \n" for span in text_spans)
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            # Explicit UTF-8: the scraped text is non-ASCII and the locale
            # default encoding may be unable to represent it.
            with open(outfile_path, "w", encoding="utf-8") as outfile:
                outfile.writelines(lines)
        # Close the kANDa - else the driver may pick sarga from this kANDa when it is to pick the sarga from the next kANDa?!
        close_path(subunit_path=subunit_path, unit_data=unit_data)
Пример #3
0
def get_structured_text(browser, start_nodes, base_dir, unit_info_file):
    """Save the text of every subunit reachable from ``start_nodes``.

    After browsing to ``start_nodes``, each subunit path listed for
    ``unit_info_file`` is opened by clicking one link per path level, the
    text of all ``span`` elements inside the ``divResults`` element is
    collected (one span per output line), and the result is written to
    ``<base_dir>/<path components joined by "/">.md``. Files that already
    exist are left untouched. Finally the index files under ``base_dir`` are
    fixed via ``md_helper.MdFile.fix_index_files``.

    Args:
        browser: Browser driver used for clicks and element lookups.
        start_nodes: Passed to ``browse_nodes`` to reach the text.
        base_dir: Root directory for the output files (created if missing).
        unit_info_file: JSON file describing the unit structure, as consumed
            by ``text_data``.

    Raises:
        SystemExit: If a subunit cannot be opened because an element is
            missing.
    """
    # ``exit`` is a helper injected by the ``site`` module and is not always
    # available; ``sys.exit`` is the programmatic equivalent.
    import sys

    def open_path(subunit_path, unit_data):
        """Click the link of each level of ``subunit_path``, outermost first."""
        logging.debug(list(zip(subunit_path, unit_data["unitNameListInSite"])))
        for (subunit, unitNameInSite) in zip(subunit_path,
                                             unit_data["unitNameListInSite"]):
            element_text = "%s%d" % (unitNameInSite, subunit)
            click_link_by_text(browser=browser, element_text=element_text)

    def close_path(subunit_path, unit_data):
        """Click the link of each level of ``subunit_path``, innermost first."""
        # Build the reversed (subunit, site name) pairs once and reuse them
        # for both the log line and the clicks.
        reversed_levels = list(
            zip(reversed(subunit_path),
                reversed(unit_data["unitNameListInSite"])))
        logging.info(reversed_levels)
        for (subunit, unitNameInSite) in reversed_levels:
            element_text = "%s%d" % (unitNameInSite, subunit)
            logging.info(element_text)
            click_link_by_text(browser=browser, element_text=element_text)

    browse_nodes(browser=browser, start_nodes=start_nodes)
    os.makedirs(name=base_dir, exist_ok=True)
    unit_data = text_data.get_subunit_data(unit_info_file, [])

    for subunit_path in text_data.get_subunit_path_list(
            json_file=unit_info_file, unit_path_list=[]):
        try:
            open_path(subunit_path=subunit_path, unit_data=unit_data)
        except NoSuchElementException:
            # Log before any further browser interaction so the traceback is
            # never lost (it used to sit after the exit call and never ran).
            logging.warning("Aborting as Could not find element " +
                            str(traceback.format_exc()))
            close_path(subunit_path=subunit_path, unit_data=unit_data)
            # NOTE(review): the old, unreachable log message and ``continue``
            # suggest skipping may have been intended; the existing
            # terminate-on-failure behavior is kept - confirm with the owner.
            sys.exit()
        outfile_path = os.path.join(base_dir,
                                    "/".join(map(str, subunit_path)) + ".md")
        if os.path.exists(outfile_path):
            logging.info("Skipping " + outfile_path)
        else:
            text_spans = browser.find_element_by_id(
                "divResults").find_elements_by_tag_name("span")
            # Two leading blank lines, then one line per span; the two
            # trailing spaces are a markdown hard line break.
            lines = ["\n", "\n"]
            lines.extend(span.text + "  \n" for span in text_spans)
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            # Explicit UTF-8: the scraped text is non-ASCII and the locale
            # default encoding may be unable to represent it.
            with open(outfile_path, "w", encoding="utf-8") as outfile:
                outfile.writelines(lines)
        # Close the kANDa - else the driver may pick sarga from this kANDa when it is to pick the sarga from the next kANDa?!
        close_path(subunit_path=subunit_path, unit_data=unit_data)
    md_helper.MdFile.fix_index_files(dir_path=base_dir)
Пример #4
0
def test_get_subunit_path_list():
    """Check the full, ordered subunit path list of the sample saMhitA.json."""
    # Number of second-level units expected under first-level units 1..7.
    second_level_counts = (8, 6, 5, 7, 7, 6, 5)
    expected_paths = [
        [major, minor]
        for major, count in enumerate(second_level_counts, start=1)
        for minor in range(1, count + 1)
    ]

    data_dir = os.path.dirname(text_data.__file__)
    unit_info_file = os.path.join(
        data_dir, "veda/taittirIya/bhAShya/bhaTTa-bhAskara/saMhitA.json")

    actual_paths = text_data.get_subunit_path_list(json_file=unit_info_file,
                                                   unit_path_list=[])
    assert actual_paths == expected_paths