def dump_deep_text(url_text_id, url_leaf_id_padding, dir_path, unit_info_file, get_collapsible_content=False, dry_run=False):
    """Dump every subunit of a sa.wikisource text to local markdown files.

    Iterates all subunit paths described by ``unit_info_file``, builds the
    wikisource URL for each, and hands off to ``dump_item`` to fetch and write
    ``<dir_path>/<NN>/.../<NNN>.md``.

    :param url_text_id: wikisource page name of the text (will be URL-quoted).
    :param url_leaf_id_padding: zero-padding width passed to ``get_wiki_path``
        for the leaf id in the URL.
    :param dir_path: root output directory.
    :param unit_info_file: JSON file describing the unit/subunit structure.
    :param get_collapsible_content: forwarded to ``dump_item``.
    :param dry_run: if True, only log what would be fetched; write nothing.
    """
    # NOTE: fixed from a per-iteration `import urllib` — `import urllib` alone
    # does not guarantee the `urllib.parse` submodule is loaded, and the import
    # belongs outside the loop anyway.
    import urllib.parse
    unit_data = text_data.get_subunit_data(unit_info_file, [])
    for subunit_path in text_data.get_subunit_path_list(
            json_file=unit_info_file, unit_path_list=[]):
        # All but the last path component become zero-padded directories.
        relative_dir_path = "/".join(["%02d" % x for x in subunit_path[:-1]])
        outfile_path = os.path.join(dir_path, relative_dir_path,
                                    "%03d.md" % subunit_path[-1])
        item_url = "https://sa.wikisource.org/wiki/%s/%s" % (
            urllib.parse.quote(url_text_id),
            get_wiki_path(subunit_path=subunit_path, unit_data=unit_data,
                          url_id_padding=url_leaf_id_padding))
        # Title is the leaf number rendered in Devanagari digits.
        title = sanscript.transliterate("%03d" % subunit_path[-1],
                                        sanscript.SLP1, sanscript.DEVANAGARI)
        logging.info("Getting %s to %s with title %s", item_url, outfile_path,
                     title)
        if not dry_run:
            dump_item(title=title, outfile_path=outfile_path,
                      item_url=item_url,
                      get_collapsible_content=get_collapsible_content)
def get_text(browser, text_id, base_dir, unit_info_file):
    """Scrape a text from the parankusha site into per-subunit markdown files.

    Navigates to ``text_id``, then for every subunit path in
    ``unit_info_file`` opens the path in the site tree, extracts the text
    spans from the ``divResults`` element, and writes them to
    ``<base_dir>/<subunit>/.../<leaf>.md``. Existing files are skipped,
    so interrupted runs can be resumed.

    :param browser: a selenium WebDriver already logged in to the site.
    :param text_id: link text identifying the text to open.
    :param base_dir: root output directory.
    :param unit_info_file: JSON file describing the unit/subunit structure.
    """
    parankusha.click_link_by_text(browser=browser, element_text=text_id)
    unit_data = text_data.get_subunit_data(unit_info_file, [])
    for subunit_path in text_data.get_subunit_path_list(
            json_file=unit_info_file, unit_path_list=[]):
        try:
            open_path(subunit_path=subunit_path, unit_data=unit_data)
        except NoSuchElementException:
            # Fold the tree back up so later iterations start clean, then
            # skip this subunit. (A stray debug `exit()` here previously made
            # the warning and the `continue` unreachable.)
            close_path(subunit_path=subunit_path, unit_data=unit_data)
            logging.warning("Skipping as Could not find element " +
                            str(traceback.format_exc()))
            continue
        outfile_path = os.path.join(base_dir,
                                    "/".join(map(str, subunit_path)) + ".md")
        if os.path.exists(outfile_path):
            logging.info("Skipping " + outfile_path)
        else:
            text_spans = browser.find_element_by_id(
                "divResults").find_elements_by_tag_name("span")
            lines = ["\n", "\n"]
            for span in text_spans:
                lines.append(span.text + "  \n")
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            # Explicit UTF-8: the content is Devanagari and must not depend
            # on the platform default encoding.
            with open(outfile_path, "w", encoding="utf-8") as outfile:
                outfile.writelines(lines)
        # Close the kANDa - else the driver may pick sarga from this kANDa
        # when it is to pick the sarga from the next kANDa?!
        close_path(subunit_path=subunit_path, unit_data=unit_data)
def get_structured_text(browser, start_nodes, base_dir, unit_info_file):
    """Scrape a hierarchically-structured text from the parankusha site.

    Like ``get_text`` but first browses through ``start_nodes`` and defines
    its own tree-navigation helpers. For every subunit path it opens the
    path, dumps the ``divResults`` spans to
    ``<base_dir>/<subunit>/.../<leaf>.md``, then closes the path again.
    Finally regenerates index files under ``base_dir``.

    :param browser: a selenium WebDriver already logged in to the site.
    :param start_nodes: nodes to expand before iterating subunits.
    :param base_dir: root output directory.
    :param unit_info_file: JSON file describing the unit/subunit structure.
    """

    def open_path(subunit_path, unit_data):
        # Expand tree nodes top-down: "<unit name><number>" link per level.
        logging.debug(list(zip(subunit_path,
                               unit_data["unitNameListInSite"])))
        for (subunit, unitNameInSite) in zip(
                subunit_path, unit_data["unitNameListInSite"]):
            element_text = "%s%d" % (unitNameInSite, subunit)
            click_link_by_text(browser=browser, element_text=element_text)

    def close_path(subunit_path, unit_data):
        # Collapse tree nodes bottom-up (reverse of open_path).
        logging.info(
            list(
                zip(reversed(subunit_path),
                    reversed(unit_data["unitNameListInSite"]))))
        for (subunit, unitNameInSite) in list(
                zip(reversed(subunit_path),
                    reversed(unit_data["unitNameListInSite"]))):
            element_text = "%s%d" % (unitNameInSite, subunit)
            logging.info(element_text)
            click_link_by_text(browser=browser, element_text=element_text)

    browse_nodes(browser=browser, start_nodes=start_nodes)
    os.makedirs(name=base_dir, exist_ok=True)
    unit_data = text_data.get_subunit_data(unit_info_file, [])
    for subunit_path in text_data.get_subunit_path_list(
            json_file=unit_info_file, unit_path_list=[]):
        try:
            open_path(subunit_path=subunit_path, unit_data=unit_data)
        except NoSuchElementException:
            # Fold the tree back up and skip this subunit. (A stray debug
            # `exit()` here previously made the warning and the `continue`
            # unreachable, aborting the whole run on the first miss.)
            close_path(subunit_path=subunit_path, unit_data=unit_data)
            logging.warning("Skipping as Could not find element " +
                            str(traceback.format_exc()))
            continue
        outfile_path = os.path.join(base_dir,
                                    "/".join(map(str, subunit_path)) + ".md")
        if os.path.exists(outfile_path):
            logging.info("Skipping " + outfile_path)
        else:
            text_spans = browser.find_element_by_id(
                "divResults").find_elements_by_tag_name("span")
            lines = ["\n", "\n"]
            for span in text_spans:
                lines.append(span.text + "  \n")
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            # Explicit UTF-8: the content is Devanagari and must not depend
            # on the platform default encoding.
            with open(outfile_path, "w", encoding="utf-8") as outfile:
                outfile.writelines(lines)
        # Close the kANDa - else the driver may pick sarga from this kANDa
        # when it is to pick the sarga from the next kANDa?!
        close_path(subunit_path=subunit_path, unit_data=unit_data)
    md_helper.MdFile.fix_index_files(dir_path=base_dir)
def test_get_subunit_path_list():
    """Check subunit-path enumeration for the bhaTTa-bhAskara saMhitA bhAShya.

    The expected value is every [kANDa, prashna] pair, built from the known
    prashna count of each kANDa rather than spelled out as a 44-element
    literal.
    """
    unit_info_file = os.path.join(
        os.path.dirname(text_data.__file__),
        "veda/taittirIya/bhAShya/bhaTTa-bhAskara/saMhitA.json")
    # kANDa number -> number of prashna-s in that kANDa.
    prashna_counts = {1: 8, 2: 6, 3: 5, 4: 7, 5: 7, 6: 6, 7: 5}
    expected = [[kaanda, prashna]
                for kaanda, count in prashna_counts.items()
                for prashna in range(1, count + 1)]
    assert text_data.get_subunit_path_list(
        json_file=unit_info_file, unit_path_list=[]) == expected