def get_text(text_id, base_dir):
    """Download Mahabharata chapters (plain-text pages from mahabharata.manipal.edu)
    and save each chapter as base_dir/<book>/<chapter>.md.

    :param text_id: Edition id; "BORI" selects the BORI structure file, anything else Kumbhakonam.
    :param base_dir: Root output directory.
    """
    # Pick the unit-structure metadata matching the requested edition.
    # (Previously there was a redundant default assignment here that both
    # branches below immediately overwrote — removed as dead code.)
    if text_id == "BORI":
        unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "mahaabhaarata/bori.json")
    else:
        unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "mahaabhaarata/kumbhakonam.json")
    for book_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
        book_index = "%02d" % book_index
        chapter_list = text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[book_index])
        book_data = text_data.get_subunit_data(json_file=unit_info_file, unit_path_list=[book_index])
        for chapter_index in chapter_list:
            infile_path = "http://mahabharata.manipal.edu/browse/%s/%s/%d.txt" % (
                book_data["alt_title"].lower(), text_id, chapter_index)
            outfile_path = os.path.join(base_dir, str(book_index), "%03d.md" % chapter_index)
            logging.info("Book %s chapter %d url: %s outpath: %s", book_index, chapter_index, infile_path, outfile_path)
            if os.path.exists(outfile_path):
                logging.info("Skipping " + outfile_path)
                continue
            # Fetch BEFORE opening the output file: the original opened the file first,
            # so a failed download left an empty file that would be skipped forever.
            resource = urllib.request.urlopen(infile_path)
            content = resource.read().decode("utf-8")
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            with open(outfile_path, "w") as outfile:
                outfile.write(content)
def dump_text(base_dir, do_transliteration=False):
    """Fetch Shatapatha-brahmana text kANDa-by-kANDa from TITUS and write one
    markdown file per adhyAya under base_dir/<kANDa>/<adhyAya>.md.

    :param base_dir: Root output directory.
    :param do_transliteration: When True, convert each sentence to Devanagari
        (kANDa 12 is treated as IAST input; others as TITUS scheme).
    """
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "shatapatha.json")
    titus_url = "http://titus.uni-frankfurt.de/texte/etcs/ind/aind/ved/yvw/sbm/sbm.htm"

    def clean_sentence(raw, kaanda):
        # Normalize accent marks and danda-punctuation; optionally transliterate.
        cleaned = roman.RomanScheme.simplify_accent_notation(raw)
        cleaned = cleaned.replace("/", ".")
        if not cleaned.endswith("."):
            cleaned = cleaned + ".."
        if do_transliteration:
            source_scheme = sanscript.IAST if kaanda == 12 else sanscript.TITUS
            cleaned = sanscript.transliterate(cleaned, source_scheme, sanscript.DEVANAGARI)
            cleaned = roman.RomanScheme.to_shatapatha_svara(cleaned)
        return cleaned

    for kaanda in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
        for adhyaaya in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[kaanda]):
            logging.info("kaanDa %d adhyaaya %d", kaanda, adhyaaya)
            outfile_path = os.path.join(base_dir, "%02d" % kaanda, "%02d" % adhyaaya + ".md")
            if os.path.exists(outfile_path):
                logging.info("Skipping " + outfile_path)
                continue
            titus.navigate_to_part(base_page_url=titus_url, level_3_id=kaanda, level_4_id=adhyaaya)
            lines = ["\n"] + [clean_sentence(s, kaanda) + " \n" for s in titus.get_text()]
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            with open(outfile_path, "w") as outfile:
                outfile.writelines(lines)
def dump_text(base_dir):
    """Scrape Shaunaka-samhita suktas from vedicheritage.gov.in into markdown
    files (base_dir/<kANDa>/<sukta>.md), driving a local Chrome browser.

    :param base_dir: Root output directory.
    """
    chrome_options = options.Options()
    chrome_options.headless = False  # headful browser
    browser = webdriver.Chrome(options=chrome_options)
    browser.implicitly_wait(6)
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "vedaH/shaunaka/samhitA.json")
    for kaanda in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
        for sukta in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[kaanda]):
            logging.info("kaanDa %d adhyaaya %d", kaanda, sukta)
            outfile_path = os.path.join(base_dir, "%02d" % kaanda, "%03d.md" % sukta)
            if os.path.exists(outfile_path):
                logging.info("Skipping " + outfile_path)
                continue
            url = "http://vedicheritage.gov.in/samhitas/atharvaveda-samhitas/shaunaka-samhita/kanda-%02d-sukta-%03d/" % (
                kaanda, sukta)
            logging.info("url %s to %s", url, outfile_path)
            browser.get(url=url)
            text = browser.find_element_by_id("videotext").text.replace("\n", " \n")
            # Use the page's <strong> heading (if any) as the file title.
            title_tags = browser.find_elements_by_css_selector("#videotext strong")
            if len(title_tags) > 0:
                title = "%03d %s" % (sukta, title_tags[0].text)
            else:
                title = "%03d" % sukta
            title = sanscript.transliterate(title, sanscript.HK, sanscript.DEVANAGARI)
            md_file = MdFile(file_path=outfile_path)
            md_file.dump_to_file(metadata={"title": title}, content=text, dry_run=False)
    browser.close()
def test_get_subunit_list():
    """Check the kANDa and adhyAya ranges read from shatapatha.json."""
    json_path = os.path.join(os.path.dirname(text_data.__file__), "shatapatha.json")
    kaanda_range = text_data.get_subunit_list(json_file=json_path, unit_path_list=[])
    assert kaanda_range == range(1, 15)
    adhyaaya_range = text_data.get_subunit_list(json_file=json_path, unit_path_list=[2])
    assert adhyaaya_range == range(1, 7)
def dump_text(base_dir):
    """Scrape Shaunaka-samhita suktas from vedicheritage.gov.in into raw
    markdown files under base_dir/<kANDa>/<sukta>.md.

    Relies on a selenium `browser` object defined in the enclosing module.
    NOTE(review): the metadata path "veda/shaunaka/..." differs from the
    sibling scraper's "vedaH/shaunaka/..." — confirm which is correct.

    :param base_dir: Root output directory.
    """
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "veda/shaunaka/samhitA.json")
    url_template = "http://vedicheritage.gov.in/samhitas/atharvaveda-samhitas/shaunaka-samhita/kanda-%02d-sukta-%03d/"
    for kaanda in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
        for sukta in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[kaanda]):
            logging.info("kaanDa %d adhyaaya %d", kaanda, sukta)
            outfile_path = os.path.join(base_dir, "%02d" % kaanda, "%03d.md" % sukta)
            if os.path.exists(outfile_path):
                logging.info("Skipping " + outfile_path)
                continue
            url = url_template % (kaanda, sukta)
            logging.info("url %s to %s", url, outfile_path)
            browser.get(url=url)
            # Markdown hard line-breaks: two trailing spaces before each newline.
            text = browser.find_element_by_id("videotext").text.replace("\n", " \n")
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            with open(outfile_path, "w") as outfile:
                logging.debug(text)
                outfile.write(text)
def get_text(text_id, base_dir):
    """Download Mahabharata chapters from the Manipal readMaha2 JSON API and
    save each chapter as base_dir/<book>/<chapter>.md.

    :param text_id: Edition id — "BORI", "KK" (Kumbhakonam) or "SV" (vAvilla);
        anything else falls back to the Kumbhakonam structure file.
    :param base_dir: Root output directory.
    """
    edition_structure_files = {
        "BORI": "mahaabhaaratam/bori.json",
        "KK": "mahaabhaaratam/kumbhakonam.json",
        "SV": "mahaabhaaratam/vAvilla.json",
    }
    relative_path = edition_structure_files.get(text_id, "mahaabhaaratam/kumbhakonam.json")
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__), relative_path)
    for book_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
        book_index = "%02d" % book_index
        for chapter_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[book_index]):
            infile_path = "http://mahabharata.manipal.edu/anu-projects/MAHE/apiphpv5/readMaha2.php?src=%s&parva=%s&adh=%03d" % (text_id, book_index, chapter_index)
            outfile_path = os.path.join(base_dir, str(book_index), "%03d.md" % chapter_index)
            if os.path.exists(outfile_path):
                logging.warning("Skipping " + outfile_path)
                continue
            logging.info("Book %s chapter %d url: %s outpath: %s", book_index, chapter_index, infile_path, outfile_path)
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            content = urllib.request.urlopen(infile_path).read().decode("utf-8")
            # Each API response is a JSON array of {"text": ...} records.
            chapter_lines = [line["text"] + " \n" for line in json.loads(content)]
            if len(chapter_lines) > 0:
                with open(outfile_path, "w") as outfile:
                    outfile.writelines(chapter_lines)
            else:
                logging.error("No lines found for %s:%s-%03d", text_id, book_index, chapter_index)
def dump_text(base_dir):
    """Scrape Vajasaneyi (Kanva) samhita chapters from vedicheritage.gov.in and
    write each as base_dir/<adhyAya>.md via MdFile.

    Relies on a selenium `browser` object defined in the enclosing module.

    :param base_dir: Root output directory.
    """
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "vedaH/vAjasaneyi/samhitA.json")
    for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
        logging.info("adhyAya %d", kaanda_index)
        outfile_path = os.path.join(base_dir, "%02d.md" % (kaanda_index))
        if os.path.exists(outfile_path):
            logging.info("Skipping " + outfile_path)
            continue
        url = "http://vedicheritage.gov.in/samhitas/yajurveda/shukla-yajurveda/vajasaneyi-kanva-samhita-chapter-%02d/" % (
            kaanda_index)
        logging.info("url %s to %s", url, outfile_path)
        browser.get(url=url)
        try:
            # Markdown hard line-breaks: two trailing spaces before each newline.
            text = browser.find_element_by_id("videotext").text
            text = text.replace("\n", " \n")
            title = "%02d" % kaanda_index
            title = sanscript.transliterate(title, sanscript.HK, sanscript.DEVANAGARI)
            md_file = MdFile(file_path=outfile_path)
            # Fixed: was `md=text` — the sibling Shaunaka dumper passes the page text
            # as `content=`, so the `md` keyword here was presumably a typo. TODO confirm
            # against the MdFile.dump_to_file signature.
            md_file.dump_to_file(metadata={"title": title}, content=text, dry_run=False)
        except NoSuchElementException:
            # Some chapters have no page at all; log and move on.
            logging.warning("Page missing! %s ", url)
def get_ramayana_text(browser, text_id, base_dir):
    """Drive an already-open site through kANDa/sarga links and save each sarga
    as base_dir/<kANDa>/<sarga>.md.

    :param browser: A selenium WebDriver already on the text-selection page.
    :param text_id: Link text of the edition to open; the Devanagari
        "nav-pATha" id selects the Baroda structure file, anything else Kumbhakonam.
    :param base_dir: Root output directory.
    """
    browser.find_element_by_link_text(text_id).click()
    # browser.implicitly_wait(2)
    # Pick the unit-structure metadata. (A redundant initial andhra.json
    # assignment was dead code — both branches below overwrote it — and
    # has been removed.)
    if text_id == "रामायणम्-नव्यपाठः":
        unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "raamaayana/baroda.json")
    else:
        unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "raamaayana/kumbhakonam.json")
    for kaanda_index in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
        kaanda_element = browser.find_element_by_link_text("Kanda-%d" % kaanda_index)
        # A direct kaanda_element.click() sometimes fails in headless mode with
        # ElementClickInterceptedException; a JS click works reliably. See
        # https://stackoverflow.com/questions/48665001 .
        browser.execute_script("arguments[0].click();", kaanda_element)
        sarga_list = text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[kaanda_index])
        for sarga_index in sarga_list:
            logging.info("Kanda %d Sarga %d", kaanda_index, sarga_index)
            outfile_path = os.path.join(base_dir, str(kaanda_index), "%03d" % sarga_index + ".md")
            if os.path.exists(outfile_path):
                logging.info("Skipping " + outfile_path)
                continue
            browser.find_element_by_link_text("Sarga-%d" % sarga_index).click()
            text_spans = browser.find_element_by_id(
                "divResults").find_elements_by_tag_name("span")
            lines = ["\n", "\n"]
            for span in text_spans:
                shloka = span.text
                # Break at daNDas and normalize the double-daNDa glyph.
                shloka = shloka.replace("। ", "। \n")
                shloka = shloka.replace("।।", " ॥ ")
                lines.append(shloka + " \n")
            os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
            with open(outfile_path, "w") as outfile:
                outfile.writelines(lines)
        # Close the kANDa - else the driver may pick a sarga from this kANDa
        # when it is to pick the sarga from the next kANDa.
        browser.find_element_by_link_text("Kanda-%d" % kaanda_index).click()
def dump_text(base_dir, do_transliteration=False):
    """Fetch Vajasaneyi-samhita adhyAyas from TITUS and write one markdown file
    per adhyAya as base_dir/<adhyAya>.md.

    :param base_dir: Root output directory.
    :param do_transliteration: Currently unused — accepted for signature
        compatibility with the sibling TITUS dumpers.
    """
    unit_info_file = os.path.join(os.path.dirname(text_data.__file__), "vedaH/vAjasaneyi/samhitA.json")
    titus_url = "http://titus.uni-frankfurt.de/texte/etcd/ind/aind/ved/yvw/vs/vs.htm"
    for adhyaaya in text_data.get_subunit_list(json_file=unit_info_file, unit_path_list=[]):
        logging.info("kaanDa %d", adhyaaya)
        outfile_path = os.path.join(base_dir, "%02d.md" % adhyaaya)
        if os.path.exists(outfile_path):
            logging.info("Skipping " + outfile_path)
            continue
        titus.navigate_to_part(base_page_url=titus_url, level_3_id=adhyaaya, level_3_frame="etaindexb")
        sentences = titus.dump_text()
        os.makedirs(name=os.path.dirname(outfile_path), exist_ok=True)
        with open(outfile_path, "w") as outfile:
            # Two trailing spaces = markdown hard line-break.
            outfile.write(" \n".join(sentences))