def add_hadith(chapter: Chapter, hadith_ar: List[str], hadith_en: List[str], part_type: PartType = PartType.Hadith):
    """Append a new verse built from Arabic text and its English translation.

    Args:
        chapter: Chapter whose ``verses`` list receives the new entry.
        hadith_ar: Arabic paragraphs, stored unchanged as the verse text.
        hadith_en: English paragraphs; every match of
            ``END_OF_HADITH_CLEANUP_PATTERN`` is removed from each one before
            it is stored under ``HUBEALI_TRANSLATION_ID``.
        part_type: Part type recorded on the new verse.
    """
    entry = Verse()
    entry.part_type = part_type
    entry.text = hadith_ar

    # Strip the end-of-hadith marker from each English paragraph.
    cleaned_en = []
    for paragraph in hadith_en:
        cleaned_en.append(END_OF_HADITH_CLEANUP_PATTERN.sub('', paragraph))

    entry.translations = {}
    entry.translations[HUBEALI_TRANSLATION_ID] = cleaned_en

    chapter.verses.append(entry)
def build_verses(file):
    """Read a UTF-8 text file and return one ``Verse`` per content line.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping, or that start with ``#``, are skipped.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of ``Verse`` objects in file order. Each has ``part_type`` set
        to ``PartType.Verse``, ``text`` set to a one-element list holding the
        stripped line, and an empty ``translations`` dict.
    """
    logger.info("Adding Quran file %s", file)
    verses = []
    with open(file, 'r', encoding='utf8') as qfile:
        # Iterate the file object directly; there is no need to load every
        # line into memory first with readlines().
        for line in qfile:
            text = line.strip()
            if not text or text.startswith('#'):
                continue
            verse = Verse()
            verse.part_type = PartType.Verse
            verse.text = [text]
            verse.translations = {}
            verses.append(verse)
    return verses
def build_alhassanain_baabs(file) -> List[Chapter]:
    """Parse an HTML file into a list of book-level chapters ("baabs").

    The file is split on ``<br clear=all>``. A section without any
    ``.Heading1Center`` element is ignored. Otherwise its book titles are
    extracted with ``extract_headings`` and the section is merged into an
    already-built book with the same English title, or starts a new one.

    Within a section the ``.Heading2Center`` elements are consumed two at a
    time (a trailing odd one is used alone) to form each chapter's titles.
    The siblings following the last heading of a pair are then walked until
    the next ``Heading2Center`` tag: a ``libAr`` tag starts a new hadith and
    each ``libNormal`` tag adds a translation paragraph to the current one.

    Args:
        file: Path of the HTML file to read (opened as UTF-8).

    Returns:
        The list of book chapters, in order of first appearance.
    """
    baabs: List[Chapter] = []
    logger.info("Adding Al-Kafi file %s", file)

    translation = Translation()
    translation.name = "HubeAli.com"
    translation.lang = Language.EN.value
    translation.id = HUBEALI_TRANSLATION_ID

    with open(file, 'r', encoding='utf8') as qfile:
        inner_html = qfile.read()

    for section in inner_html.split("<br clear=all>"):
        section_soup = BeautifulSoup(section, 'html.parser')

        headings = section_soup.select(".Heading1Center")
        if not headings:
            continue

        # process "the book of" chapter
        baab_titles = extract_headings(headings)
        en_title = baab_titles[Language.EN.value]

        # Reuse a previously created book with the same English title so that
        # sections belonging to one book end up under a single entry.
        baab = next(
            (existing for existing in baabs
             if existing.titles[Language.EN.value] == en_title),
            None)
        if baab is None:
            baab = Chapter()
            baab.part_type = PartType.Book
            baab.titles = baab_titles
            baab.chapters = []
            baabs.append(baab)

        # process chapters: headings are taken in pairs, a trailing odd
        # heading forms a chapter on its own
        chapters = section_soup.select(".Heading2Center")
        for start in range(0, len(chapters), 2):
            title_tags = chapters[start:start + 2]

            chapter = Chapter()
            chapter.part_type = PartType.Chapter
            chapter.titles = extract_headings(title_tags)
            chapter.verse_translations = [translation]
            chapter.verses = []
            baab.chapters.append(chapter)

            element = title_tags[-1].next_sibling
            verse = None  # hadith currently being assembled, if any
            while element is not None:
                is_a_tag = is_tag(element)
                # Tags without a class attribute yield an empty list instead
                # of raising KeyError.
                classes = (element.get('class') or []) if is_a_tag else []

                # Stop at the next chapter heading, or at anything that is
                # neither a text node nor a tag.
                if not isinstance(element, NavigableString) and (
                        not is_a_tag or 'Heading2Center' in classes):
                    break

                if 'libAr' in classes:
                    # push the last verse if its not the start of chapter
                    if verse is not None:
                        chapter.verses.append(verse)

                    verse = Verse()
                    verse.part_type = PartType.Hadith
                    verse.translations = {}
                    verse.translations[HUBEALI_TRANSLATION_ID] = []
                    verse.text = [element.get_text(strip=True)]

                if 'libNormal' in classes:
                    if verse is None:
                        # No Arabic text has been seen yet, so there is no
                        # hadith to attach this paragraph to.
                        logger.warning(
                            "Skipping translation paragraph found before any Arabic text in %s",
                            file)
                    else:
                        verse.translations[HUBEALI_TRANSLATION_ID].append(
                            element.get_text(strip=True))

                element = element.next_sibling

            if verse is not None:
                chapter.verses.append(verse)

    return baabs
def _record_sequence_error(error_msg):
    """Log *error_msg* as a warning and remember it in ``SEQUENCE_ERRORS``."""
    logger.warning(error_msg)
    SEQUENCE_ERRORS.append(error_msg)


def _chapter_site_path(chapter, site_path):
    """Return the last crumb's path, or *site_path* with '/' replaced by ':' when there are no crumbs."""
    if chapter.crumbs:
        return chapter.crumbs[-1].path
    return site_path.replace('/', ':')


def add_chapter_content(chapter: Chapter, filepath, hadith_index=0):
    """Merge the hadith found in an HTML file into ``chapter.verses``.

    The file is split on ``<hr>`` / ``<hr/>``; every fragment not rejected by
    ``we_dont_care`` is treated as one hadith. Its leading right-to-left
    paragraphs are the Arabic text and the next paragraph is the English
    translation, which is stored on the verse under
    ``SARWAR_TRANSLATION_ID``. When the chapter has no verse left to receive
    the translation, a new verse is appended. Discrepancies are logged and
    collected in ``SEQUENCE_ERRORS``.

    Args:
        chapter: Chapter to update in place. ``sarwar_translation`` is added
            to ``verse_translations`` if no entry with that id exists, and
            ``titles['en']`` is filled from the file when missing.
        filepath: Path of the HTML file. A file named ``0.html`` inside a
            directory is skipped.
        hadith_index: Index into ``chapter.verses`` at which to start.
    """
    # Accept both path separators so the zero-file guard also works when the
    # path was not produced on Windows.
    if filepath.endswith(('\\0.html', '/0.html')):
        _record_sequence_error(f"Skipping zero file {filepath}")
        return

    verses = chapter.verses
    heading_count = len([x for x in verses if x.part_type == PartType.Heading])

    sarwar_exists = next(
        (item for item in chapter.verse_translations if item.id == SARWAR_TRANSLATION_ID),
        None)
    if not sarwar_exists:
        chapter.verse_translations.append(sarwar_translation)

    with open(filepath, 'r', encoding='utf8') as qfile:
        file_html = qfile.read()

    if 'en' not in chapter.titles:
        file_soup = BeautifulSoup(file_html, 'html.parser')
        card_body = file_soup.find('div', 'card-body')
        chapter.titles['en'] = get_contents(card_body.find('h3'))

    ##### Processing each hadith separately
    for hadith_html in re.split(r'<hr/?>', file_html):
        if we_dont_care(hadith_html):
            continue

        soup = BeautifulSoup(hadith_html, 'html.parser')
        all_paras = soup.find_all('p')

        # Leading right-to-left paragraphs form the Arabic text; the first
        # paragraph after them is the English translation.
        para_index = 0
        hadith_ar = []
        while is_rtl_tag(all_paras[para_index]):
            hadith_ar.append(get_contents(all_paras[para_index]))
            para_index += 1

        hadith_en = get_contents(all_paras[para_index])
        para_index += 1

        if hadith_index >= len(verses) - heading_count:
            # hubeali rightly splits first chapter in book of inheritance into two
            # but thaqalayn.net has it as one chapter, so we'll skip adding ahadith
            if chapter.path == '/books/al-kafi:7:2:1':
                break

            verse = Verse()
            verse.text = hadith_ar
            # NOTE(review): this stores the enum's .value while the lookup
            # branch below compares against the PartType member itself;
            # confirm which form consumers expect.
            verse.part_type = PartType.Hadith.value
            verse.translations = {}
            verses.append(verse)

            site_path = sitepath_from_filepath(filepath)
            my_site_path = _chapter_site_path(chapter, site_path)
            _record_sequence_error(
                f"Appending new hadith from Sarwar to hubeali, hadith #{hadith_index+1} from https://thaqalayn.net/chapter/{site_path} to https://thaqalayn.netlify.app/#{my_site_path}")
        else:
            # TODO: create new verse if the verse at this index doesn't match the one being inserted
            # perhaps use https://github.com/ztane/python-Levenshtein or https://pypi.org/project/jellyfish/
            verse = verses[hadith_index]

            # Step over a heading entry to reach the verse after it.
            if verse.part_type == PartType.Heading:
                hadith_index += 1
                verse = verses[hadith_index]

            if verse.part_type != PartType.Hadith:
                my_site_path = _chapter_site_path(
                    chapter, sitepath_from_filepath(filepath))
                _record_sequence_error(
                    f"Hadith index {hadith_index} is of part_type {verse.part_type} in https://thaqalayn.netlify.app/#{my_site_path}")

        verse.translations[SARWAR_TRANSLATION_ID] = [hadith_en]

        if len(all_paras) > para_index + 1:
            grading_title = get_contents(all_paras[para_index])
            para_index += 1
            if grading_title.startswith('Grading:'):
                # Everything after the title except the last three paragraphs
                # is stored as the grading.
                verse.gradings = [
                    get_contents(grading_para)
                    for grading_para in all_paras[para_index:-3]
                ]

        hadith_index += 1

    # Volume 8 of al-kafi is one file per hadith on thaqalayn.net and it'll warn on every page
    # since there is always more ahadith on hubeali's chapter
    if hadith_index != len(verses) - heading_count and 'al-kafi:8:1' not in chapter.path:
        site_path = sitepath_from_filepath(filepath)
        my_site_path = _chapter_site_path(chapter, site_path)
        _record_sequence_error(
            f"Sarwar has {hadith_index} hadith but hubeali has {len(verses)} hadith: https://thaqalayn.net/chapter/{site_path} vs https://thaqalayn.netlify.app/#{my_site_path}")