Example #1
File: almizan.py Project: sobhe/zolal
def aya_tokens(aya):
    parts = simple_aya(aya['text']).replace('  ', ' ').split(' ')
    raw_ayas = aya['raw'].split(' ')
    normalize_token = lambda s: s.replace('آ', 'ا').replace('ء', '').replace(
        'ئ', '').replace('أ', 'ا').replace('إ', 'ا').replace('ؤ', 'و')
    # words that appear verbatim in the cleaned text get their 1-based position directly
    tokens = [{
        'word': word,
        'stem': isri.stem(word),
        'id': parts.index(word) + 1
    } for word in raw_ayas if word in parts]
    not_found_words = [word for word in raw_ayas if word not in parts]
    not_found_parts = [part for part in parts if part not in raw_ayas]
    start = -1
    # align the remaining raw words with the remaining cleaned parts by
    # comparing hamza/alef-normalized forms
    for word in not_found_words:
        for part in not_found_parts:
            if not_found_parts.index(part) > start:
                if normalize_token(word).replace(
                        'ا', '') == normalize_token(part).replace(
                            'ا', '') or normalize_token(word).replace(
                                'و', 'ا') == normalize_token(part).replace(
                                    'و', 'ا') or normalize_token(word).replace(
                                        'ی',
                                        'ا') == normalize_token(part).replace(
                                            'ی', 'ا'):
                    found_ids = [token['id'] for token in tokens]
                    k = found_ids[found_ids.index(
                        parts.index(part) +
                        1)] if parts.index(part) + 1 in found_ids else 0
                    tokens.append({
                        'word': word,
                        'stem': isri.stem(word),
                        'id': parts.index(part, k) + 1
                    })
                    start = not_found_parts.index(part)
                    break

    # repeated words can collide on the same id; move later duplicates to the
    # next occurrence of the word in the cleaned text
    for i in range(len(tokens)):
        for j in range(i + 1, len(tokens)):
            if tokens[i]['id'] == tokens[j]['id']:
                try:
                    tokens[j]['id'] = parts.index(tokens[j]['word'],
                                                  tokens[i]['id']) + 1
                except ValueError as e:
                    print(str(e))
    return tokens
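
The excerpt above relies on two module-level names that it does not define: isri, an ISRI stemmer instance from NLTK, and simple_aya, the project's text normalizer. A minimal usage sketch under those assumptions follows; the simple_aya stand-in and the sample aya are hypothetical, and the real helpers live elsewhere in the zolal project.

import re
from nltk.stem.isri import ISRIStemmer

isri = ISRIStemmer()  # stemmer instance the snippet expects at module level

def simple_aya(text):
    # Hypothetical stand-in for the project helper: strip Arabic diacritics only.
    return re.sub(r'[\u064B-\u0652\u0670]', '', text)

# Hypothetical input; in the project, 'raw' carries the original orthography
# and 'text' the cleaned form.
aya = {'raw': 'بسم الله الرحمن الرحيم', 'text': 'بسم الله الرحمن الرحيم'}
for token in aya_tokens(aya):
    # Each token maps a raw word to its stem and its 1-based position in the cleaned text.
    print(token['id'], token['word'], token['stem'])

The alignment loop only kicks in when the raw and cleaned texts disagree on a word, which is why its fallback compares hamza/alef-normalized forms.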
Example #2
File: almizan.py Project: khajavi/zolal
def refine_section(section):

    # ayas
    for item in section.find(".aya").items():
        text = simple_aya(item.text())
        if text.startswith("(") and text.startswith("("):
            text = text[1:-1]
        item.text(text)

    # structure
    refine_translation(section)
    for item in section.children().items():
        if item[0].tag == "p":
            if len(item.text().strip()) <= 1:
                item.remove()
            else:
                if len(item.find(".trans")) >= 1:
                    for span in section.find(".trans").items():
                        item.append(span.outerHtml())
                        span.remove()
Example #3
File: almizan.py Project: sobhe/zolal
def refine_section(section):

    # ayas
    for item in section.find('.aya').items():
        text = simple_aya(item.text())
        if text.startswith('(') and text.endswith(')'):
            text = text[1:-1]
        item.text(text)

    # structure
    refine_translation(section)
    for item in section.children().items():
        if item[0].tag == 'p':
            if len(item.text().strip()) <= 1:
                item.remove()
            else:
                if len(item.find('.trans')) >= 1:
                    for span in section.find('.trans').items():
                        item.append(span.outerHtml())
                        span.remove()
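
Both variants operate on a PyQuery node and also call refine_translation and simple_aya, project helpers that the excerpt does not include. A rough usage sketch under those assumptions; the HTML, the no-op refine_translation stub, and the identity simple_aya are hypothetical stand-ins, not the project's real helpers.

from pyquery import PyQuery

def refine_translation(section):
    # Hypothetical no-op stand-in for the project helper.
    pass

def simple_aya(text):
    # Hypothetical stand-in: return the text unchanged.
    return text

section = PyQuery(
    '<div>'
    '<p><span class="aya">(بسم الله الرحمن الرحيم)</span> commentary text</p>'
    '<p> </p>'  # near-empty paragraph; refine_section removes it
    '</div>')
refine_section(section)
print(section)  # aya parentheses stripped, empty <p> dropped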
Example #4
File: almizan.py Project: khajavi/zolal
def aya_tokens(aya):
    parts = simple_aya(aya["text"]).replace("  ", " ").split(" ")
    raw_ayas = aya["raw"].split(" ")
    normalize_token = (
        lambda s: s.replace("آ", "ا")
        .replace("ء", "")
        .replace("ئ", "")
        .replace("أ", "ا")
        .replace("إ", "ا")
        .replace("ؤ", "و")
    )
    tokens = [
        {"word": word, "stem": isri.stem(word), "id": parts.index(word) + 1} for word in raw_ayas if word in parts
    ]
    not_found_words = [word for word in raw_ayas if word not in parts]
    not_found_parts = [part for part in parts if part not in raw_ayas]
    start = -1
    for word in not_found_words:
        for part in not_found_parts:
            if not_found_parts.index(part) > start:
                if (
                    normalize_token(word).replace("ا", "") == normalize_token(part).replace("ا", "")
                    or normalize_token(word).replace("و", "ا") == normalize_token(part).replace("و", "ا")
                    or normalize_token(word).replace("ی", "ا") == normalize_token(part).replace("ی", "ا")
                ):
                    found_ids = [token["id"] for token in tokens]
                    k = found_ids[found_ids.index(parts.index(part) + 1)] if parts.index(part) + 1 in found_ids else 0
                    tokens.append({"word": word, "stem": isri.stem(word), "id": parts.index(part, k) + 1})
                    start = not_found_parts.index(part)
                    break

    for i in range(len(tokens)):
        for j in range(i + 1, len(tokens)):
            if tokens[i]["id"] == tokens[j]["id"]:
                try:
                    tokens[j]["id"] = parts.index(tokens[j]["word"], tokens[i]["id"]) + 1
                except ValueError as e:
                    print(str(e))
    return tokens
Example #5
File: almizan.py Project: khajavi/zolal
def resolve_phrase(phrase, tokens, book):
    phrase = simple_aya(phrase.strip()).replace("‌", "").replace("ّ", "")
    if len(phrase) < 3:
        return None

    normalize_Alif_lam = lambda s: s[2:] if (s[:2] == "ال") else s
    normalize_arabic_letter = lambda s: s.replace("ة", "ه").replace("ؤ", "و").replace("إ", "ا").replace("أ", "ا")
    normalize_LBKF = lambda s: s[1:] if (s[:1] in "لبکف") else s

    matchings = [
        lambda token: phrase == token["word"],  # exact
        lambda token: normalize_arabic_letter(phrase)
        == normalize_arabic_letter(token["word"]),  # without arabic letters
        lambda token: normalize_Alif_lam(phrase) == normalize_Alif_lam(token["word"]),  # without Alif-lam
        lambda token: normalize_arabic_letter(normalize_Alif_lam(phrase))
        == normalize_arabic_letter(normalize_Alif_lam(token["word"])),  # without arabic letters and Alif-lam
        lambda token: normalize_arabic_letter(normalize_LBKF(phrase))
        == normalize_arabic_letter(normalize_LBKF(token["word"])),
        lambda token: isri.stem(phrase) == token["stem"],  # stemmed
    ]

    matchings2 = [
        lambda token, i: phrase.split()[i] == token["word"],  # exact
        lambda token, i: normalize_arabic_letter(phrase.split()[i])
        == normalize_arabic_letter(token["word"]),  # without arabic letters
        lambda token, i: normalize_Alif_lam(phrase.split()[i]) == normalize_Alif_lam(token["word"]),  # without Alif-lam
        lambda token, i: normalize_arabic_letter(normalize_Alif_lam(phrase.split()[i]))
        == normalize_arabic_letter(normalize_Alif_lam(token["word"])),  # without arabic letters and Alif-lam
        lambda token, i: normalize_arabic_letter(normalize_LBKF(phrase.split()[i]))
        == normalize_arabic_letter(normalize_LBKF(token["word"])),
        lambda token, i: isri.stem(phrase.split()[i]) == token["stem"],  # stemmed
    ]

    matched = []
    for aya, token_list in tokens.items():
        for token in token_list:
            for match in matchings:
                if match(token):
                    matched.append(("{0}_{1}_{2}-{2}".format(book, aya, token["id"]), token["word"]))
                    break
    if len(matched) == 1:
        return matched[0]

    matched = []
    if len(phrase.split()) == 2:
        for aya, token_list in tokens.items():
            for token1 in token_list:
                for token2 in token_list:
                    if token2["id"] == token1["id"] + 1:
                        for match1 in matchings2:
                            if match1(token1, 0):
                                for match2 in matchings2:
                                    if match2(token2, 1):
                                        matched.append(
                                            (
                                                "{0}_{1}_{2}-{3}".format(book, aya, token1["id"], token2["id"]),
                                                "{0} {1}".format(token1["word"], token2["word"]),
                                            )
                                        )
                                        break
                                break

    if len(matched) == 1:
        return matched[0]
    return None
Example #6
File: almizan.py Project: sobhe/zolal
def resolve_phrase(phrase, tokens, book):
    phrase = simple_aya(phrase.strip()).replace('‌', '').replace('ّ', '')
    if len(phrase) < 3:
        return None

    normalize_Alif_lam = lambda s: s[2:] if (s[:2] == 'ال') else s
    normalize_arabic_letter = lambda s: s.replace('ة', 'ه').replace(
        'ؤ', 'و').replace('إ', 'ا').replace('أ', 'ا')
    normalize_LBKF = lambda s: s[1:] if (s[:1] in 'لبکف') else s

    matchings = [
        lambda token: phrase == token['word'],  # exact
        lambda token: normalize_arabic_letter(phrase) ==
        normalize_arabic_letter(token['word']),  # without arabic letters
        lambda token: normalize_Alif_lam(phrase) == normalize_Alif_lam(token[
            'word']),  # without Alif-lam
        lambda token: normalize_arabic_letter(normalize_Alif_lam(phrase)) ==
        normalize_arabic_letter(normalize_Alif_lam(token['word'])
                                ),  # without arabic letters and Alif-lam
        lambda token: normalize_arabic_letter(normalize_LBKF(
            phrase)) == normalize_arabic_letter(normalize_LBKF(token['word'])),
        lambda token: isri.stem(phrase) == token['stem']  # stemmed
    ]

    matchings2 = [
        lambda token, i: phrase.split()[i] == token['word'],  # exact
        lambda token, i: normalize_arabic_letter(phrase.split()[i]) ==
        normalize_arabic_letter(token['word']),  # without arabic letters
        lambda token, i: normalize_Alif_lam(phrase.split()[
            i]) == normalize_Alif_lam(token['word']),  # without Alif-lam
        lambda token, i: normalize_arabic_letter(
            normalize_Alif_lam(phrase.split()[i])) == normalize_arabic_letter(
                normalize_Alif_lam(token['word'])
            ),  # without arabic letters and Alif-lam
        lambda token, i: normalize_arabic_letter(
            normalize_LBKF(phrase.split()[i])) == normalize_arabic_letter(
                normalize_LBKF(token['word'])),
        lambda token, i: isri.stem(phrase.split()[i]) == token['stem']  # stemmed
    ]

    matched = []
    for aya, token_list in tokens.items():
        for token in token_list:
            for match in matchings:
                if match(token):
                    matched.append(
                        ('{0}_{1}_{2}-{2}'.format(book, aya,
                                                  token['id']), token['word']))
                    break
    if len(matched) == 1:
        return matched[0]

    matched = []
    if len(phrase.split()) == 2:
        for aya, token_list in tokens.items():
            for token1 in token_list:
                for token2 in token_list:
                    if token2['id'] == token1['id'] + 1:
                        for match1 in matchings2:
                            if match1(token1, 0):
                                for match2 in matchings2:
                                    if match2(token2, 1):
                                        matched.append(
                                            ('{0}_{1}_{2}-{3}'.format(
                                                book, aya, token1['id'],
                                                token2['id']),
                                             '{0} {1}'.format(
                                                 token1['word'],
                                                 token2['word'])))
                                        break
                                break

    if len(matched) == 1:
        return matched[0]
    return None
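
resolve_phrase takes the phrase text, a mapping from aya identifiers to the token lists produced by aya_tokens, and a book identifier that ends up in the returned address of the form book_aya_start-end. A hypothetical call, assuming the excerpted functions plus the isri and simple_aya stand-ins from the first sketch are in scope; the aya key and book value below are made up.

aya = {'raw': 'بسم الله الرحمن الرحيم', 'text': 'بسم الله الرحمن الرحيم'}
tokens = {'1_1': aya_tokens(aya)}  # hypothetical aya key
result = resolve_phrase('بسم', tokens, 'almizan')
if result is not None:
    address, matched_word = result  # e.g. ('almizan_1_1_1-1', 'بسم')
    print(address, matched_word)

Single-word phrases go through the matchings list; two-word phrases fall through to matchings2, which additionally requires the two tokens to have consecutive ids. In either case a result is returned only when exactly one candidate matches, otherwise the function returns None.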