Пример #1
0
def is_candidate(tokens, index):
    """Tell whether a token before ``index`` is echoed after ``index``.

    Every token located before ``index`` is split on ``"_"``.  A piece
    counts as a hit when it is purely alphabetic and its lower-cased form
    (as produced by ``handle_string.toLowerCase``) occurs as a substring
    of the tokens located after ``index`` joined with single spaces.

    Args:
        tokens: sequence of string tokens.
        index: position of the pivot token; the token at ``index`` itself
            is on neither side of the comparison.

    Returns:
        True on the first hit, otherwise False.
    """
    # A slice works on Python 2 and 3 alike (the former ``xrange`` does not).
    checking_text = " ".join(tokens[index + 1:])
    for i in range(index):
        # Method call instead of ``str.split(...)`` so that non-``str``
        # string types (e.g. Python 2 ``unicode``) are accepted too.
        for piece in tokens[i].split("_"):
            if (piece.isalpha()
                    and handle_string.toLowerCase(piece) in checking_text):
                return True
    return False
Пример #2
0
def extract(doc_id="text", header_text="text", title="text"):
    """Yield at most one ``[doc_id, reference, date]`` row for a title.

    Nothing is yielded unless the lower-cased ``title`` contains
    "sửa đổi" or "bổ sung".  Otherwise the reference is resolved by the
    first of these strategies that succeeds:

    1. a document symbol (digits followed by ``/`` or ``-`` separated
       letter groups) found inside ``title``; the date is looked up in the
       rest of ``title``;
    2. the same kind of symbol found in ``header_text`` right after the
       title, optionally preceded by "SỐ"; the date is looked up in the
       header text that follows the symbol;
    3. the part of ``title`` after the first "của " taken verbatim; the
       date is looked up after the last occurrence of the title in
       ``header_text`` and is ``None`` when the title does not occur.

    Args:
        doc_id: identifier echoed as the first element of the row.
        header_text: header of the document, searched case-insensitively.
        title: title of the document.

    Yields:
        ``[doc_id, reference, date]`` where ``date`` is whatever
        ``findDate`` returns (or ``None`` in strategy 3 without a hit).
    """
    if not re.search(r'(sửa đổi|bổ sung)',
                     handle_string.toLowerCase(title), re.U):
        return

    flags = re.U | re.I
    symbol = r'[0-9]+(/[0-9]+)*((/|-)[A-ZĐƯ]+[0-9]*)+'
    # The symbol must be followed by one separator character.
    symbol_then_sep = symbol + r'(\s|\_|\#|\*|\.|\\)'

    # Strategy 1: the symbol is part of the title itself.
    in_title = re.search(symbol_then_sep, title, flags)
    if in_title is not None:
        yield [
            doc_id,
            re.search(symbol, in_title.group(), flags).group(),
            findDate(title[in_title.end(0):]),
        ]
        return

    upper_header = handle_string.toUpperCase(header_text)

    # Strategy 2: the symbol follows the title inside the header.
    in_header = re.search(
        re.escape(handle_string.toUpperCase(title.strip())) +
        r'\s(SỐ\s)*' + symbol_then_sep,
        upper_header, flags)
    if in_header is not None:
        yield [
            doc_id,
            re.search(symbol, in_header.group(0), flags).group(0),
            # The offset belongs to the header, so the header is sliced.
            # Slicing ``title`` here (as before) always produced an empty
            # tail because the match is longer than the title.
            findDate(header_text[in_header.end():]),
        ]
        return

    # Strategy 3: fall back to the text after the first "của ".
    after_of = re.search(r'của\s', title, flags)
    if after_of is None:
        return
    referenced = title[after_of.end():]

    # The LAST occurrence of the title in the header is the anchor.
    last_hit = None
    for last_hit in re.finditer(
            re.escape(handle_string.toUpperCase(title)), upper_header,
            flags):
        pass
    if last_hit is None:
        yield [doc_id, referenced, None]
    else:
        yield [doc_id, referenced, findDate(header_text[last_hit.end():])]
Пример #3
0
# Patterns used to pick the name of an enclosing unit out of a title.
_PART_NAME_RE = r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+"
_CHAPTER_NAME_RE = r"(chương)\s([A-Z]|[0-9])+"
_SECTION_NAME_RE = r"(mục)\s([A-Z]|[0-9])+"


def _first_match(pattern, text, group=0, flags=0):
    """Return ``group`` of the first match of ``pattern`` in ``text``, or None."""
    found = re.search(pattern, text, flags)
    if found is None:
        return None
    return found.group(group)


def _count_or_one(total):
    """Map a count of 0 to 1 so that an absent level is still visited once."""
    return 1 if total == 0 else total


def _amended_rows(fragment, title, prefix, type_modify):
    """Yield one row per unit of ``fragment`` whose name occurs in ``title``.

    The hierarchy part > chapter > section > law > item > point is walked
    top-down; as soon as a unit's name is found in ``title`` a row is
    emitted for it and its children are not visited.
    """

    def row(part=None, chap=None, sec=None, law=None, item=None, point=None):
        return prefix + [part, chap, sec, law, item, point, type_modify]

    for part_id in range(_count_or_one(divlaw.getTotalPart(fragment))):
        part = divlaw.getPart(fragment, part_id)
        if part['name'] != "":
            part_name = handle_string.toLowerCase(part['name'])
            if part_name in title:
                yield row(part=part_name)
                continue
        total_chap = _count_or_one(
            divlaw.getTotalChapter(fragment, part_id))
        for chap_id in range(total_chap):
            chap = divlaw.getChapter(fragment, part_id, chap_id)
            if chap['name'] != "":
                chap_name = handle_string.toLowerCase(chap['name'])
                if chap_name in title:
                    yield row(part=_first_match(_PART_NAME_RE, title),
                              chap=chap_name)
                    continue
            total_sec = _count_or_one(
                divlaw.getTotalSection(fragment, part_id, chap_id))
            for sec_id in range(total_sec):
                sec = divlaw.getSection(fragment, part_id, chap_id, sec_id)
                if sec['name'] != "":
                    sec_name = handle_string.toLowerCase(sec['name'])
                    if sec_name in title:
                        # The chapter found in the title goes into the row;
                        # it used to be written into ``chap['name']``,
                        # leaving the column empty.
                        yield row(
                            part=_first_match(_PART_NAME_RE, title),
                            chap=_first_match(_CHAPTER_NAME_RE, title),
                            sec=sec_name)
                        continue
                # Iterate over the number of laws (not of sections) with a
                # dedicated loop variable, so that the caller-supplied
                # ``law_index`` kept in ``prefix`` is never overwritten.
                total_law = _count_or_one(
                    divlaw.getTotalLaw(fragment, part_id, chap_id, sec_id))
                for law_pos in range(total_law):
                    law = divlaw.getLaw(fragment, part_id, chap_id, sec_id,
                                        law_pos)
                    if law['name'] != "":
                        law_name = handle_string.toLowerCase(law['name'])
                        if law_name in title:
                            yield row(
                                part=_first_match(_PART_NAME_RE, title),
                                chap=_first_match(_CHAPTER_NAME_RE, title),
                                sec=_first_match(_SECTION_NAME_RE, title),
                                law=law_name)
                            continue
                    total_item = _count_or_one(
                        divlaw.getTotalItem(fragment, part_id, chap_id,
                                            sec_id, law_pos))
                    for item_id in range(total_item):
                        item = divlaw.getItem(fragment, part_id, chap_id,
                                              sec_id, law_pos, item_id)
                        if (item['name'] != ""
                                and 'khoản ' + item['name'] in title):
                            # Enclosing units are searched only in the text
                            # that follows the item reference.
                            tail = title[getFirst(re.finditer(
                                r"khoản\s" + item['name'], title,
                                re.U)).end():]
                            yield row(
                                part=_first_match(_PART_NAME_RE, tail),
                                chap=_first_match(_CHAPTER_NAME_RE, tail),
                                sec=_first_match(_SECTION_NAME_RE, tail),
                                law=_first_match(r"điều [0-9]+\w*", tail),
                                item=item['name'])
                            continue
                        total_point = _count_or_one(
                            divlaw.getTotalPoint(fragment, part_id, chap_id,
                                                 sec_id, law_pos, item_id))
                        for point_id in range(total_point):
                            point = divlaw.getPoint(
                                fragment, part_id, chap_id, sec_id, law_pos,
                                item_id, point_id)
                            if (point['name'] != ""
                                    and 'điểm ' + point['name'] in title):
                                tail = title[getFirst(re.finditer(
                                    r"điểm " + point['name'], title,
                                    re.U)).end():]
                                yield row(
                                    part=_first_match(_PART_NAME_RE, tail),
                                    chap=_first_match(_CHAPTER_NAME_RE,
                                                      tail),
                                    sec=_first_match(_SECTION_NAME_RE, tail),
                                    law=_first_match(r"điều [0-9]+\w*",
                                                     tail),
                                    # A capture group replaces the former
                                    # fixed offset of 8, which equals the
                                    # length of "khoản " only for UTF-8
                                    # byte strings.
                                    item=_first_match(
                                        r"(?:khoản\s)([0-9]+\w*)", tail,
                                        group=1),
                                    point=point['name'])


def _inserted_row(title, prefix):
    """Build the row for a title that inserts text at a named position.

    The position is described by the text that follows "vào", "sau" or
    "trước"; the last element of the row is 3 when "sau" and 4 when
    "trước" occurs before that text, otherwise 2.
    """
    start = 0
    # The second pattern wins when both match, as in the original order.
    for anchor in (r"bổ\ssung\s.+(vào).{5}",
                   r"bổ\ssung\s.+(sau|trước).{5}"):
        found = re.search(anchor, title, re.U)
        if found is not None:
            # The five trailing characters already belong to the target.
            start = found.end() - 5
    tail = title[start:]
    head = title[:start]

    kind = 2
    if 'sau' in head:
        kind = 3
    elif 'trước' in head:
        kind = 4

    return prefix + [
        _first_match(_PART_NAME_RE, tail),
        _first_match(_CHAPTER_NAME_RE, tail),
        _first_match(_SECTION_NAME_RE, tail),
        _first_match(r"điều [0-9]+[A-zĐđ]*", tail),
        _first_match(r"(khoản\s)([0-9]+)", tail, group=2),
        # NOTE(review): the original slice was empty ([start:start]); the
        # letters after "điểm " are assumed to be the intended value, in
        # line with the number-only item column - confirm with consumers.
        _first_match(r"(điểm\s)([A-zđ]+)", tail, group=2, flags=re.U),
        kind,
    ]


def extract(law_id="text",
            part_index="int",
            chap_index="int",
            sec_index="int",
            law_index="int",
            item_index="int",
            point_index="int",
            numerical_symbol="text",
            titles="text",
            content="text",
            location_content="int",
            count="int"):
    """Yield rows naming the units of a law that an amending clause targets.

    ``titles`` is lower-cased, passed through ``rewriteString`` and split
    by ``divTitle``.  The last resulting title decides the mode: 2 when it
    matches the "vào" / "sau" / "trước" pattern, otherwise 1.

    * Mode 1: ``content`` is cut at every quoted passage; each piece is
      parsed with ``divlaw.divPartModifyLaw`` and one row is produced for
      every parsed unit whose name occurs in the title.
    * Mode 2: the target is read from the title alone.  One row is
      produced per quoted passage for as long as the row's type stays 2;
      once it becomes 3 or 4 no further rows follow.

    NOTE(review): only the last title returned by ``divTitle`` is used;
    confirm that earlier titles are meant to be ignored.

    Args:
        law_id, part_index, chap_index, sec_index, law_index, item_index,
        point_index: echoed unchanged as the first seven row elements.
        numerical_symbol, location_content, count: accepted but unused.
        titles: title text of the amending clause.
        content: body text of the amending clause.

    Yields:
        ``[law_id, part_index, chap_index, sec_index, law_index,
        item_index, point_index, part_name, chap_name, sec_name, law_name,
        item_name, point_name, type_modify]``.
    """
    titles = rewriteString(handle_string.toLowerCase(titles))

    # Without any title there is nothing to classify and nothing to emit
    # (this used to end in a NameError on the unbound ``type_modify``).
    title = None
    type_modify = None
    for title in divTitle(titles):
        if re.search(r"(.+vào.+)|(.+(sau|trước).{7,})", title, re.U):
            type_modify = 2
        else:
            type_modify = 1

    quote_starts = [
        found.start() for found in re.finditer(
            r"(\\n(\s|\_|\.|\*|\#)*\“(.(?!\“|\”))+.{2})|(\\n(\s|\_|\.|\*|\#)*\"(.(?!\"))+.{2})",
            content, re.DOTALL)
    ]
    prefix = [
        law_id, part_index, chap_index, sec_index, law_index, item_index,
        point_index
    ]

    last = len(quote_starts) - 1
    for position, start in enumerate(quote_starts):
        if type_modify == 1:
            # Each piece runs from one quote to the next (or to the end).
            # The former extra parse of the whole ``content`` is dropped:
            # its result was overwritten before use.
            if position != last:
                piece = content[start:quote_starts[position + 1]]
            else:
                piece = content[start:]
            fragment = divlaw.divPartModifyLaw(piece)
            for found_row in _amended_rows(fragment, title, prefix,
                                           type_modify):
                yield found_row
        elif type_modify == 2:
            inserted = _inserted_row(title, prefix)
            # A switch to 3 or 4 ends the emission of further rows.
            type_modify = inserted[-1]
            yield inserted
Пример #4
0
# Pattern strings shared by extract() and its helpers.  They are kept as plain
# strings (nothing is compiled at import time); the re module caches compiled
# patterns internally.
_PART_PATTERN = r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+"
_CHAPTER_PATTERN = r"(chương)\s([A-Z]|[0-9])+"
_SECTION_PATTERN = r"(mục)\s([A-Z]|[0-9])+"
# Start of a quoted passage that opens on a new line (curly or straight quote).
_QUOTE_START_PATTERN = r"(\n(\s|\_|\.|\*|\#)*\“(.(?!\“|\”))+.{2})|(\n(\s|\_|\.|\*|\#)*\"(.(?!\"))+.{2})"
# A phrase enclosed in curly or straight double quotes.
_QUOTED_PATTERN = r'(\“|\")(\s)*.+(\s)*(\”|\")'
# "điểm X khoản Y điều Z" | "khoản Y điều Z" | "điều Z"
_LOCATION_PATTERN = (
    r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))'
    r'|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))'
    r'|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))'
)
# Same as _LOCATION_PATTERN, preceded by the variants that end in "chương N ".
_LOCATION_WITH_CHAPTER_PATTERN = (
    r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s'
    r'|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s'
    r'|(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s'
    r'|(c|C)hương\s(\w{1,10}|\d{1,5})\s'
    r'|(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))'
    r'|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))'
    r'|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))'
)


def _first_text(pattern, text, flags=0):
    """Return the text of the first match of *pattern* in *text*, or None."""
    found = re.search(pattern, text, flags)
    if found is None:
        return None
    return found.group()


def _first_text_after_prefix(pattern, text, flags=0):
    """Return the first match of *pattern* minus its group-1 prefix, or None.

    *pattern* must start with a capturing group holding the keyword prefix
    (e.g. ``(khoản\\s)``).  Using the group boundary instead of a fixed
    offset keeps the result correct for both byte and unicode strings.
    """
    found = re.search(pattern, text, flags)
    if found is None:
        return None
    return text[found.end(1):found.end()]


def _second_group(pattern, text):
    """Return group 2 of the first match of *pattern* in *text*, or None."""
    found = re.search(pattern, text)
    if found is None:
        return None
    return found.group(2)


def _at_least_one(count):
    """Return 1 when *count* is 0 so the level is still visited once."""
    return 1 if count == 0 else count


def _modify_row(law_id, position, type_modify, numerical_symbol,
                released_date, part=None, chapter=None, section=None,
                law=None, item=None, point=None, text=None,
                from_replace=None, to_replace=None):
    """Build one 14-column output row in the fixed column order."""
    return [
        law_id, position, type_modify,
        part, chapter, section, law, item, point,
        text, from_replace, to_replace,
        numerical_symbol, released_date,
    ]


def _location_columns(segment, with_chapter):
    """Parse one matched location phrase into row columns.

    Returns a dict with ``law`` (full "điều N" text), ``item`` and ``point``
    (the value after the "khoản "/"điểm " keyword) and, when *with_chapter*
    is true, ``chapter`` (full "chương N" text).  Missing parts are None.
    """
    columns = {}
    if with_chapter:
        columns['chapter'] = _first_text(
            r'(c|C)hương\s(\w{1,10}|\d{1,5})', segment)
    columns['law'] = _first_text(
        r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)', segment)
    # Group 2 is the value after the keyword; this replaces the former
    # hard-coded [8:] slice, which was the UTF-8 byte length of the keyword.
    columns['item'] = _second_group(
        r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})', segment)
    columns['point'] = _second_group(
        r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})', segment)
    return columns


def _unit_rows(div_modify, title, make_row):
    """Yield type-1 rows for every structural unit of *div_modify* named in *title*.

    Walks part -> chapter -> section -> law -> item -> point through the
    ``divlaw`` accessors.  At each level, a unit whose name occurs in *title*
    produces one row and the walk does not descend below that unit.
    """
    lower = handle_string.toLowerCase
    for part_id in range(_at_least_one(divlaw.getTotalPart(div_modify))):
        part = divlaw.getPart(div_modify, part_id)
        if part['name'] != "":
            part_name = lower(part['name'])
            if part_name in title:
                yield make_row(1, part=part_name)
                continue
        total_chap = divlaw.getTotalChapter(div_modify, part_id)
        for chap_id in range(_at_least_one(total_chap)):
            chap = divlaw.getChapter(div_modify, part_id, chap_id)
            if chap['name'] != "":
                chap_name = lower(chap['name'])
                if chap_name in title:
                    yield make_row(
                        1,
                        part=_first_text(_PART_PATTERN, title),
                        chapter=chap_name)
                    continue
            total_sec = divlaw.getTotalSection(div_modify, part_id, chap_id)
            for sec_id in range(_at_least_one(total_sec)):
                sec = divlaw.getSection(div_modify, part_id, chap_id, sec_id)
                if sec['name'] != "":
                    sec_name = lower(sec['name'])
                    if sec_name in title:
                        # The chapter text goes into the row; it used to be
                        # written into chap['name'] and the row got None.
                        yield make_row(
                            1,
                            part=_first_text(_PART_PATTERN, title),
                            chapter=_first_text(_CHAPTER_PATTERN, title),
                            section=sec_name)
                        continue
                total_law = divlaw.getTotalLaw(
                    div_modify, part_id, chap_id, sec_id)
                for law_index in range(_at_least_one(total_law)):
                    law = divlaw.getLaw(
                        div_modify, part_id, chap_id, sec_id, law_index)
                    if law['name'] != "":
                        law_name = lower(law['name'])
                        if law_name in title:
                            yield make_row(
                                1,
                                part=_first_text(_PART_PATTERN, title),
                                chapter=_first_text(_CHAPTER_PATTERN, title),
                                section=_first_text(_SECTION_PATTERN, title),
                                law=law_name)
                            continue
                    total_item = divlaw.getTotalItem(
                        div_modify, part_id, chap_id, sec_id, law_index)
                    for item_id in range(_at_least_one(total_item)):
                        item = divlaw.getItem(
                            div_modify, part_id, chap_id, sec_id, law_index,
                            item_id)
                        if item['name'] != "":
                            if 'khoản ' + item['name'] in title:
                                # Guaranteed to match because the literal
                                # "khoản <name>" occurs in title; the name is
                                # escaped so it is never read as regex syntax.
                                anchor = re.search(
                                    r"khoản\s" + re.escape(item['name']),
                                    title, re.U)
                                tail = title[anchor.end():]
                                yield make_row(
                                    1,
                                    part=_first_text(_PART_PATTERN, tail),
                                    chapter=_first_text(
                                        _CHAPTER_PATTERN, tail),
                                    section=_first_text(
                                        _SECTION_PATTERN, tail),
                                    law=_first_text(
                                        r"điều [0-9]+\w*", tail),
                                    item=item['name'])
                                continue
                        total_point = divlaw.getTotalPoint(
                            div_modify, part_id, chap_id, sec_id, law_index,
                            item_id)
                        for point_id in range(_at_least_one(total_point)):
                            point = divlaw.getPoint(
                                div_modify, part_id, chap_id, sec_id,
                                law_index, item_id, point_id)
                            if point['name'] == "":
                                continue
                            if 'điểm ' + point['name'] not in title:
                                continue
                            anchor = re.search(
                                r"điểm " + re.escape(point['name']),
                                title, re.U)
                            tail = title[anchor.end():]
                            yield make_row(
                                1,
                                part=_first_text(_PART_PATTERN, tail),
                                chapter=_first_text(_CHAPTER_PATTERN, tail),
                                section=_first_text(_SECTION_PATTERN, tail),
                                law=_first_text(r"điều [0-9]+\w*", tail),
                                item=_first_text_after_prefix(
                                    r"(khoản\s)[0-9]+\w*", tail),
                                point=point['name'])


def _insertion_target(title):
    """Parse the target of a "bổ sung ... vào/sau/trước ..." title.

    Returns ``(row_type, columns)``: *row_type* is 9 when 'sau' occurs before
    the target, 10 when 'trước' does, otherwise None (caller keeps its current
    type); *columns* holds the part/chapter/section/law/item/point values
    found in the text after the anchor.
    """
    start_index = 0
    # The "sau|trước" anchor is tried last so that it wins over "vào".
    for anchor_pattern in (r"bổ\ssung\s.+(vào).{5}",
                           r"bổ\ssung\s.+(sau|trước).{5}"):
        anchor = re.search(anchor_pattern, title, re.U)
        if anchor is not None:
            start_index = anchor.end() - 5
    head = title[:start_index]
    tail = title[start_index:]

    point_name = None
    if re.search(r"(điểm\s)[A-z]+", tail, re.U) is not None:
        # Previously sliced [start:start], which always produced "".
        point_name = _first_text_after_prefix(
            r"(điểm\s)[A-zđ]+", tail, re.U)
    columns = {
        'part': _first_text(_PART_PATTERN, tail),
        'chapter': _first_text(_CHAPTER_PATTERN, tail),
        'section': _first_text(_SECTION_PATTERN, tail),
        'law': _first_text(r"điều [0-9]+[A-zĐđ]*", tail),
        'item': _first_text_after_prefix(r"(khoản\s)[0-9]+", tail),
        'point': point_name,
    }

    row_type = None
    if 'sau' in head:
        row_type = 9
    elif 'trước' in head:
        row_type = 10
    return row_type, columns


def _structure_rows(content, make_row):
    """Yield the rows for a type-1 modification (and its 8/9/10 variants)."""
    titles = getTitle(content)
    titles = handle_string.toLowerCase(titles)
    titles = rewriteString(titles)
    if titles is None:
        titles = "None"
        yield make_row(1, part="1")

    # Loop-invariant: start offsets of the quoted passages in content.
    quote_starts = [
        found.start()
        for found in re.finditer(_QUOTE_START_PATTERN, content, re.DOTALL)
    ]

    # NOTE(review): as in the original, once a title switches the type to 8
    # (and then possibly 9/10) it is never reset to 1 for later titles.
    # Preserved as-is; confirm whether this carry-over is intended.
    current_type = 1
    for title in divTitle(titles):
        if re.search(r"(.+vào.+)|(.+(sau|trước)[^\:]{7,})", title,
                     re.U) is not None:
            current_type = 8
        for pos, start in enumerate(quote_starts):
            if current_type == 1:
                # Each quoted passage runs up to the start of the next one.
                if pos + 1 < len(quote_starts):
                    passage = content[start:quote_starts[pos + 1]]
                else:
                    passage = content[start:]
                div_modify = divlaw.divPartModifyLaw(passage)
                for row in _unit_rows(div_modify, title, make_row):
                    yield row
            if current_type == 8:
                row_type, columns = _insertion_target(title)
                if row_type is not None:
                    current_type = row_type
                yield make_row(current_type, **columns)


def extract(
        law_id="text",
        type_modify="int",
        content="text",
        numerical_symbol="text",
        position="text",
        released_date="text"):
    """Yield 14-column rows describing what a modifying passage changes.

    Each row is ``[law_id, position, type, part, chapter, section, law, item,
    point, text, from_replace, to_replace, numerical_symbol, released_date]``.
    The Vietnamese keywords map to columns as: "phần" -> part, "chương" ->
    chapter, "mục" -> section, "điều" -> law, "khoản" -> item, "điểm" -> point.

    Behaviour by ``type_modify``:

    * 1 -- walk the titles and quoted passages of *content*; rows carry type
      1, or 8/9/10 for "bổ sung ... vào / sau / trước" titles.
    * 2 -- one row per location phrase in *content*, or one all-None row.
    * 3 -- "bổ sung (cụm) từ": rows carry the passage, the quoted phrase and
      the quoted phrase after "sau (cụm) từ"; an all-None row is emitted when
      either quoted phrase cannot be found.
    * 4 -- "thay (cụm) từ ... bằng (cụm) từ": same columns as type 3.
    * 5 -- "tên của ... được sửa đổi, bổ sung": rows carry the matched
      sentence and the first double-quoted text of *content*.
    * 6 -- rows carry the first quoted text of *content* in the last text
      column.
    * 7 -- rows carry the quoted text, or "NA" plus a trailing all-None row
      when *content* has no quoted text.

    Any other value yields nothing.  The string defaults are kept from the
    original signature; they look like column-type declarations for the
    calling framework (assumption -- confirm).
    """

    def make_row(row_type, **columns):
        return _modify_row(law_id, position, row_type, numerical_symbol,
                           released_date, **columns)

    if type_modify == 1:
        for row in _structure_rows(content, make_row):
            yield row
        return

    if type_modify == 2:
        hits = list(re.finditer(_LOCATION_PATTERN, content))
        if not hits:
            yield make_row(type_modify)
        # One row per location phrase (previously only the last one was
        # emitted because the yield sat outside the loop).
        for hit in hits:
            yield make_row(type_modify,
                           **_location_columns(hit.group(), False))

    elif type_modify == 3:
        marker = r'(B|b)ổ\ssung\s(cụm\s)*từ\s'
        for location in re.finditer(marker, content):
            # The passage runs from this marker up to the next marker, if any.
            sub_content = content[location.end():]
            following = re.search(marker, sub_content)
            if following is not None:
                sub_content = sub_content[:following.start()]
            source = re.search(
                _QUOTED_PATTERN + r'\s.*sau\s(cụm\s)*từ\s', sub_content)
            target = re.search(
                r'sau\s(cummy)*'.replace('cummy', r'cụm\s') + r'từ\s'
                + _QUOTED_PATTERN, sub_content)
            if source is None or target is None:
                # A missing second phrase used to raise AttributeError; it is
                # now reported like any other unparsable passage.
                yield make_row(type_modify)
                continue
            from_replace = re.search(_QUOTED_PATTERN, source.group()).group()
            to_replace = re.search(_QUOTED_PATTERN, target.group()).group()
            for hit in re.finditer(_LOCATION_PATTERN, sub_content):
                yield make_row(type_modify,
                               text=sub_content,
                               from_replace=from_replace,
                               to_replace=to_replace,
                               **_location_columns(hit.group(), False))

    elif type_modify == 4:
        marker = r'((t|T)hay\s)*(cụm\s)*từ\s'
        for location in re.finditer(marker, content):
            sub_content = content[location.end():]
            following = re.search(marker, sub_content)
            if following is None:
                # NOTE(review): this fallback row carries the raw content in
                # the part column, unlike the other types; kept as-is.
                yield make_row(type_modify, part=content)
                continue
            # sub_content_from: the text up to (and including) the next
            # marker, from which the phrase to be replaced is split out.
            sub_content_from = sub_content[:following.end()]
            source = re.search(
                _QUOTED_PATTERN + r'\s.*(được\s)*(thay\s)*bằng\s(cụm\s)*từ',
                sub_content_from)
            if source is None:
                continue
            target = re.search(
                r'(được\s)*(thay\s)*bằng\s(cụm\s)*từ\s' + _QUOTED_PATTERN,
                sub_content)
            if target is None:
                continue
            from_replace = re.search(_QUOTED_PATTERN, source.group()).group()
            to_replace = re.search(_QUOTED_PATTERN, target.group()).group()
            for hit in re.finditer(_LOCATION_PATTERN, sub_content):
                yield make_row(type_modify,
                               text=sub_content,
                               from_replace=from_replace,
                               to_replace=to_replace,
                               **_location_columns(hit.group(), False))

    elif type_modify == 7:
        text_delete = re.search(r'(\“|\").+(\”|\")', content, re.M | re.I)
        deleted = "NA" if text_delete is None else text_delete.group()
        for hit in re.finditer(_LOCATION_WITH_CHAPTER_PATTERN, content):
            yield make_row(type_modify,
                           text=deleted,
                           **_location_columns(hit.group(), True))
        if text_delete is None:
            yield make_row(type_modify)

    elif type_modify == 5:
        location = re.search(
            r'(t|T)ên của\s.*\sđược\s((s|S)ửa đổi\,\s)*((b|B)ổ sung\s)*',
            content)
        if location is None:
            yield make_row(type_modify)
        else:
            sub_content = location.group()
            text = re.search('("|").*("|")', content)
            # Content without a quoted text used to raise AttributeError.
            new_text = None if text is None else text.group()
            for hit in re.finditer(_LOCATION_WITH_CHAPTER_PATTERN,
                                   sub_content):
                yield make_row(type_modify,
                               text=sub_content,
                               to_replace=new_text,
                               **_location_columns(hit.group(), True))

    elif type_modify == 6:
        text = re.search(r'(\“|"|").*(\”|"|")', content)
        if text is None:
            yield make_row(type_modify)
        else:
            for hit in re.finditer(_LOCATION_WITH_CHAPTER_PATTERN, content):
                yield make_row(type_modify,
                               to_replace=text.group(),
                               **_location_columns(hit.group(), True))
Пример #5
0
def extract(mention_id="text",
            sentence_text="text",
            tokens="text[]",
            begin_exp="int",
            end_exp="int",
            begin_explain="int",
            end_explain="int",
            sentence_source="text[]",
            position_source="text[]"):
    """Yield scored feature rows for one mention.

    Every row has the shape ``[mention_id, score, feature_name]``:

    * ``forbidden_word_1`` (-10): one of the first two tokens of
      ``tokens[begin_explain..end_explain]`` or one of the last two tokens
      of ``tokens[begin_exp..end_exp]`` is a forbidden word (compared after
      ``handle_string.toLowerCase``).  Emitted once per offending token.
    * ``forbidden_word_2`` (-1): ``tokens[end_exp]`` is a forbidden word.
    * ``forbidden_word_3`` / ``_4`` / ``_5`` (-4): ``tokens[begin_exp:end_exp]``
      contains "nếu", "đối_với" or "trường_hợp" respectively (lower-case or
      capitalised spelling).
    * ``in_explain_words_law`` (+1): the joined ``tokens[begin_exp..end_exp]``
      text is shorter than 60 characters and a source sentence whose
      position contains the position part of ``mention_id`` matches the
      "giải thích từ ngữ" pattern.  Emitted once per matching source entry.

    ``sentence_text`` is accepted but not used.  The string defaults look
    like column-type declarations for an extractor framework -- TODO confirm
    against the (not visible) decorator/caller.
    """
    forbidden_word = [
        "nếu", "phải", "đó", "không", "được", "đã", "đồng_thời", "cần", "chỉ",
        'cụ_thể', 'ai', 'đây'
    ]

    def is_forbidden(token):
        return handle_string.toLowerCase(token) in forbidden_word

    for offset in range(2):
        if (begin_explain + offset <= end_explain
                and is_forbidden(tokens[begin_explain + offset])):
            yield [mention_id, -10, "forbidden_word_1"]
        if (end_exp - offset >= begin_exp
                and is_forbidden(tokens[end_exp - offset])):
            yield [mention_id, -10, "forbidden_word_1"]
    if is_forbidden(tokens[end_exp]):
        yield [mention_id, -1, "forbidden_word_2"]

    # Note: this slice excludes tokens[end_exp] itself.
    inner_tokens = tokens[begin_exp:end_exp]
    marker_features = (
        ("nếu", "Nếu", "forbidden_word_3"),
        ("đối_với", "Đối_với", "forbidden_word_4"),
        ("trường_hợp", "Trường_hợp", "forbidden_word_5"),
    )
    for lowered, capitalised, feature in marker_features:
        if lowered in inner_tokens or capitalised in inner_tokens:
            yield [mention_id, -4, feature]

    # Position part of the id: everything after the first '_' up to and
    # including the second-to-last '_'.  Index 0 is never considered when
    # searching from the right; if two underscores cannot be found there the
    # result is empty, which is a substring of every position.
    last_sep = mention_id.rfind('_', 1)
    prev_sep = mention_id.rfind('_', 1, last_sep) if last_sep != -1 else -1
    if prev_sep == -1:
        position_require = mention_id[:0]
    else:
        position_require = mention_id[mention_id.find('_') + 1:prev_sep + 1]

    # tokens[end_exp] was already indexed above, so for in-range bounds this
    # slice holds exactly the tokens begin_exp..end_exp inclusive.
    explain_text = " ".join(tokens[begin_exp:end_exp + 1])
    if len(explain_text) >= 60:
        return

    for index, source_position in enumerate(position_source):
        if position_require not in source_position:
            continue
        if divlaw.lenIterator(
                re.finditer(r"giải(\s|\_)thích(\s|\_)từ(\s|\_)ngữ",
                            sentence_source[index], re.U | re.I)) > 0:
            yield [mention_id, 1, "in_explain_words_law"]
Пример #6
0
def extract(doc_id="text",
            position="text",
            modify_title="text",
            released_date="text",
            doc_id_resources="text[]",
            doc_title_resources="text[]",
            doc_symbol_resources="text[]",
            type_doc="text[]",
            released_date_resources="text[]"):
    """Resolve the document referenced by ``modify_title`` to a known id.

    The candidate documents are given as parallel lists
    (``doc_id_resources``, ``doc_title_resources``, ``doc_symbol_resources``,
    ``type_doc``, ``released_date_resources``).

    * If ``modify_title`` starts with a document symbol (digits, optional
      ``/digits`` groups, then ``/`` or ``-`` separated upper-case parts),
      candidates are compared by lower-cased symbol.
    * Otherwise candidates are compared by the lower-cased, stripped text
      ``type_doc[i] + " " + doc_title_resources[i]`` against ``modify_title``.

    In both cases a candidate only matches when its release date equals
    ``released_date`` or ``released_date`` is ``None``.  Candidates whose
    release date cannot be parsed as ``%Y-%m-%d`` are reported on stderr and
    skipped.

    Yields exactly one row ``[doc_id, position, matched_id]`` where
    ``matched_id`` is the first matching candidate id or ``"NA"``.  If
    ``released_date`` itself is non-empty but unparsable, the problem is
    reported on stderr and nothing is yielded.
    """
    date_format = "%Y-%m-%d"

    def report_bad_date(raw_date):
        # sys.stderr.write works on both Python 2 and 3, unlike `print >>`.
        sys.stderr.write("%s %s %s\n" % (doc_id, position, raw_date))

    released_date_temp = None
    if released_date:
        try:
            released_date_temp = time.strptime(released_date, date_format)
        except (TypeError, ValueError):
            report_bad_date(released_date)
            return

    def same_release_date(resource_date):
        # NOTE(review): an empty-string released_date is not None, so it
        # leaves released_date_temp as None and never matches any candidate.
        return released_date is None or resource_date == released_date_temp

    symbol_match = re.match(r"[0-9]+(/[0-9]+)*((/|-)[A-ZĐƯ]+[0-9]*)+",
                            modify_title)
    if symbol_match is not None:
        symbol = handle_string.toLowerCase(symbol_match.group(0))
        for i, resource_symbol in enumerate(doc_symbol_resources):
            try:
                resource_date = time.strptime(released_date_resources[i],
                                              date_format)
            except (TypeError, ValueError):
                report_bad_date(released_date_resources[i])
                continue
            if (handle_string.toLowerCase(resource_symbol) == symbol
                    and same_release_date(resource_date)):
                yield [doc_id, position, doc_id_resources[i]]
                return
        yield [doc_id, position, "NA"]
        return

    wanted_title = handle_string.toLowerCase(
        handle_string.to_unicode(modify_title)).strip()
    for i, resource_title in enumerate(doc_title_resources):
        candidate_title = handle_string.toLowerCase(
            handle_string.to_unicode(type_doc[i] + " " + resource_title))
        try:
            resource_date = time.strptime(released_date_resources[i],
                                          date_format)
        except (TypeError, ValueError):
            report_bad_date(released_date_resources[i])
            # Skip this candidate; without this the comparison below would
            # use a stale (or not yet assigned) date from an earlier turn.
            continue
        if (candidate_title.strip() == wanted_title
                and same_release_date(resource_date)):
            yield [doc_id, position, doc_id_resources[i]]
            return
    yield [doc_id, position, "NA"]
Пример #7
0
def extract(
    mention_id ="text",
    sentence_text ="text",
    tokens ="text[]",
    begin_exp ="int",
    end_exp ="int",
    begin_explain ="int",
    end_explain ="int",
    sentence_source ="text[]",
    position_source ="text[]"
    ):
	"""Yield scored feature rows ``[mention_id, score, feature_name]``.

	* ``forbidden_word_1`` (-10): ``tokens[end_exp + 2]`` or
	  ``tokens[end_exp + 3]`` (when not past ``end_explain``), or one of the
	  last two tokens of ``tokens[begin_exp..end_exp]``, is a forbidden word
	  after ``handle_string.toLowerCase``.  One row per offending token.
	* ``forbidden_word_2`` (-1): ``tokens[end_exp]`` is a forbidden word.
	* ``forbidden_word_3`` (-1): "nếu" / "Nếu" occurs in
	  ``tokens[begin_exp:end_exp]``.
	* ``in_explain_words_law`` (+1): one row per source entry whose position
	  contains the position part of ``mention_id`` and whose sentence matches
	  the "Giải_thích từ_ngữ" pattern (case-insensitive).

	``sentence_text`` is accepted but not used.
	"""
	blocked_words = ["nếu","phải","đó","không","được","đã","đồng_thời","cần", "chỉ",'cụ_thể']

	for offset in (0, 1):
		ahead = end_exp + 2 + offset
		if ahead <= end_explain and handle_string.toLowerCase(tokens[ahead]) in blocked_words:
			yield [mention_id, -10, "forbidden_word_1"]
		behind = end_exp - offset
		if behind >= begin_exp and handle_string.toLowerCase(tokens[behind]) in blocked_words:
			yield [mention_id, -10, "forbidden_word_1"]

	if handle_string.toLowerCase(tokens[end_exp]) in blocked_words:
		yield [mention_id, -1, "forbidden_word_2"]

	# This slice stops before tokens[end_exp].
	expression_tokens = tokens[begin_exp:end_exp]
	if "nếu" in expression_tokens or "Nếu" in expression_tokens:
		yield [mention_id, -1, "forbidden_word_3"]

	# Position part of the id: the text after the first '_' up to and
	# including the second-to-last '_'.  Index 0 is never inspected when
	# searching from the right; if two underscores are not found there the
	# result is empty and therefore contained in every position string.
	last_sep = mention_id.rfind('_', 1)
	prev_sep = -1
	if last_sep != -1:
		prev_sep = mention_id.rfind('_', 1, last_sep)
	if prev_sep == -1:
		position_require = mention_id[:0]
	else:
		first_sep = mention_id.find('_')
		position_require = mention_id[first_sep + 1:prev_sep + 1]

	for index, source_position in enumerate(position_source):
		if position_require not in source_position:
			continue
		heading_hits = re.finditer(r"Giải_thích\stừ_ngữ", sentence_source[index], re.U|re.I)
		if divlaw.lenIterator(heading_hits) > 0:
			yield [mention_id, 1, "in_explain_words_law"]