def is_candidate(tokens, index):
    """Tell whether a word before ``index`` shows up again after ``index``.

    Every token before position ``index`` is split on "_"; each purely
    alphabetic piece is lower-cased with ``handle_string.toLowerCase`` and
    searched, as a substring, in the space-joined tokens that come after
    position ``index``.  The token at ``index`` itself is on neither side.

    Args:
        tokens: list of string tokens.
        index: position of the pivot token.

    Returns:
        True as soon as one piece is found in the trailing text, else False.
    """
    # Slicing replaces the Python-2-only ``xrange`` walk, so the function
    # runs unchanged on Python 2 and Python 3.
    checking_text = " ".join(tokens[index + 1:])

    # ``max`` keeps a negative index from selecting tokens: the original
    # ``range(0, index)`` loop was simply empty in that case.
    # NOTE(review): only the piece is lower-cased, the trailing text is not,
    # so the lookup is case-sensitive on that side -- confirm this is intended.
    for token in tokens[:max(index, 0)]:
        # ``token.split`` (instead of ``str.split(token, ...)``) also accepts
        # unicode tokens on Python 2.
        for piece in token.split("_"):
            if piece.isalpha() and \
                    handle_string.toLowerCase(piece) in checking_text:
                return True
    return False
def extract(doc_id="text", header_text="text", title="text"):
    """Yield one ``[doc_id, symbol_or_name, date]`` row for an amending title.

    Nothing is yielded unless the lower-cased title contains "sửa đổi" or
    "bổ sung".  Otherwise exactly one row is produced, using the first rule
    that applies:

    1. The title itself contains a document symbol followed by a separator
       (for instance ``12/2008/QH12``): the symbol, plus ``findDate`` of the
       title text after the match.
    2. The upper-cased header contains the stripped title followed by such a
       symbol: that symbol, plus ``findDate`` of the header text after the
       match.
    3. Fallback: the title text after the first "của " (empty string when
       absent), plus ``findDate`` of the header text after the last
       occurrence of the title in the header, or ``None`` when the header
       does not contain the title.

    Args:
        doc_id: identifier copied into the row.
        header_text: header text searched by rules 2 and 3.
        title: title text searched by rules 1 and 3.
    """
    if not re.search(r'(sửa đổi|bổ sung)', handle_string.toLowerCase(title),
                     re.U):
        return

    flags = re.U | re.I
    symbol_re = r'[0-9]+(/[0-9]+)*((/|-)[A-ZĐƯ]+[0-9]*)+'
    # A symbol only counts when one of these separator characters follows it.
    terminated_symbol_re = symbol_re + r'(\s|\_|\#|\*|\.|\\)'

    # Rule 1: symbol inside the title.
    in_title = re.search(terminated_symbol_re, title, flags)
    if in_title is not None:
        yield [
            doc_id,
            re.search(symbol_re, in_title.group(), flags).group(),
            findDate(title[in_title.end(0):]),
        ]
        return

    # Rule 2: "<TITLE> [SỐ ]<symbol>" inside the header.  A single
    # ``re.search`` yields the same first match the original obtained by
    # scanning twice (a count via ``divlaw.lenIterator`` -- assumed to only
    # count matches, confirm -- followed by a re-scan).
    upper_header = handle_string.toUpperCase(header_text)
    in_header = re.search(
        re.escape(handle_string.toUpperCase(title.strip()))
        + r'\s(SỐ\s)*' + terminated_symbol_re,
        upper_header, flags)
    if in_header is not None:
        yield [
            doc_id,
            re.search(symbol_re, in_header.group(0), flags).group(0),
            # The offset belongs to the header, so the header is sliced.
            # The original sliced ``title`` with this offset, which can only
            # ever leave trailing whitespace for ``findDate``.
            findDate(header_text[in_header.end():]),
        ]
        return

    # Rule 3: fall back to whatever follows the first "của " in the title.
    remainder = ""
    after_of = re.search(r'của\s', title, flags)
    if after_of is not None:
        remainder = title[after_of.end():]

    # The date is looked up after the LAST occurrence of the title.
    last_hit = None
    for last_hit in re.finditer(re.escape(handle_string.toUpperCase(title)),
                                upper_header, flags):
        pass
    if last_hit is None:
        yield [doc_id, remainder, None]
    else:
        yield [doc_id, remainder, findDate(header_text[last_hit.end():])]
# Patterns that pull structural references out of a lower-cased title.
# _PART_REF is the original alternation, kept verbatim (only split across
# source lines) so it behaves the same on byte strings and on text strings.
_PART_REF = (
    r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ"
    r"|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ"
    r"|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể"
    r"|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể"
    r"|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ"
    r"|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ"
    r"|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+"
)
# NOTE(review): [A-Z] is applied to a lower-cased title without re.I, so
# letter-numbered chapters/sections cannot match -- confirm this is wanted.
_CHAP_REF = r"(chương)\s([A-Z]|[0-9])+"
_SEC_REF = r"(mục)\s([A-Z]|[0-9])+"
_LAW_REF = r"điều [0-9]+\w*"
_QUOTE_START = (r"(\\n(\s|\_|\.|\*|\#)*\“(.(?!\“|\”))+.{2})"
                r"|(\\n(\s|\_|\.|\*|\#)*\"(.(?!\"))+.{2})")


def _first_hit(pattern, text, flags=0):
    """Return the text of the first match of ``pattern`` in ``text``, or None."""
    found = re.search(pattern, text, flags)
    return None if found is None else found.group(0)


def _strip_prefix(reference, prefix):
    """Drop the leading ``prefix`` keyword from a matched reference (None-safe)."""
    # len(prefix) replaces the magic offset 8, which is only correct when
    # the strings are UTF-8 bytes ("khoản " / "điểm " are 8 bytes each).
    return None if reference is None else reference[len(prefix):]


def _scope_names(text):
    """Return the (part, chapter, section) references found in ``text``."""
    return (_first_hit(_PART_REF, text),
            _first_hit(_CHAP_REF, text),
            _first_hit(_SEC_REF, text))


def _targets_in_block(div_modify, title):
    """Yield (part, chapter, section, law, item, point) names for each unit of
    the parsed block ``div_modify`` whose name is mentioned in ``title``."""
    # At every level a total of 0 is walked once, as in the original.
    for part_id in range(divlaw.getTotalPart(div_modify) or 1):
        part = divlaw.getPart(div_modify, part_id)
        if part['name'] != "":
            part_name = handle_string.toLowerCase(part['name'])
            if part_name in title:
                yield (part_name, None, None, None, None, None)
                continue
        for chap_id in range(
                divlaw.getTotalChapter(div_modify, part_id) or 1):
            chap = divlaw.getChapter(div_modify, part_id, chap_id)
            if chap['name'] != "":
                chap_name = handle_string.toLowerCase(chap['name'])
                if chap_name in title:
                    yield (_first_hit(_PART_REF, title), chap_name,
                           None, None, None, None)
                    continue
            for sec_id in range(
                    divlaw.getTotalSection(div_modify, part_id, chap_id)
                    or 1):
                sec = divlaw.getSection(div_modify, part_id, chap_id, sec_id)
                if sec['name'] != "":
                    sec_name = handle_string.toLowerCase(sec['name'])
                    if sec_name in title:
                        # The chapter reference is now reported; the original
                        # stored it into chap['name'] and always emitted None.
                        yield (_first_hit(_PART_REF, title),
                               _first_hit(_CHAP_REF, title),
                               sec_name, None, None, None)
                        continue
                # Bounded by the law count (the original reused the section
                # count) and walked with its own variable so the caller's
                # ``law_index`` column is not overwritten.
                for law_pos in range(
                        divlaw.getTotalLaw(div_modify, part_id, chap_id,
                                           sec_id) or 1):
                    law = divlaw.getLaw(div_modify, part_id, chap_id, sec_id,
                                        law_pos)
                    if law['name'] != "":
                        law_name = handle_string.toLowerCase(law['name'])
                        if law_name in title:
                            yield _scope_names(title) + (law_name, None, None)
                            continue
                    for item_id in range(
                            divlaw.getTotalItem(div_modify, part_id, chap_id,
                                                sec_id, law_pos) or 1):
                        item = divlaw.getItem(div_modify, part_id, chap_id,
                                              sec_id, law_pos, item_id)
                        if item['name'] != "" and \
                                'khoản ' + item['name'] in title:
                            # Enclosing units are searched only in the text
                            # that follows the item reference.  The name is
                            # escaped so it is matched literally.
                            hit = getFirst(re.finditer(
                                r"khoản\s" + re.escape(item['name']),
                                title, re.U))
                            tail = title[hit.end():]
                            yield _scope_names(tail) + (
                                _first_hit(_LAW_REF, tail),
                                item['name'], None)
                            continue
                        for point_id in range(
                                divlaw.getTotalPoint(
                                    div_modify, part_id, chap_id, sec_id,
                                    law_pos, item_id) or 1):
                            point = divlaw.getPoint(
                                div_modify, part_id, chap_id, sec_id,
                                law_pos, item_id, point_id)
                            if point['name'] != "" and \
                                    'điểm ' + point['name'] in title:
                                hit = getFirst(re.finditer(
                                    r"điểm " + re.escape(point['name']),
                                    title, re.U))
                                tail = title[hit.end():]
                                item_ref = _first_hit(
                                    r"(?:khoản\s)[0-9]+\w*", tail)
                                yield _scope_names(tail) + (
                                    _first_hit(_LAW_REF, tail),
                                    _strip_prefix(item_ref, "khoản "),
                                    point['name'])


def _insertion_target(title):
    """Return ``(names, type_code)`` for a title matching the "vào" or
    "sau|trước" patterns.

    ``names`` is the six-element list (part, chapter, section, law, item,
    point) read from the text after the anchor word; ``type_code`` is 3 when
    "sau" occurs up to the anchor, 4 when "trước" does, otherwise 2.
    """
    start_index = 0
    # A "sau|trước" anchor, when present, overrides a "vào" anchor.
    for pattern in (r"bổ\ssung\s.+(vào).{5}",
                    r"bổ\ssung\s.+(sau|trước).{5}"):
        anchor = re.search(pattern, title, re.U)
        if anchor is not None:
            # ".{5}" only demands trailing text; step back over it.
            start_index = anchor.end() - 5
    head, tail = title[:start_index], title[start_index:]

    item_ref = _first_hit(r"(khoản\s)[0-9]+", tail)
    # The original guarded this lookup with a pattern lacking "đ" and then
    # sliced [start:start], so the point was always "" or None.  The bare
    # label is returned, like the item field and the type-1 point rows.
    point_ref = _first_hit(r"(điểm\s)[A-zđ]+", tail, re.U)
    names = list(_scope_names(tail)) + [
        _first_hit(r"điều [0-9]+[A-zĐđ]*", tail),
        _strip_prefix(item_ref, "khoản "),
        _strip_prefix(point_ref, "điểm "),
    ]

    if 'sau' in head:
        return names, 3
    if 'trước' in head:
        return names, 4
    return names, 2


def extract(law_id="text", part_index="int", chap_index="int", sec_index="int",
            law_index="int", item_index="int", point_index="int",
            numerical_symbol="text", titles="text", content="text",
            location_content="int", count="int"):
    """Yield one row per unit that an amending clause targets.

    Each row is ``[law_id, part_index, chap_index, sec_index, law_index,
    item_index, point_index, part_name, chap_name, sec_name, law_name,
    item_name, point_name, type_modify]``: the seven identifying arguments
    are copied through, followed by the names of the targeted unit (``None``
    where not found) and a type code.

    ``titles`` is lower-cased, passed through ``rewriteString`` and split by
    ``divTitle``.  A title matching ``.+vào.+`` or ``.+(sau|trước).{7,}`` is
    type 2, any other title is type 1.  ``content`` is cut into blocks at
    every quoted passage that starts after a literal ``\\n``.

    * Type 1: each block is parsed with ``divlaw.divPartModifyLaw`` and a row
      is emitted for every part/chapter/section/law/item/point of the block
      whose name occurs in the title.
    * Type 2: the target is read from the title text after the "vào" /
      "sau" / "trước" anchor; the code becomes 3 ("sau") or 4 ("trước") when
      that word precedes the target.  Once the code has left 2, the remaining
      blocks of that title are skipped.

    ``numerical_symbol``, ``location_content`` and ``count`` are accepted but
    not used.

    Fixes relative to the previous implementation: regexes with a missing
    "(" no longer raise ``re.error``; the law loop uses the law count and no
    longer overwrites ``law_index``; the chapter is reported for section
    rows; the type-2 point name is no longer an empty slice; item and point
    names are escaped before being used in a regex; every lookup scans once
    instead of twice; a dead whole-content parse before each block is gone.

    NOTE(review): the reviewed source had lost its line breaks and
    indentation.  The nesting used here -- every title is matched against
    every quoted block, each ``continue`` directly follows its ``yield``, and
    the type-2 branch is a sibling of the type-1 branch inside the block
    loop -- is a reconstruction; confirm it against the original layout.
    """
    titles = rewriteString(handle_string.toLowerCase(titles))
    source = [law_id, part_index, chap_index, sec_index, law_index,
              item_index, point_index]

    # Block boundaries depend on ``content`` only, so compute them once.
    starts = [hit.start()
              for hit in re.finditer(_QUOTE_START, content, re.DOTALL)]
    # The last block runs to the end of ``content`` (slice end of None).
    blocks = list(zip(starts, starts[1:] + [None]))

    for title in divTitle(titles):
        if re.search(r"(.+vào.+)|(.+(sau|trước).{7,})", title, re.U):
            type_modify = 2
        else:
            type_modify = 1

        for begin, end in blocks:
            if type_modify == 1:
                div_modify = divlaw.divPartModifyLaw(content[begin:end])
                for names in _targets_in_block(div_modify, title):
                    yield source + list(names) + [type_modify]
            if type_modify == 2:
                names, type_modify = _insertion_target(title)
                yield source + names + [type_modify]
def extract( law_id ="text", type_modify = "int", content = "text", numerical_symbol = "text", position = "text", released_date = "text" ): titles = getTitle(content) if type_modify == 1: titles = handle_string.toLowerCase(titles) ### titles = rewriteString(titles) if titles is None: titles = "None" yield [ law_id , position, type_modify, "1", None, None, None, None, None, None, None, None, numerical_symbol, released_date ] a = divTitle(titles) for title in a: findType = re.finditer(r"(.+vào.+)|(.+(sau|trước)[^\:]{7,})",title,re.U) if divlaw.lenIterator(findType) > 0: type_modify = 8 match = re.finditer(r"(\n(\s|\_|\.|\*|\#)*\“(.(?!\“|\”))+.{2})|(\n(\s|\_|\.|\*|\#)*\"(.(?!\"))+.{2})", content,re.DOTALL) quotesIndex = [] for i in match: quotesIndex.append(i.start()) for j in range(len(quotesIndex)) : if type_modify == 1: divModify = divlaw.divPartModifyLaw(content) if j != (len(quotesIndex) - 1): divModify = divlaw.divPartModifyLaw(content[quotesIndex[j]:quotesIndex[j+1]]) else : divModify = divlaw.divPartModifyLaw(content[quotesIndex[j]:]) totalPart = divlaw.getTotalPart(divModify) if (totalPart == 0): totalPart = 1 for part_id in range(0,totalPart): part = divlaw.getPart(divModify,part_id) if part['name'] != "": part_name = handle_string.toLowerCase(part['name']) if part_name in title: yield[ law_id , position, type_modify, part_name, None, None, None, None, None, None, None, None, numerical_symbol, released_date ] continue totalChap = divlaw.getTotalChapter(divModify,part_id) if totalChap == 0: totalChap = 1 for chap_id in range(0,totalChap): chap = divlaw.getChapter(divModify,part_id,chap_id) if chap['name'] != "": chap_name = handle_string.toLowerCase(chap['name']) if chap_name in title: part_name = None findName = re.finditer(r"(phần 
thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title) for fN in findName: part_name = title[fN.span()[0]:fN.span()[1]] break yield[ law_id , position, type_modify, part_name, chap_name, None, None, None, None, None, None, None, numerical_symbol, released_date ] continue totalSec = divlaw.getTotalSection(divModify,part_id,chap_id) if totalSec == 0: totalSec = 1 for sec_id in range(0,totalSec): sec = divlaw.getSection(divModify, part_id, chap_id,sec_id) if sec['name'] != "": sec_name = handle_string.toLowerCase(sec['name']) if sec_name in title: part_name = None findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title) for fN in findName: part_name = title[fN.span()[0]:fN.span()[1]] break chap_name = None findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title) if 
divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title) for fN in findName: chap['name'] = title[fN.span()[0]:fN.span()[1]] break yield[ law_id , position, type_modify, part_name, chap_name, sec_name, None, None, None, None, None, None, numerical_symbol, released_date ] continue totalLaw = divlaw.getTotalLaw(divModify,part_id,chap_id,sec_id) if totalLaw == 0: totalLaw = 1 for law_index in range(0,totalLaw): law = divlaw.getLaw(divModify,part_id,chap_id,sec_id,law_index) if law['name'] != "": law_name = handle_string.toLowerCase(law['name']) if law_name in title: part_name = None findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title) for fN in findName: part_name = title[fN.span()[0]:fN.span()[1]] break chap_name = None findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title) for fN in findName: chap_name = title[fN.span()[0]:fN.span()[1]] break sec_name = None findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title) for fN in findName: sec_name = title[fN.span()[0]:fN.span()[1]] break yield[ law_id , position, type_modify, part_name, chap_name, sec_name, law_name, None, None, None, None, None, numerical_symbol, released_date ] 
continue totalItem = divlaw.getTotalItem(divModify,part_id,chap_id,sec_id,law_index) if totalItem == 0: totalItem = 1 for item_id in range(0,totalItem): item = divlaw.getItem(divModify,part_id,chap_id,sec_id,law_index,item_id) if item['name'] != "": item_name = 'khoản ' + item['name'] if item_name in title: find_item_name = re.finditer(r"khoản\s"+item['name'],title,re.U) ex = getFirst(find_item_name) index_start = ex.end() part_name = None findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title[index_start:]) for fN in findName: part_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break chap_name = None findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title[index_start:]) for fN in findName: chap_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break sec_name = None findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title[index_start:]) for fN in findName: sec_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break law_name = None findName = re.finditer(r"điều [0-9]+\w*",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"điều 
[0-9]+\w*",title[index_start:]) for fN in findName: law_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break yield[ law_id , position, type_modify, part_name, chap_name, sec_name, law_name, item['name'], None, None, None, None, numerical_symbol, released_date ] continue totalPoint = divlaw.getTotalPoint(divModify,part_id,chap_id,sec_id,law_index,item_id) if totalPoint == 0: totalPoint = 1 for point_id in range(0,totalPoint): point = divlaw.getPoint(divModify,part_id,chap_id,sec_id,law_index,item_id,point_id) if point['name'] != "": point_name = 'điểm ' + point['name'] if point_name in title: find_point_name = re.finditer(r"điểm "+point['name'],title,re.U) index_start = getFirst(find_point_name).end() part_name = None findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title[index_start:]) for fN in findName: part_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break chap_name = None findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title[index_start:]) for fN in findName: chap_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break sec_name = None findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = 
re.finditer(r"(mục)\s([A-Z]|[0-9])+",title[index_start:]) for fN in findName: sec_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break law_name = None findName = re.finditer(r"điều [0-9]+\w*",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"điều [0-9]+\w*",title[index_start:]) for fN in findName: law_name = title[index_start+fN.span()[0]:index_start+fN.span()[1]] break item_name = None findName = re.finditer(r"(?:khoản\s)[0-9]+\w*",title[index_start:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(?:khoản\s)[0-9]+\w*",title[index_start:]) for fN in findName: item_name = title[index_start+8+fN.span()[0]:index_start+fN.span()[1]] break yield[ law_id , position, type_modify, part_name, chap_name, sec_name, law_name, item_name, point['name'], None, None, None, numerical_symbol, released_date ] continue if type_modify == 8: start_index = 0 ft = re.finditer(r"bổ\ssung\s.+(vào).{5}",title,re.U) for i in ft : start_index = i.end() - 5 break ft = re.finditer(r"bổ\ssung\s.+(sau|trước).{5}",title,re.U) for i in ft : start_index = i.end() - 5 break part_name = None findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title[start_index:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(phần thứ)\s([A-z]|À|Á|Â|Ã|È|É|Ê|Ì|Í|Ò|Ó|Ô|Õ|Ù|Ú|Ă|Đ|Ĩ|Ũ|Ơ|à|á|â|ã|è|é|ê|ì|í|ò|ó|ô|õ|ù|ú|ă|đ|ĩ|ũ|ơ|Ư|Ă|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ|Ẹ|Ẻ|Ẽ|Ề|Ề|Ể|ư|ă|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẹ|ẻ|ẽ|ề|ế|ể|Ễ|Ệ|Ỉ|Ị|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ|Ụ|Ủ|Ứ|Ừ|ễ|ệ|ỉ|ị|ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ụ|ủ|ứ|ừ|Ử|Ữ|Ự|Ỳ|Ỵ|Ý|Ỷ|Ỹ|ử|ữ|ự|ỳ|ỵ|ỷ|ỹ)+",title[start_index:]) for fN in findName: part_name = title[start_index+fN.span()[0]:start_index+fN.span()[1]] break chap_name = None findName = 
re.finditer(r"(chương)\s([A-Z]|[0-9])+",title[start_index:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(chương)\s([A-Z]|[0-9])+",title[start_index:]) for fN in findName: chap_name = title[start_index+fN.span()[0]:start_index+fN.span()[1]] break sec_name = None findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title[start_index:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(mục)\s([A-Z]|[0-9])+",title[start_index:]) for fN in findName: sec_name = title[start_index+fN.span()[0]:start_index+fN.span()[1]] break law_name = None findName = re.finditer(r"điều [0-9]+[A-zĐđ]*",title[start_index:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"điều [0-9]+[A-zĐđ]*",title[start_index:]) for fN in findName: law_name = title[start_index+fN.span()[0]:start_index+fN.span()[1]] break item_name = None findName = re.finditer(r"(khoản\s)[0-9]+",title[start_index:]) if divlaw.lenIterator(findName)>0 : findName = re.finditer(r"(khoản\s)[0-9]+",title[start_index:]) for fN in findName: item_name = title[start_index+fN.span()[0] + 8:start_index+fN.span()[1]] break point_name = None temp = title findName = re.finditer(r"(điểm\s)[A-z]+",title[start_index:],re.U) if divlaw.lenIterator(findName) > 0 : findName = re.finditer(r"(điểm\s)[A-zđ]+",temp[start_index:],re.U) for fN in findName: point_name = temp[start_index+fN.span()[0]:start_index+fN.span()[0]] break if 'sau' in title[:start_index]: type_modify = 9 elif 'trước' in title[:start_index]: type_modify = 10 yield[ law_id , position, type_modify, part_name, chap_name, sec_name, law_name, item_name, point_name, None, None, None, numerical_symbol, released_date ] if type_modify == 2 : t = re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(content) if(lenIterator(extract)>0): for extract in t.finditer(content): temp_law = 
re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',content[extract.span()[0]:extract.span()[1]]) if(temp_law is not None): law = temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, None, None, law, item, point, None, None, None, numerical_symbol, released_date ] else : yield[ law_id, position, type_modify, None, None, None, None, None, None, None, None, None, numerical_symbol, released_date ] if(type_modify == 3 ): p =re.compile(r'(B|b)ổ\ssung\s(cụm\s)*từ\s') for location in p.finditer(content): sub_content = content[location.span()[1]:len(content)] temp = p.finditer(sub_content) if(lenIterator(temp)>0): for temp in p.finditer(sub_content): sub_content = sub_content[0:temp.span()[0]] break temp_replace = re.search(r'(\“|\")(\s)*.+(\s)*(\”|\")\s.*sau\s(cụm\s)*từ\s',sub_content) if(temp_replace is not None): temp_from_replace = re.search(r'(\“|\")(\s)*.+(\s)*(\”|\")',temp_replace.group()) from_replace = temp_from_replace.group() temp_replace = re.search(r'sau\s(cụm\s)*từ\s(\“|\")(\s)*.+(\s)*(\”|\")',sub_content) temp_to_replace = re.search(r'(\“|\")(\s)*.+(\s)*(\”|\")',temp_replace.group()) to_replace = temp_to_replace.group() t = re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(sub_content,re.DOTALL) if(lenIterator(extract)>0): for extract in t.finditer(sub_content): temp_law = re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_law is not None): law = 
temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, None, None, law, item, point, sub_content, from_replace, to_replace, numerical_symbol, released_date ] else : yield[ law_id, position, type_modify, None, None, None, None, None, None, None, None, None, numerical_symbol, released_date ] if(type_modify == 4 ): p =re.compile(r'((t|T)hay\s)*(cụm\s)*từ\s') for location in p.finditer(content): sub_content = content[location.span()[1]:len(content)] temp = p.finditer(sub_content) if(lenIterator(temp)>0): for temp in p.finditer(sub_content): # sub_content_from : lấy cụm từ cần sửa đổi để tách sub_content_from = sub_content[0:temp.span()[1]] break temp_replace = re.search(r'(\“|\")(\s)*.+(\s)*(\”|\")\s.*(được\s)*(thay\s)*bằng\s(cụm\s)*từ',sub_content_from) if(temp_replace is not None): temp_from_replace = re.search(r'(\“|\")(\s)*.+(\s)*(\”|\")',temp_replace.group()) from_replace = temp_from_replace.group() temp_replace = re.search(r'(được\s)*(thay\s)*bằng\s(cụm\s)*từ\s(\“|\")(\s)*.+(\s)*(\”|\")',sub_content) if(temp_replace is not None): temp_to_replace = re.search(r'(\“|\")(\s)*.+(\s)*(\”|\")',temp_replace.group()) to_replace = temp_to_replace.group() t = re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(sub_content,re.DOTALL) if(lenIterator(extract)>0): for extract in t.finditer(sub_content): temp_law = re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_law is 
not None): law = temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, None, None, law, item, point, sub_content, from_replace, to_replace, numerical_symbol, released_date ] else : yield[ law_id, position, type_modify, content, None, None, None, None, None, None, None, None, numerical_symbol, released_date ] if(type_modify == 7): text_delete = re.search(r'(\“|\").+(\”|\")',content,re.M|re.I) if(text_delete is not None): # numerical_symbol = get_numerical_symbol(content) t = re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(content) if(lenIterator(extract)>0): for extract in t.finditer(content): temp_chapter = re.search(r'(c|C)hương\s(\w{1,10}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_chapter is not None): chapter = temp_chapter.group() else: chapter = None temp_law = re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',content[extract.span()[0]:extract.span()[1]]) if(temp_law is not None): law = temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = 
temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, chapter, None, law, item, point, text_delete.group(), None, None, numerical_symbol, released_date ] else : t = re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(content) if(lenIterator(extract)>0): for extract in t.finditer(content): temp_chapter = re.search(r'(c|C)hương\s(\w{1,10}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_chapter is not None): chapter = temp_chapter.group() else: chapter = None temp_law = re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',content[extract.span()[0]:extract.span()[1]]) if(temp_law is not None): law = temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, chapter, None, law, item, point, "NA", None, None, numerical_symbol, released_date ] yield[ law_id, position, type_modify, None, None, None, None, None, None, None, None, None, numerical_symbol, released_date ] if(type_modify == 5): location = 
re.search('(t|T)ên của\s.*\sđược\s((s|S)ửa đổi\,\s)*((b|B)ổ sung\s)*',content) if(location is not None): sub_content = location.group() text = re.search('(\"|\").*(\"|\")',content) t = re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(sub_content) if(lenIterator(extract)>0): for extract in t.finditer(sub_content): temp_chapter = re.search(r'(c|C)hương\s(\w{1,10}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_chapter is not None): chapter = temp_chapter.group() else: chapter = None temp_law = re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_law is not None): law = temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',sub_content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, chapter, None, law, item, point, sub_content, None, text.group(), numerical_symbol, released_date ] else : yield[ law_id, position, type_modify, None, None, None, None, None, None, None, None, None, numerical_symbol, released_date ] if(type_modify == 6): text = re.search('(\“|\"|\").*(\”|\"|\")',content) if(text is not None): t = 
re.compile(r'(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))\s(c|C)hương\s(\w{1,10}|\d{1,5})\s|(c|C)hương\s(\w{1,10}|\d{1,5})\s|(Đ|đ)iểm\s(\w{1,5}|\d{1,5})\s(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(k|K)hoản\s(\w{1,5}|\d{1,5})\s(Đ|đ)iều\s((\w{1,5})|(\d{1,5}))|(đ|Đ)iều\s((\w{1,5})|(\d{1,5}))') extract = t.finditer(content) if(lenIterator(extract)>0): for extract in t.finditer(content): temp_chapter = re.search(r'(c|C)hương\s(\w{1,10}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_chapter is not None): chapter = temp_chapter.group() else: chapter = None temp_law = re.search(r'(đ|Đ)iều\s((\d{1,5})([a-zđ]|[A-Z])?)',content[extract.span()[0]:extract.span()[1]]) if(temp_law is not None): law = temp_law.group() else : law = None temp_item = re.search(r'(Khoản|khoản)\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_item is not None): item = temp_item.group()[8:] else : item = None temp_point = re.search(r'(đ|Đ)iểm\s(\w{1,5}|\d{1,5})',content[extract.span()[0]:extract.span()[1]]) if(temp_point is not None): point = temp_point.group()[8:] else : point = None yield[ law_id, position, type_modify, None, chapter, None, law, item, point, None, None, text.group(), numerical_symbol, released_date ] else : yield[ law_id, position, type_modify, None, None, None, None, None, None, None, None, None, numerical_symbol, released_date ]
def extract(mention_id="text",
            sentence_text="text",
            tokens="text[]",
            begin_exp="int",
            end_exp="int",
            begin_explain="int",
            end_explain="int",
            sentence_source="text[]",
            position_source="text[]"):
    """Yield ``[mention_id, weight, rule_name]`` feature rows for one mention.

    The string defaults ("text", "int", "text[]") look like column-type
    declarations read by a decorator outside this view -- TODO confirm.

    Args:
        mention_id: Identifier of the mention; also parsed for a position
            prefix (see below).
        sentence_text: Not used by this function.
        tokens: Token list of the sentence.
        begin_exp, end_exp: Token indexes of the first span. ``end_exp`` is
            included when the span text is joined, but excluded by the
            ``tokens[begin_exp:end_exp]`` membership tests.
        begin_explain, end_explain: Token indexes of the second span.
        sentence_source: Candidate source sentences.
        position_source: Position string of each source sentence, parallel
            to ``sentence_source``.

    Yields:
        Lists ``[mention_id, weight, rule_name]``. Weights are -10, -1 or -4
        for the "forbidden word" rules and 1 for ``in_explain_words_law``.
        The same rule may be yielded more than once.
    """
    forbidden_word = (
        "nếu", "phải", "đó", "không", "được", "đã", "đồng_thời", "cần", "chỉ",
        'cụ_thể', 'ai', 'đây',
    )

    # Look at the first two tokens of the explain span and the last two
    # tokens of the exp span, staying inside each span's bounds.
    for offset in range(2):
        head = begin_explain + offset
        if head <= end_explain:
            if handle_string.toLowerCase(tokens[head]) in forbidden_word:
                yield [mention_id, -10, "forbidden_word_1"]
        tail = end_exp - offset
        if tail >= begin_exp:
            if handle_string.toLowerCase(tokens[tail]) in forbidden_word:
                yield [mention_id, -10, "forbidden_word_1"]

    if handle_string.toLowerCase(tokens[end_exp]) in forbidden_word:
        yield [mention_id, -1, "forbidden_word_2"]

    # Exact-token rules over the exp span (end_exp itself is excluded by
    # the slice); each rule accepts a lower-case and a capitalised form.
    exp_window = tokens[begin_exp:end_exp]
    window_rules = (
        ("nếu", "Nếu", "forbidden_word_3"),
        ("đối_với", "Đối_với", "forbidden_word_4"),
        ("trường_hợp", "Trường_hợp", "forbidden_word_5"),
    )
    for lower_form, capital_form, rule_name in window_rules:
        if (lower_form in exp_window) or (capital_form in exp_window):
            yield [mention_id, -4, rule_name]
    # NOTE: a further rule ("forbidden_word_6", weight -4, token 'là' inside
    # the explain span) existed here only as commented-out code.

    # position_require is the part of mention_id after its first '_' up to
    # and including its second-to-last '_'. Index 0 is never considered
    # when looking for the trailing underscores.
    first_sep = mention_id.find('_')
    if first_sep == -1:
        first_sep = len(mention_id)
    last_sep = mention_id.rfind('_', 1)
    prev_sep = mention_id.rfind('_', 1, last_sep) if last_sep != -1 else -1
    if prev_sep == -1:
        # Fewer than two usable underscores: the slice below is empty, and
        # the empty string is a substring of every position.
        prev_sep = 0
    position_require = mention_id[first_sep + 1:prev_sep + 1]

    # Indexing (not slicing) keeps the IndexError on out-of-range bounds.
    explain_text = " ".join(tokens[k] for k in range(begin_exp, end_exp + 1))
    if len(explain_text) < 60:
        glossary_heading = re.compile(
            r"giải(\s|\_)thích(\s|\_)từ(\s|\_)ngữ", re.U | re.I)
        for index in range(len(position_source)):
            if position_require in position_source[index]:
                if divlaw.lenIterator(
                        glossary_heading.finditer(sentence_source[index])) > 0:
                    yield [mention_id, 1, "in_explain_words_law"]
def extract(doc_id="text",
            position="text",
            modify_title="text",
            released_date="text",
            doc_id_resources="text[]",
            doc_title_resources="text[]",
            doc_symbol_resources="text[]",
            type_doc="text[]",
            released_date_resources="text[]"):
    """Resolve a referenced document to one of the known resource documents.

    If ``modify_title`` starts with a document symbol (digits, optional
    ``/digits`` groups, then ``/`` or ``-`` separated letter groups), the
    resources are searched by symbol; otherwise they are searched by
    ``type_doc[i] + " " + doc_title_resources[i]``. Both comparisons go
    through ``handle_string.toLowerCase``. A resource also has to carry the
    same release date, unless ``released_date`` is None.

    The string defaults ("text", "text[]") look like column-type
    declarations read by a decorator outside this view -- TODO confirm.

    Args:
        doc_id, position: Copied into every yielded row.
        modify_title: Symbol or title of the referenced document.
        released_date: Expected release date as ``YYYY-MM-DD``, or None.
        doc_id_resources, doc_title_resources, doc_symbol_resources,
        type_doc, released_date_resources: Parallel lists describing the
            candidate documents.

    Yields:
        At most one row ``[doc_id, position, resource_id]`` where
        ``resource_id`` is the first matching resource id, or ``"NA"`` when
        nothing matches. Nothing is yielded when ``released_date`` is set
        but cannot be parsed; the offending value is written to stderr.
    """
    date_format = "%Y-%m-%d"

    def parse_date(raw):
        # Parse one date; on failure log "<doc_id> <position> <raw>" to
        # stderr and return None so the caller can skip the row.
        try:
            return time.strptime(raw, date_format)
        except (ValueError, TypeError):
            sys.stderr.write("%s %s %s\n" % (doc_id, position, raw))
            return None

    released_date_temp = None
    if released_date:
        released_date_temp = parse_date(released_date)
        if released_date_temp is None:
            return

    def same_release(candidate_date):
        # A missing expected date (None) accepts any parsable resource date.
        return candidate_date == released_date_temp or released_date is None

    pattern = re.compile(r"[0-9]+(/[0-9]+)*((/|-)[A-ZĐƯ]+[0-9]*)+")
    m = pattern.match(modify_title)
    if m is not None:
        # Symbol branch.
        symbol = handle_string.toLowerCase(m.group(0))
        for i in range(len(doc_symbol_resources)):
            candidate_date = parse_date(released_date_resources[i])
            if candidate_date is None:
                continue
            if (handle_string.toLowerCase(doc_symbol_resources[i]) == symbol
                    and same_release(candidate_date)):
                yield [doc_id, position, doc_id_resources[i]]
                return
        yield [doc_id, position, "NA"]
        return

    # Title branch.
    wanted_title = handle_string.toLowerCase(
        handle_string.to_unicode(modify_title)).strip()
    for i in range(len(doc_title_resources)):
        candidate_title = handle_string.toLowerCase(
            handle_string.to_unicode(type_doc[i] + " " + doc_title_resources[i]))
        candidate_date = parse_date(released_date_resources[i])
        if candidate_date is None:
            # Skip rows with an unparsable date, as the symbol branch does.
            # Previously the comparison below ran with an unbound date on
            # the first row, or with the previous row's date afterwards.
            continue
        if candidate_title.strip() == wanted_title and same_release(candidate_date):
            yield [doc_id, position, doc_id_resources[i]]
            return
    yield [doc_id, position, "NA"]
def extract(
        mention_id="text",
        sentence_text="text",
        tokens="text[]",
        begin_exp="int",
        end_exp="int",
        begin_explain="int",
        end_explain="int",
        sentence_source="text[]",
        position_source="text[]"
):
    """Yield ``[mention_id, weight, rule_name]`` feature rows for one mention.

    Rules, in emission order:
      * ``forbidden_word_1`` (-10): a stop word among the tokens at
        ``end_exp + 2`` / ``end_exp + 3`` (while not past ``end_explain``)
        or at ``end_exp`` / ``end_exp - 1`` (while not before ``begin_exp``).
      * ``forbidden_word_2`` (-1): the token at ``end_exp`` is a stop word.
      * ``forbidden_word_3`` (-1): "nếu"/"Nếu" occurs in
        ``tokens[begin_exp:end_exp]``.
      * ``in_explain_words_law`` (1): once per source sentence whose
        position contains the prefix derived from ``mention_id`` and whose
        text matches the glossary-heading pattern.

    ``sentence_text`` is not used. The string defaults look like column-type
    declarations read by a decorator outside this view -- TODO confirm.
    """
    stop_words = (
        "nếu", "phải", "đó", "không", "được", "đã", "đồng_thời", "cần",
        "chỉ", 'cụ_thể',
    )

    for step in (0, 1):
        after = end_exp + 2 + step
        if after <= end_explain:
            if handle_string.toLowerCase(tokens[after]) in stop_words:
                yield [mention_id, -10, "forbidden_word_1"]
        before = end_exp - step
        if before >= begin_exp:
            if handle_string.toLowerCase(tokens[before]) in stop_words:
                yield [mention_id, -10, "forbidden_word_1"]

    if handle_string.toLowerCase(tokens[end_exp]) in stop_words:
        yield [mention_id, -1, "forbidden_word_2"]

    window = tokens[begin_exp:end_exp]
    if "nếu" in window or "Nếu" in window:
        yield [mention_id, -1, "forbidden_word_3"]

    # Slice bounds: just after the first '_' up to and including the
    # second-to-last '_' (index 0 is never considered for the latter).
    start = mention_id.find('_')
    if start == -1:
        start = len(mention_id)
    rightmost = mention_id.rfind('_', 1)
    stop = mention_id.rfind('_', 1, rightmost) if rightmost != -1 else -1
    if stop == -1:
        stop = 0
    position_require = mention_id[start + 1:stop + 1]

    heading = re.compile(r"Giải_thích\stừ_ngữ", re.U | re.I)
    for idx, source_position in enumerate(position_source):
        if position_require not in source_position:
            continue
        if divlaw.lenIterator(heading.finditer(sentence_source[idx])) > 0:
            yield [mention_id, 1, "in_explain_words_law"]