def extract_people(content):
    """Return the largest head-count mentioned in a Chinese text.

    Newlines are removed from ``content`` and two searches are run:

    * an Arabic-digit count followed by one of the people-related characters
      and five more characters of context;
    * a Chinese-numeral count (including the approximate forms 上百, 上千 and
      上万) followed, within four characters, by one of the same characters.

    In both snippets, fragments about ages, money, population, etc. are
    deleted before the number is read.

    Args:
        content: The text to scan.

    Returns:
        The largest count found that does not exceed 999999, as an ``int``,
        or the string ``"NA"`` when no usable count is found (including when
        ``content`` is not a string).
    """
    max_people = 999999
    # Approximate expressions are mapped to their lower bound.
    approximate = {"上百": "100", "上千": "1000", "上万": "10000"}
    # NOTE(review): this is a character class, so every character in it
    # (including "|") matches on its own; multi-character words such as 业主
    # are not matched as units. Kept as-is to preserve existing matches.
    people_chars = ("[民|名|户|人|业主|司机|开发商|投资者|朋友|民众|父母|老|学生|家|"
                    "工友|工人|工民|师]")
    # Fragments that are not head-counts (ages, population, money, wages...).
    noise = (r".{0,2}岁|.{0,4}人口|.{0,4}元|.{0,4}人民币|.{0,4}金|.{0,4}工程|"
             r".{0,4}块|.{0,4}钱|.{0,4}工资")

    try:
        text = re.sub("\n", "", content)
    except TypeError:
        # Non-string input never produced a count before either.
        return "NA"

    # --- Arabic-digit counts -------------------------------------------
    arabic = None
    found = re.search(
        r"[1|2|3|4|5|6|7|8|9]\d{0,1}0{0,6}" + people_chars + ".{5}", text)
    if found is not None:
        snippet = re.sub(r"\d{1,10}人民警察", "", found.group(0))
        snippet = re.sub(noise, "", snippet)
        digits = re.search(r"\d{2,8}", snippet)
        if digits is not None:
            arabic = digits.group(0)

    # --- Chinese-numeral counts ----------------------------------------
    word = None
    found = re.search(
        "[一|二|两|三|四|五|六|七|八|九|上][十|百|千|万].{0,4}" + people_chars,
        text)
    if found is not None:
        phrase = re.sub(noise, "", found.group(0))
        # Check the approximate forms first: previously they were only
        # consulted after cn2num() succeeded, so a cn2num() failure on a
        # phrase such as "上百名" discarded the count entirely.
        for marker, value in approximate.items():
            if marker in phrase:
                word = value
        if word is None:
            try:
                word = cn2num(phrase)
            except Exception:
                # cn2num is an external converter; any failure simply means
                # this phrase yields no count.
                word = None
        if isinstance(word, str):
            digits = re.search(r"\d{2,8}", word)
            word = digits.group(0) if digits is not None else None

    candidates = [
        int(value) for value in (word, arabic)
        if value is not None and int(value) <= max_people
    ]
    return max(candidates) if candidates else "NA"
def dateList(self, fileno):
    """Collect the dates found in the ``<time>`` elements of one file.

    The markup returned by ``self.tag(fileno)`` is parsed with BeautifulSoup
    (``'lxml'`` parser). The text of every ``<time>`` element is split by
    ``regex_date2`` into ``(era, year, month, day)`` and the Chinese numerals
    are converted with ``cn2num``.

    Encoding of the converted fields:

    * year: ``1`` when it starts with 元年, ``''`` when empty, otherwise the
      converted number;
    * month: ``''`` when nothing is known; a season suffix on the year gives
      2, 5, 8 or 11 (春, 夏, 秋, 冬); an explicit month overrides that; a leap
      month (閏) is the month number plus 0.5, except 閏正月/閏一月, which is
      recorded as ``1``;
    * day: converted only when it ends in 日, otherwise left as matched.

    Returns:
        A list of ``[era, year, month, day]`` lists, one per ``<time>``
        element, in document order.
    """
    season_chars = '春夏秋冬'
    dates = []
    document = BeautifulSoup(self.tag(fileno), 'lxml')

    for time_tag in document.find_all('time'):
        era, year, month, day = list(regex_date2.findall(time_tag.text)[0])
        year_value = ''
        month_value = ''

        # Year, possibly carrying a season that implies a month.
        if year.startswith('元年'):
            year_value = 1
        elif year != '':
            if year[-1] in season_chars:
                # Drop the trailing 年 + season character before converting.
                year_value = cn2num(year[:-2])
                # 春 -> 2, 夏 -> 5, 秋 -> 8, 冬 -> 11
                month_value = 3 * season_chars.index(year[-1]) + 2
            else:
                year_value = cn2num(year[:-1])

        # Explicit month; an empty month leaves the season-derived value.
        if month != '':
            if month.startswith('閏'):
                leap_numeral = month[1:-1]  # strip leading 閏 and trailing 月
                if leap_numeral in ('正', '一'):
                    month_value = 1
                else:
                    month_value = cn2num(leap_numeral) + 0.5
            elif month.startswith('正'):
                # 正月 is the same as 一月
                month_value = 1
            else:
                month_value = cn2num(month)

        if day.endswith('日'):
            day = cn2num(day[:-1])

        dates.append([era, year_value, month_value, day])

    return dates
def new_cn2arab(query):
    """Replace runs of Chinese numerals in ``query`` with Arabic numbers.

    Characters found in the module-level ``digit_list`` are gathered into
    runs; each run is converted with ``pycnnum.cn2num`` and written back in
    place. All other characters are copied through unchanged.

    Special cases:

    * ``点`` joins a run only when it sits between two ``digit_list``
      characters (a decimal point); otherwise it is copied literally.
    * A numeral whose two-character sequence with the following character is
      in the module-level ``skip_gram`` is copied literally.

    Literal characters are always written *after* the pending numeral run has
    been converted, so the output keeps the input order. (Previously a
    literal ``点`` or skip-gram character could be emitted before the number
    that preceded it, a leading ``点`` looked at the last character of the
    string as its predecessor, and a trailing ``点`` was dropped.)

    Args:
        query: The string to convert.

    Returns:
        ``float(query)`` when ``query`` consists only of digits, ``query``
        itself when it is empty, otherwise the converted string.
    """
    if query.isdigit():
        return float(query)
    if len(query) == 0:
        return query

    pieces = []   # output fragments, already converted to str
    pending = []  # characters of the numeral run currently being collected

    def flush_pending():
        # Convert the collected numeral run (if any) and emit it.
        if pending:
            pieces.append(str(pycnnum.cn2num(''.join(pending))))
            pending.clear()

    last = len(query) - 1
    for i, char in enumerate(query):
        if char not in digit_list:
            flush_pending()
            pieces.append(char)
            continue

        if char == '点':
            # Only a 点 with a numeral on both sides is a decimal point; the
            # bounds check avoids wrapping around to query[-1] at i == 0.
            is_decimal_point = (
                0 < i < last
                and query[i - 1] in digit_list
                and query[i + 1] in digit_list
            )
            if is_decimal_point:
                pending.append(char)
            else:
                flush_pending()
                pieces.append(char)
            continue

        if i < last and char + query[i + 1] in skip_gram:
            flush_pending()
            pieces.append(char)
            continue

        pending.append(char)

    flush_pending()
    return "".join(pieces)
def ToNum(s):
    """Convert a numeric string to a number.

    Thousands separators (``,``) are removed first. The string is then tried,
    in order, as an integer literal, as a float literal (truncated towards
    zero with ``int``), and finally as a Chinese numeral via
    ``pycnnum.cn2num``.

    Parsing as an integer first keeps large integer strings exact; going
    through ``float`` alone would round them to the nearest double.

    Args:
        s: The string to convert.

    Returns:
        An ``int`` for Arabic-digit input, otherwise whatever
        ``pycnnum.cn2num`` returns.

    Raises:
        Whatever ``pycnnum.cn2num`` raises when the string is not a number
        in any of the supported forms.
    """
    s = s.replace(',', '')
    try:
        return int(s)
    except ValueError:
        pass
    try:
        return int(float(s))
    except (ValueError, OverflowError):
        # ValueError: not a float literal (or NaN); OverflowError: infinity.
        pass
    return pycnnum.cn2num(s)
def get_number_from_word_zh(word, improve_number_extraction):
    """Try to read ``word`` as a number.

    Surrounding punctuation (except ``-``) is stripped and commas are removed,
    then the following interpretations are tried in order:

    1. ``word_to_num(word)``;
    2. ``int(word)``;
    3. ``float(word)``;
    4. ``pycnnum.cn2num(word)`` when ``isAllNumber(word)`` is true;
    5. only when ``improve_number_extraction`` is true: digit strings with a
       suffix -- ordinals ("1st", "2nd", "3rd", "19th"), decades ("1960s"),
       densities ("73/km²", "3057.4/km2") and monthly rates
       ("1050.95/month").

    Step 5 is now reached for every word that steps 1-4 could not read.
    Previously a word for which ``isAllNumber`` was false returned ``None``
    immediately, so the suffix heuristics were effectively unreachable.

    Args:
        word: The token to interpret.
        improve_number_extraction: Enables the suffix heuristics of step 5.

    Returns:
        The number, or ``None`` when the word cannot be interpreted.
    """
    # Keep '-' so that a leading minus sign survives the strip.
    punctuation = string.punctuation.replace('-', '')
    word = word.strip(punctuation)
    word = word.replace(",", "")

    for parse in (word_to_num, int, float):
        try:
            return parse(word)
        except ValueError:
            continue

    try:
        if isAllNumber(word):
            return pycnnum.cn2num(word)
    except ValueError:
        pass

    if not improve_number_extraction:
        return None

    # Ordinals; many "<digits>th" occurrences refer to centuries
    # (e.g. "the *19th* century").
    if re.match(r'^\d*(?:1st|2nd|3rd)$', word) or re.match(r'^\d+th$', word):
        return int(word[:-2])

    # Decades, e.g. "1960s". Other digit sequences ending in "s" are usually
    # proper names such as model numbers, so only "...0s" is accepted.
    if len(word) > 1 and word[-2] == '0' and re.match(r'^\d+s$', word):
        return int(word[:-1])

    # Per square kilometre, e.g. "73/km²" or "3057.4/km2".
    if len(word) > 4 and re.match(r'^\d+(\.?\d+)?/km[²2]$', word):
        quantity = word[:-4]
        return float(quantity) if '.' in word else int(quantity)

    # Per month, e.g. "1050.95/month".
    if len(word) > 6 and re.match(r'^\d+(\.?\d+)?/month$', word):
        quantity = word[:-6]
        return float(quantity) if '.' in word else int(quantity)

    return None
def _text_to_digit(self, word):
    """Convert a number word to a digit string.

    ``〇`` and ``零`` map to ``'0'`` directly. Otherwise the word is passed to
    ``cn2num``; a non-zero result is returned as a string. If that fails or
    yields zero, the lower-cased word is handed to ``self.t2d.convert``.

    Conversion failures are caught with ``except Exception`` rather than a
    bare ``except`` so that ``KeyboardInterrupt`` and ``SystemExit`` are no
    longer swallowed.

    Args:
        word: The word to convert.

    Returns:
        The converted value, or ``None`` when neither converter succeeds.
    """
    try:
        if word in ('〇', '零'):
            return '0'
        num = cn2num(word)
        # A zero result is not trusted here (the explicit zero characters
        # were handled above); fall through to the second converter.
        if num != 0:
            return str(num)
    except Exception:
        pass

    try:
        return self.t2d.convert(word.lower())
    except Exception:
        return None
def convert_index(content):
    """Extract a numeric index from ``content``.

    ``num_re`` is tried first; on a match its ``num`` group is returned as an
    ``int``. Otherwise, if ``cnnum_re`` matches, its ``num`` group replaces
    ``content``. The resulting text is converted with ``cn2num``.

    Args:
        content: The text containing the index.

    Returns:
        The index, or ``0`` when ``cn2num`` cannot convert the text.
    """
    match = num_re.search(content)
    if match:
        return int(match.group("num"))

    match = cnnum_re.search(content)
    if match:
        content = match.group("num")

    try:
        return cn2num(content)
    except Exception:
        # Best effort: unparseable text counts as index 0. ``Exception``
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return 0
def IsNum(s):
    """Tell whether ``s`` is a string that denotes a number.

    Commas are removed, then the string is accepted when it is a finite float
    literal, or when ``pycnnum.cn2num`` converts it to a non-zero value.

    Strings that ``float`` parses to NaN or infinity ("nan", "inf", "1e999")
    are rejected: they are not usable numbers and the sibling ``ToNum``
    cannot convert them.

    Args:
        s: The candidate value.

    Returns:
        ``True`` or ``False``. Empty, ``None`` and non-string values give
        ``False``.
    """
    import math

    if not s:
        return False

    try:
        s = s.replace(',', '')
    except (AttributeError, TypeError):
        # Not a string-like value.
        return False

    try:
        value = float(s)
    except (ValueError, TypeError):
        pass
    else:
        return math.isfinite(value)

    try:
        return pycnnum.cn2num(s) != 0
    except Exception:
        # The external converter's failure modes are not known here; any
        # failure means "not a number".
        return False
def get_synonyms(word_out, line):
    """Expand the keywords of a question into all related labels.

    The three collections returned by ``replace()`` drive the expansion:

    * a keyword found in the first collection that contains 近 ("the last x
      years") becomes ``'>=<2018 - x>年'``; one that contains 今年 becomes
      ``'2018年'``;
    * a keyword found in the second collection gets 年 appended;
    * any other keyword contributes every member of each group (third
      collection) that contains it.

    Year keywords are also rewritten in place inside ``word_out``.

    Args:
        word_out: List of keywords; modified in place for year keywords.
        line: Not used by this function.

    Returns:
        A tuple ``(year_info, labels)``: ``year_info`` holds the last year
        expression under the key ``'year'`` (empty if none was seen) and
        ``labels`` is the list of all related labels.
    """
    near_year_terms, year_terms, keyword_groups = replace()

    year_info = {}
    labels = []

    def substitute(original, replacement):
        # Rewrite the keyword inside word_out and record the year expression.
        word_out[word_out.index(original)] = replacement
        labels.append(replacement)
        year_info['year'] = replacement

    for token in word_out:
        if token in near_year_terms:
            if '近' in token:
                digits = re.sub(r'\D', '', token)
                # Without Arabic digits the count is the Chinese numeral that
                # follows 近.
                span = int(digits) if digits else cn2num(token[1])
                # Turn '近x年' into '>=20xx年'.
                substitute(token, '>=' + str(2018 - span) + '年')
            elif '今年' in token:
                substitute(token, '2018年')
        elif token in year_terms:
            substitute(token, token + '年')
        else:
            for group in keyword_groups:
                if token in group:
                    labels.extend(group)

    return year_info, labels
def metainfo(self, exceptions=("JiuWudaishi", "Sanguozhi")):
    """
    Produces one entry per page with file number (for reference), book title
    in Chinese, section (e.g., biography, annals, treatises), and scroll
    number. Saves the entries to `self.flat_meta`.
    Designed for standard histories; might be adjusted for other purposes.

    Each path in `self.paths` is split on "/"; components 2, 3 and 4 give the
    title, the section and the scroll label. The text before the first
    ideographic space (U+3000) is kept for section and scroll, 卷 is removed
    from the scroll and the remaining Chinese numeral is converted with
    `pycnnum.cn2num`.

    For books named in `exceptions`, the section is prefixed with "傳: ",
    "紀: " or "志: " when the scroll label contains that character (checked
    in this order).

    A page whose path is missing or too short gets `None` instead of an
    entry; the IndexError is printed.

    Args:
        exceptions: Book names whose sections get the scroll-kind prefix.
    """
    print(self.bookname)
    if self.paths == []:
        self.extract_paths()

    section_markers = ("傳", "紀", "志")
    tag_sections = self.bookname in exceptions

    self.flat_meta = []
    for idx, _page in enumerate(self.flat_bodies):
        try:
            parts = self.paths[idx].split("/")
            title = parts[2]
            section = parts[3].split("\u3000")[0]
            scroll_label = parts[4]
            scroll_numeral = re.sub("卷", "", scroll_label.split("\u3000")[0])
            scroll_number = pycnnum.cn2num(scroll_numeral)
            fileno = str(idx).zfill(4)

            if tag_sections:
                for marker in section_markers:
                    if marker in scroll_label:
                        section = marker + ": " + section
                        break

            self.flat_meta.append([fileno, title, section, scroll_number])
        except IndexError as error:
            print(error)
            self.flat_meta.append(None)  # changed from "N/A"

    print(len(self.flat_meta))
def metainfo(self, exceptions=("JiuWudaishi", "Sanguozhi")):
    """
    Produces one entry per page with file number (for reference), book title
    in Chinese, section (e.g., biography, annals, treatises), and scroll
    number. Saves the entries to `self.flat_meta`.
    Designed for standard histories; might be adjusted for other purposes.

    Each path in `self.paths` is split on "/"; components 2 and 3 give the
    title and the section, component 4 the scroll label. For "JiuWudaishi" a
    path with exactly seven components takes the scroll label from component
    5 instead. The text before the first ideographic space (U+3000) is kept
    for section and scroll, 卷 is removed from the scroll and the remaining
    Chinese numeral is converted with `pycnnum.cn2num`.

    Scroll numerals of the form 一百一 ... 一百九 are rewritten to
    一百零一 ... 一百零九 before conversion (provisional solution for some
    irregular numbers).

    For "Sanguozhi" and "JiuWudaishi" the section is prefixed with "傳: ",
    "紀: " or "志: " when the scroll label contains that character (checked
    in this order).

    A page whose path is missing or too short gets `None` instead of an
    entry; the IndexError is printed.

    Args:
        exceptions: Kept for compatibility; the special-cased books are
            selected by name in the body and this argument is not consulted.
    """
    print(self.bookname)
    if self.paths == []:
        self.extract_paths()

    units = "一二三四五六七八九"
    # 一百一 -> 一百零一, ..., 一百九 -> 一百零九
    irregular_numerals = {"一百" + unit: "一百零" + unit for unit in units}
    section_markers = ("傳", "紀", "志")
    is_jiuwudaishi = self.bookname == "JiuWudaishi"
    tag_sections = self.bookname == "Sanguozhi" or is_jiuwudaishi

    self.flat_meta = []
    for idx, _page in enumerate(self.flat_bodies):
        try:
            parts = self.paths[idx].split("/")
            title = parts[2]
            section = parts[3].split("\u3000")[0]
            if is_jiuwudaishi and len(parts) == 7:
                # handling an exception in JiuWudaishi
                scroll_label = parts[5]
            else:
                scroll_label = parts[4]

            scroll_numeral = re.sub("卷", "", scroll_label.split("\u3000")[0])
            scroll_numeral = irregular_numerals.get(scroll_numeral,
                                                    scroll_numeral)
            scroll_number = pycnnum.cn2num(scroll_numeral)
            fileno = str(idx).zfill(4)

            if tag_sections:
                for marker in section_markers:
                    if marker in scroll_label:
                        section = marker + ": " + section
                        break

            self.flat_meta.append([fileno, title, section, scroll_number])
        except IndexError as error:
            print(error)
            self.flat_meta.append(None)  # changed from "N/A"

    print(len(self.flat_meta))