Exemplo n.º 1
0
def extract_people(content):
    """Estimate the largest head-count mentioned in a piece of Chinese text.

    Two independent, best-effort passes are made over ``content`` with its
    newlines removed:

    1. Arabic digits: the first match of a digit run followed by a
       people-like character (e.g. ``300人``) plus five trailing characters
       is taken, ``<digits>人民警察`` and phrases about age, money,
       population, etc. are stripped from it, and the first 2-8 digit number
       that is left is kept.
    2. Chinese numerals: the first match such as ``三百名`` / ``上百人`` is
       cleaned the same way and handed to ``cn2num``. If the cleaned phrase
       contains 上百 / 上千 / 上万, the result is replaced by "100" / "1000" /
       "10000".

    Args:
        content: The text to scan. Anything that is not a ``str`` simply
            yields ``"NA"``.

    Returns:
        The larger of the two candidates as an ``int`` (candidates above
        999999 are discarded), or the string ``"NA"`` if neither pass
        produced a number.
    """
    # Vague quantifiers and the value they are approximated by. Iteration
    # order matters: when several occur in the phrase, the last one wins.
    vague_amounts = {"上百": "100", "上千": "1000", "上万": "10000"}

    # NOTE(review): this is a character class, not an alternation, so it
    # matches ANY single listed character (including "|"), e.g. "主" alone.
    # Presumably "(?:民|名|...)" was intended; left untouched because changing
    # it would alter which texts match.
    noun_class = "[民|名|户|人|业主|司机|开发商|投资者|朋友|民众|父母|老|学生|家|工友|工人|工民|师]"
    # Phrases whose numbers are not head-counts (ages, money, population...).
    noise = ".{0,2}岁|.{0,4}人口|.{0,4}元|.{0,4}人民币|.{0,4}金|.{0,4}工程|.{0,4}块|.{0,4}钱|.{0,4}工资"

    text = None
    arabic = None
    word = None

    # Pass 1: Arabic digits.
    try:
        text = re.sub("\n", "", content)
        hit = re.search(
            r"[1|2|3|4|5|6|7|8|9]\d{0,1}0{0,6}" + noun_class + ".{5}", text)
        snippet = re.sub(r"\d{1,10}人民警察", "", hit.group(0))
        snippet = re.sub(noise, "", snippet)
        arabic = re.search(r"\d{2,8}", snippet).group(0)
    except (TypeError, AttributeError):
        # TypeError: content is not a string; AttributeError: no match.
        pass

    # Pass 2: Chinese numerals.
    try:
        hit = re.search(
            "[一|二|两|三|四|五|六|七|八|九|上][十|百|千|万].{0,4}" + noun_class, text)
        phrase = re.sub(noise, "", hit.group(0))

        word = cn2num(phrase)
        for vague, approximation in vague_amounts.items():
            if vague in phrase:
                word = approximation

        # When ``word`` is still the numeric result of cn2num, re.search
        # raises TypeError and the number is kept as-is; only the string
        # approximations above actually go through the digit extraction.
        word = re.search(r"\d{2,8}", word)
        word = word.group(0)
    except Exception:
        # Deliberately broad: cn2num is external and its failure modes are
        # not visible here; any failure just means "no candidate".
        pass

    candidates = []
    if word is not None and int(word) <= 999999:
        candidates.append(int(word))
    if arabic is not None and int(arabic) <= 999999:
        candidates.append(int(arabic))

    return max(candidates) if candidates else "NA"
Exemplo n.º 2
0
    def dateList(self, fileno):
        """Return ``[era, year, month, day]`` for every <time> tag of a file.

        The markup returned by ``self.tag(fileno)`` is parsed with
        BeautifulSoup and the text of each ``<time>`` element is split into
        its components with the first match of ``regex_date2``.

        * year: ``1`` when it starts with 元年, ``''`` when absent, otherwise
          the numeral converted with ``cn2num`` after dropping the trailing
          character (two characters when the year ends in a season).
        * month: a trailing season on the year gives 2 / 5 / 8 / 11 for
          春 / 夏 / 秋 / 冬; an explicit month overrides that. 正月 is 1, and
          an intercalary (閏) month is its number plus 0.5.
        * day: converted with ``cn2num`` when it ends with 日, otherwise left
          as matched.

        NOTE(review): 閏正月 / 閏一月 yields 1 rather than 1.5, unlike every
        other intercalary month - confirm whether that is intended.
        """
        seasons = '春夏秋冬'
        dates = []
        parsed = BeautifulSoup(self.tag(fileno), 'lxml')

        for time_tag in parsed.find_all('time'):
            era, year, month, day = regex_date2.findall(time_tag.text)[0]

            # --- year (may also imply a month through a season suffix) ---
            month0 = ''
            if year == '':
                year0 = ''
            elif year.startswith('元年'):
                year0 = 1
            elif year[-1] in seasons:
                # strip the trailing 年 + season before converting
                year0 = cn2num(year[:-2])
                # 春 -> 2, 夏 -> 5, 秋 -> 8, 冬 -> 11
                month0 = 3 * seasons.index(year[-1]) + 2
            else:
                year0 = cn2num(year[:-1])

            # --- month (an explicit month overrides the season-derived one) ---
            if month.startswith('閏'):
                leap_numeral = month[1:-1]  # drop leading 閏 and trailing 月
                if leap_numeral in ['正', '一']:
                    month0 = 1
                else:
                    month0 = cn2num(leap_numeral) + 0.5
            elif month.startswith('正'):
                # 正月 is the same as 一月
                month0 = 1
            elif month != '':
                month0 = cn2num(month)

            # --- day ---
            if day.endswith('日'):
                day = cn2num(day[:-1])

            dates.append([era, year0, month0, day])

        return dates
Exemplo n.º 3
0
def new_cn2arab(query):
    """Replace runs of Chinese numerals inside ``query`` with Arabic numbers.

    Characters found in the module-level ``digit_list`` are collected into a
    run; when the run ends it is converted with ``pycnnum.cn2num`` and the
    result is spliced back between the surrounding literal characters.

    Special cases:

    * ``点`` only joins a run when it sits between two numeral characters
      (i.e. it acts as a decimal point); otherwise it is kept as text.
    * A numeral character that forms a two-character entry of the
      module-level ``skip_gram`` together with its successor is kept as
      text.

    Literal characters always terminate the pending run first, so the output
    keeps the character order of the input.

    Args:
        query: The string to convert.

    Returns:
        ``float(query)`` when ``query`` consists of digits only, ``query``
        itself when it is empty, otherwise the converted text as ``str``.
    """
    if query.isdigit():
        return float(query)

    if len(query) == 0:
        return query

    result = []
    pending = []  # numeral characters of the run currently being collected

    def flush_pending():
        # Convert the collected run (if any) and emit it before anything else
        # is appended, so literals never overtake the number preceding them.
        if pending:
            result.append(pycnnum.cn2num(''.join(pending)))
            del pending[:]

    last = len(query) - 1
    for i, char in enumerate(query):
        if char not in digit_list:
            flush_pending()
            result.append(char)
            continue

        if char == '点':
            # Only a 点 strictly inside the string can have both neighbours;
            # at index 0 ``query[i - 1]`` would wrap around to the last
            # character, and at the end there is no successor at all.
            is_decimal_point = (
                0 < i < last
                and query[i - 1] in digit_list
                and query[i + 1] in digit_list
            )
            if is_decimal_point:
                pending.append(char)
            else:
                flush_pending()
                result.append(char)
            continue

        if i < last and char + query[i + 1] in skip_gram:
            # Part of a fixed expression, not a number: keep it verbatim.
            flush_pending()
            result.append(char)
            continue

        pending.append(char)

    flush_pending()
    return "".join(str(piece) for piece in result)
def ToNum(s):
    """Convert a numeric string to a number.

    Thousands separators (``,``) are removed first. The text is then read as
    an integer; failing that as a float, which is truncated towards zero;
    failing that it is passed to ``pycnnum.cn2num``.

    Args:
        s: The string to convert.

    Returns:
        An ``int`` for Arabic-digit input, otherwise whatever
        ``pycnnum.cn2num`` returns.

    Raises:
        Whatever ``pycnnum.cn2num`` raises when the text is not a number it
        understands.
    """
    s = s.replace(',', '')
    try:
        # Parse integers directly: going through float() first would lose
        # precision for values above 2**53.
        return int(s)
    except ValueError:
        pass
    try:
        return int(float(s))
    except (ValueError, OverflowError):
        # ValueError: not a float literal (or "nan");
        # OverflowError: "inf" cannot be converted to int.
        pass
    return pycnnum.cn2num(s)
Exemplo n.º 5
0
def get_number_from_word_zh(word, improve_number_extraction):
    """Try to interpret a single token as a number.

    Surrounding punctuation (except ``-``) and all commas are removed, then
    the following interpretations are attempted in order, each one only when
    the previous raised ``ValueError``:

    1. ``word_to_num(word)``
    2. ``int(word)``
    3. ``float(word)``
    4. ``pycnnum.cn2num(word)`` - only if ``isAllNumber(word)`` is truthy;
       otherwise ``None`` is returned immediately.
    5. If ``improve_number_extraction`` is set: ordinals (``21st``, ``2nd``,
       ``3rd``, ``19th``), decades (``1960s``) and per-unit figures
       (``73/km²``, ``1050.95/month``).

    NOTE(review): step 5 is only reached when step 4 raises ``ValueError``,
    i.e. never for tokens that ``isAllNumber`` rejects. Confirm that tokens
    such as "19th" can actually get this far.

    Args:
        word: The token to interpret.
        improve_number_extraction: Enables the pattern-based fallbacks of
            step 5.

    Returns:
        The number, or ``None`` when the token is not recognised.
    """
    punctuation = string.punctuation.replace('-', '')
    word = word.strip(punctuation)
    word = word.replace(",", "")

    try:
        return word_to_num(word)
    except ValueError:
        pass

    try:
        return int(word)
    except ValueError:
        pass

    try:
        return float(word)
    except ValueError:
        pass

    try:
        if not isAllNumber(word):
            return None
        return pycnnum.cn2num(word)
    except ValueError:
        pass

    if not improve_number_extraction:
        return None

    # Ordinals: "<digits>1st", "<digits>2nd", "<digits>3rd", "<digits>th".
    # Many "th" occurrences refer to centuries (e.g. "the *19th* century").
    ordinal_patterns = (r'^\d*1st$', r'^\d*2nd$', r'^\d*3rd$', r'^\d+th$')
    if any(re.match(pattern, word) for pattern in ordinal_patterns):
        return int(word[:-2])

    # Decades, e.g. "1960s". Other digit sequences ending in "s" do not seem
    # to be arithmetically related; they are usually proper names such as
    # model numbers.
    if len(word) > 1 and word[-2] == '0' and re.match(r'^\d+s$', word):
        return int(word[:-1])

    # Per-unit figures: (length of the unit suffix, pattern), e.g. "73/km²",
    # "3057.4/km2" or "1050.95/month".
    per_unit = (
        (4, r'^\d+(\.?\d+)?/km[²2]$'),
        (6, r'^\d+(\.?\d+)?/month$'),
    )
    for suffix_len, pattern in per_unit:
        if len(word) > suffix_len and re.match(pattern, word):
            amount = word[:-suffix_len]
            return float(amount) if '.' in word else int(amount)

    return None
Exemplo n.º 6
0
    def _text_to_digit(self, word):
        try:
            if word == '〇' or word == '零':
                return '0'
            num = cn2num(word)
            if num != 0:
                return str(num)
        except:
            pass

        try:
            return self.t2d.convert(word.lower())
        except:
            return None
Exemplo n.º 7
0
def convert_index(content):
    """Extract an index number from ``content``.

    If ``num_re`` matches, its ``num`` group is returned as an ``int``.
    Otherwise, if ``cnnum_re`` matches, its ``num`` group is converted with
    ``cn2num``; if neither matches, ``content`` itself is passed to
    ``cn2num``.

    Returns:
        The extracted number, or ``0`` when ``cn2num`` fails.
    """
    match = num_re.search(content)
    if match:
        return int(match.groupdict()["num"])

    match = cnnum_re.search(content)
    if match:
        content = match.groupdict()["num"]
    try:
        return cn2num(content)
    except Exception:
        # Best effort: anything cn2num cannot parse counts as index 0.
        return 0
def IsNum(s):
    """Tell whether ``s`` looks like a number.

    Commas are ignored. The text counts as a number when ``float`` accepts
    it, or when ``pycnnum.cn2num`` converts it to a non-zero value. Falsy
    input and anything that cannot be processed yields ``False``.
    """
    if not s:
        return False

    try:
        cleaned = s.replace(',', '')
    except Exception:
        # No usable ``replace`` - not text we can inspect.
        return False

    try:
        float(cleaned)
    except Exception:
        pass
    else:
        return True

    try:
        return pycnnum.cn2num(cleaned) != 0
    except Exception:
        return False
Exemplo n.º 9
0
def get_synonyms(word_out, line, current_year=2018):
    """Normalise year expressions in ``word_out`` and collect related tags.

    The three collections returned by ``replace()`` drive the behaviour:

    * words in the first one containing 近 ("the last N years") become
      ``'>=<current_year - N>年'``; those containing 今年 ("this year")
      become ``'<current_year>年'``;
    * words in the second one get 年 appended;
    * any other word contributes every member of each group in the third
      collection that contains it.

    Year words are rewritten in place in ``word_out``.

    NOTE(review): for 近-words without Arabic digits only the second
    character is converted, so a multi-character numeral such as 近二十年 is
    read as 2 - confirm whether such inputs can occur.

    Args:
        word_out: List of keywords; modified in place.
        line: Unused.
        current_year: Reference year for relative expressions. Defaults to
            2018, the value that used to be hard-coded.

    Returns:
        ``(my_dict, word_out2)``: ``my_dict['year']`` holds the last year
        expression seen (if any); ``word_out2`` lists all tags related to
        the keywords.
    """
    replace_near_years, replace_years, replace_kw_new = replace()
    my_dict = {}
    word_out2 = []
    # enumerate() gives the exact slot of the current word; looking it up
    # with list.index() could hit an earlier, already rewritten entry.
    for position, x in enumerate(word_out):
        if x in replace_near_years:
            if '近' in x:
                digits = re.sub(r'\D', '', x)
                if digits:
                    min_year = current_year - int(digits)
                else:
                    min_year = current_year - cn2num(x[1])
                year_tag = '>=' + str(min_year) + '年'
            elif '今年' in x:
                year_tag = str(current_year) + '年'
            else:
                continue
        elif x in replace_years:
            year_tag = x + '年'
        else:
            for i in range(len(replace_kw_new)):
                group = replace_kw_new[i]
                if x in group:
                    word_out2.extend(group)
            continue

        word_out[position] = year_tag
        word_out2.append(year_tag)
        my_dict['year'] = year_tag
    return my_dict, word_out2
Exemplo n.º 10
0
    def metainfo(self, exceptions=["JiuWudaishi", "Sanguozhi"]):
        """
        Build ``self.flat_meta``: one ``[file number, title, section, scroll]``
        list per entry of ``self.flat_bodies``, derived from the matching
        entry of ``self.paths``. Designed for standard histories; might be
        adjusted for other purposes.

        The path is split on "/": element 2 is the book title in Chinese,
        element 3 the section and element 4 the scroll. Section and scroll
        are cut at the first ideographic space; 卷 is removed from the scroll
        before it is converted with ``pycnnum.cn2num``. The file number is
        the zero-padded index.

        For books named in ``exceptions`` the section is prefixed with
        "傳: ", "紀: " or "志: " when the scroll element contains that
        character (biography, annals, treatises).

        Entries whose path is missing or too short are recorded as ``None``.
        ``self.extract_paths()`` is called first if ``self.paths`` is empty.
        Prints the book name and the number of collected entries.

        NB ``exceptions`` is only read, never modified.
        """
        section_markers = ("傳", "紀", "志")

        print(self.bookname)
        if self.paths == []:
            self.extract_paths()
        self.flat_meta = []
        for idx, _page in enumerate(self.flat_bodies):
            try:
                parts = self.paths[idx].split("/")
                title = parts[2]
                section = parts[3].split("\u3000")[0]
                scroll_label = parts[4]
                scroll_numeral = scroll_label.split("\u3000")[0].replace("卷", "")
                scroll = pycnnum.cn2num(scroll_numeral)
                fileno = str(idx).zfill(4)
                if self.bookname in exceptions:
                    # first marker found wins, in the order 傳, 紀, 志
                    for marker in section_markers:
                        if marker in scroll_label:
                            section = marker + ": " + section
                            break
                self.flat_meta.append([fileno, title, section, scroll])
            except IndexError as error:
                print(error)
                self.flat_meta.append(None)  # changed from "N/A"
        print(len(self.flat_meta))
Exemplo n.º 11
0
    def metainfo(self, exceptions=["JiuWudaishi", "Sanguozhi"]):
        """
        Build ``self.flat_meta``: one ``[file number, title, section, scroll]``
        list per entry of ``self.flat_bodies``, derived from the matching
        entry of ``self.paths``. Designed for standard histories; might be
        adjusted for other purposes.

        The path is split on "/": element 2 is the book title in Chinese,
        element 3 the section and element 4 the scroll (element 5 for
        "JiuWudaishi" paths that have exactly seven elements). Section and
        scroll are cut at the first ideographic space; 卷 is removed from the
        scroll before it is converted with ``pycnnum.cn2num``. The file
        number is the zero-padded index.

        For "Sanguozhi" and "JiuWudaishi" the section is prefixed with
        "傳: ", "紀: " or "志: " when the scroll element contains that
        character (biography, annals, treatises).

        Entries whose path is missing or too short are recorded as ``None``.
        ``self.extract_paths()`` is called first if ``self.paths`` is empty.
        Prints the book name and the number of collected entries.

        NB ``exceptions`` is accepted for compatibility but not used; the
        two special books are matched by name.
        """
        # Provisional solution for some irregular numerals: 一百一 .. 一百九
        # are rewritten with an explicit 零 before conversion (presumably
        # because they denote 101-109 here - confirm against the sources).
        numeral_corrections = {
            "一百一": "一百零一",
            "一百二": "一百零二",
            "一百三": "一百零三",
            "一百四": "一百零四",
            "一百五": "一百零五",
            "一百六": "一百零六",
            "一百七": "一百零七",
            "一百八": "一百零八",
            "一百九": "一百零九",
        }
        prefixed_books = ("Sanguozhi", "JiuWudaishi")
        section_markers = ("傳", "紀", "志")

        print(self.bookname)
        if self.paths == []:
            self.extract_paths()
        self.flat_meta = []
        for idx, _page in enumerate(self.flat_bodies):
            try:
                parts = self.paths[idx].split("/")
                title = parts[2]
                section = parts[3].split("\u3000")[0]
                if self.bookname == "JiuWudaishi" and len(parts) == 7:
                    # handling an exception in JiuWudaishi
                    scroll_label = parts[5]
                else:
                    scroll_label = parts[4]
                scroll_numeral = scroll_label.split("\u3000")[0].replace("卷", "")
                scroll_numeral = numeral_corrections.get(
                    scroll_numeral, scroll_numeral)
                scroll = pycnnum.cn2num(scroll_numeral)
                fileno = str(idx).zfill(4)
                if self.bookname in prefixed_books:
                    # first marker found wins, in the order 傳, 紀, 志
                    for marker in section_markers:
                        if marker in scroll_label:
                            section = marker + ": " + section
                            break
                self.flat_meta.append([fileno, title, section, scroll])
            except IndexError as error:
                print(error)
                self.flat_meta.append(None)  # changed from "N/A"
        print(len(self.flat_meta))