예제 #1
0
def extractnumber_de(text):
    """
    Extract a number from a German text string.

    Definite articles are removed first. The remaining words are scanned
    left to right and each one is tried, in this order, as a numeric
    literal, a spoken fraction, an ordinal and a number word listed in
    ``de_numbers`` (a ``de_numbers`` word directly followed by a spoken
    fraction is multiplied by it). Written fractions such as "2/3" are
    tried last. If "und" follows the number found (directly, or one word
    later), the word after "und" is added to it when that word is a
    number too.

    Indefinite articles cannot be suppressed in German:
    'ein Pferd' means 'one horse' and 'a horse'.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: the extracted value, or False when no
        non-zero number was found
    """
    definite_articles = ["der", "die", "das", "des", "den", "dem"]
    tokens = [tok for tok in text.split() if tok not in definite_articles]
    total = len(tokens)  # entries get blanked below, never removed

    awaiting_addend = False  # True once "<number> und" has been consumed
    before_und = False       # value found in front of "und"
    val = False
    pos = 0
    while pos < total:
        tok = tokens[pos]
        if is_numeric(tok):
            # float() rather than int() so that decimals are accepted
            val = float(tok)
        elif isFractional_de(tok):
            val = isFractional_de(tok)
        elif isOrdinal_de(tok):
            val = isOrdinal_de(tok)
        elif tok in de_numbers:
            val = de_numbers[tok]
            following = tokens[pos + 1] if pos < total - 1 else ""
            factor = isFractional_de(following)
            if factor:
                # number word followed by a fraction word: multiply
                val = val * factor
                tokens[pos + 1] = ""

        if not val:
            # last resort: a written fraction like "2/3"
            pieces = tok.split('/')
            if look_for_fractions(pieces):
                val = float(pieces[0]) / float(pieces[1])
            elif awaiting_addend:
                # nothing to add after "und": keep what we had
                val = before_und
                break
            else:
                pos += 1
                continue

        tokens[pos] = ""

        if awaiting_addend:
            tokens[pos - 1] = ''  # remove "und"
            val += before_und
            break

        # is there an "und" right after this number, or one word later?
        if pos + 1 < total and tokens[pos + 1] == 'und':
            skip = 2
        elif pos + 2 < total and tokens[pos + 2] == 'und':
            skip = 3
        else:
            break

        awaiting_addend = True
        before_und = val
        val = False
        pos += skip

    return val if val else False
예제 #2
0
def extract_number_en(text, short_scale=True, ordinals=False):
    """
    Extract a number from an English text string.

    Handles digits, number words, spoken and written fractions, decimals
    introduced by "point"/"dot", negatives, large-number names in long or
    short scale and simple scientific notation ("... times ten to the
    power of ...").

    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Args:
        text (str): the string to extract a number from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no number
                                   was found

    """
    def _normalize(text):
        """Lower-case text, drop filler words, unify "power" synonyms."""
        text = text.lower()
        erases = ["the", "of", "a", "an", "to", "positive", "plus"]
        replaces = {
            "exponentiated": "power",
            "raised": "power",
            "elevated": "power",
            "by": "times"  # scientific notation
        }
        check_duplicates = ["power"]
        # ordinal words of the selected scale; next to "power" they are
        # rewritten as the matching cardinal ("third power" -> "three power")
        if short_scale:
            cards = list(SHORT_ORDINAL_STRING_EN.values())
        else:
            cards = list(LONG_ORDINAL_STRING_EN.values())
        words = text.split(" ")
        for idx, word in enumerate(words):
            prev_word = words[idx - 1] if idx > 0 else ""
            if word == "power" and prev_word in cards:
                # NOTE(review): position + 1 is used as the numeric value,
                # which assumes the table is ordered first, second, ...
                i = cards.index(prev_word) + 1
                # TODO > 20
                if i <= 20:
                    words[idx - 1] = NUM_STRING_EN[i]
            elif prev_word == "power" and word in cards:
                i = cards.index(word) + 1
                # TODO > 20
                if i <= 20:
                    words[idx] = word = NUM_STRING_EN[i]
            if word in erases:
                words[idx] = ""
            elif word in replaces.keys():
                words[idx] = replaces[word]
                # keep only the first "power" of the sentence
                if replaces[word] in check_duplicates and \
                        replaces[word] in " ".join(words[:idx]):
                    words[idx] = ""
            if word in check_duplicates and word in " ".join(words[:idx]):
                words[idx] = ""

        return " ".join(words).strip()

    text = _normalize(text)
    string_num_en = {
        "half": 0.5,
        "halves": 0.5,
        "hundreds": 100,
        "thousands": 1000,
        'millions': 1000000
    }

    for num in NUM_STRING_EN:
        num_string = NUM_STRING_EN[num]
        string_num_en[num_string] = num

    # first, second...
    if ordinals:
        if short_scale:
            for num in SHORT_ORDINAL_STRING_EN:
                num_string = SHORT_ORDINAL_STRING_EN[num]
                string_num_en[num_string] = num
        else:
            for num in LONG_ORDINAL_STRING_EN:
                num_string = LONG_ORDINAL_STRING_EN[num]
                string_num_en[num_string] = num

    # negate next number (-2 = 0 - 2)
    negatives = ["negative", "minus"]

    # sum the next number (twenty two = 20 + 2)
    sums = [
        'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty',
        'ninety'
    ]

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = [
        "hundred", "thousand", "hundreds", "thousands", "million", "millions"
    ]

    # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
    fraction_marker = [" and "]

    # decimal marker ( 1 point 5 = 1 + 0.5)
    decimal_marker = [" point ", " dot "]

    # register the large-number names (and their plurals) of the chosen scale
    scale_names = SHORT_SCALE_EN if short_scale else LONG_SCALE_EN
    for num in scale_names:
        num_string = scale_names[num]
        string_num_en[num_string] = num
        string_num_en[num_string + "s"] = num
        multiplies.append(num_string)
        multiplies.append(num_string + "s")

    # NOTE: every recursive call below forwards short_scale so that the
    # sub-expressions are read in the same scale as the whole sentence.
    # `ordinals` is left at its default there, as it always was.

    # 2 and 3/4
    for c in fraction_marker:
        components = text.split(c)

        if len(components) == 2:
            # ensure first is not a fraction and second is a fraction
            num1 = extract_number_en(components[0], short_scale=short_scale)
            num2 = extract_number_en(components[1], short_scale=short_scale)
            if num1 is not False and num2 is not False \
                    and num1 >= 1 and 0 < num2 < 1:
                return num1 + num2

    # 2 point 5
    for c in decimal_marker:
        components = text.split(c)
        if len(components) == 2:
            number = extract_number_en(components[0], short_scale=short_scale)
            decimal = extract_number_en(components[1],
                                        short_scale=short_scale)
            if number is not False and decimal is not False:
                # TODO handle number dot number number number
                # Written digits are used verbatim so that leading zeros
                # survive ("1 point 05" -> 1.05).
                digits = components[1].strip()
                if not digits.isdigit():
                    # Digits embedded in a longer phrase come back as
                    # floats (5.0); whole floats are plain digit strings.
                    if isinstance(decimal, float) and decimal.is_integer():
                        decimal = int(decimal)
                    digits = str(decimal)
                if digits.isdigit():
                    fraction = float("0." + digits)
                    # the decimal part extends the magnitude, so it has to
                    # follow the sign of the integer part
                    if number < 0:
                        return number - fraction
                    return number + fraction

    aWords = text.split()
    aWords = [word for word in aWords if word not in ["the", "a", "an"]]
    # val is intentionally NOT reset per word: the last number recognised
    # in the sentence is what gets returned
    val = False
    prev_val = None
    to_sum = []
    for idx, word in enumerate(aWords):

        if not word:
            continue
        prev_word = aWords[idx - 1] if idx > 0 else ""
        next_word = aWords[idx + 1] if idx + 1 < len(aWords) else ""

        # is this word already a number ?
        if is_numeric(word):
            # float() rather than int() so that decimals are accepted
            val = float(word)

        # is this word the name of a number ?
        if word in string_num_en:
            val = string_num_en[word]

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if prev_word in sums and word in string_num_en:
            if val and val < 10:
                val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # half cup
        if val is False:
            val = is_fractional_en(word, short_scale=short_scale)

        # 2 fifths
        if not ordinals:
            next_value = is_fractional_en(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
            val = 0 - val

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        else:
            prev_val = val
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0
            # scientific notation
            elif prev_word == "times" and \
                    word in ["ten", "10"] and next_word == "power":

                power = int(extract_number_en(" ".join(aWords[idx:]),
                                              short_scale=short_scale))
                val = extract_number_en(" ".join(aWords[:idx]),
                                        short_scale=short_scale)
                if val:
                    return float(str(val) + "e" + str(power))
            elif prev_word == "times" and \
                    word in ["ten", "10"] and \
                    extract_number_en(" ".join(aWords[idx:]),
                                      short_scale=short_scale) \
                    and text.endswith("power"):

                power = int(extract_number_en(" ".join(aWords[idx:]),
                                              short_scale=short_scale))
                val = extract_number_en(" ".join(aWords[:idx]),
                                        short_scale=short_scale)
                if val:
                    return float(str(val) + "e" + str(power))

    # add the completed "hundred"/"thousand"/... groups to the remainder
    if val is not None:
        for v in to_sum:
            val = val + v
    return val
예제 #3
0
def extractnumber_fr(text):
    """Extract a number from a French text string.

    The text is normalized with ``normalize_fr`` (articles are kept so that
    ordinals can be told apart from fractions) and scanned word by word.
    Digits, numeric literals, ordinals preceded by an article, spoken
    fractions, written fractions such as "2/3" and decimals introduced by
    "virgule" are recognised. A number that follows "et", "plus" or "+" is
    added to the running result; any other number replaces it.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or (str): the extracted number, or the text
        normalized with ``normalize_fr(text, True)`` when no non-zero
        number was found.
    """
    # normalize text, keep articles for ordinals versus fractionals
    text = normalize_fr(text, False)
    # split words by whitespace
    aWords = text.split()
    count = 0
    result = None
    add = False
    while count < len(aWords):
        val = None
        word = aWords[count]
        wordNext = ""
        wordPrev = ""
        if count < (len(aWords) - 1):
            wordNext = aWords[count + 1]
        if count > 0:
            wordPrev = aWords[count - 1]

        if word in articles_fr:
            count += 1
            continue
        if word in ["et", "plus", "+"]:
            # remember to add the next number instead of replacing
            count += 1
            add = True
            continue

        # is current word a numeric number?
        if word.isdigit():
            val = int(word)
            count += 1
        elif is_numeric(word):
            val = float(word)
            count += 1
        elif wordPrev in articles_fr and getOrdinal_fr(word):
            val = getOrdinal_fr(word)
            count += 1
        # is current word the denominator of a fraction?
        elif isFractional_fr(word):
            val = isFractional_fr(word)
            count += 1

        # is current word the numerator of a fraction?
        if val and wordNext:
            valNext = isFractional_fr(wordNext)
            if valNext:
                val = float(val) * valNext
                count += 1

        if not val:
            count += 1
            # is current word a numeric fraction like "2/3"?
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        # is current word followed by a decimal value?
        # (count now points at "virgule")
        if wordNext == "virgule":
            zeros = 0
            newWords = aWords[count + 1:]
            # count the number of zeros after the decimal sign
            for following in newWords:
                if following == "zéro" or following == "0":
                    zeros += 1
                else:
                    break
            # Extract the number after the zeros. The bounds check matters:
            # the sentence may end right after "virgule" or after the zeros
            # (a comma is also a punctuation sign), in which case there is
            # no decimal part to attach.
            if zeros < len(newWords) and newWords[zeros].isdigit():
                afterDotVal = newWords[zeros]
                # skip "virgule", the zeros and the digits just consumed
                count = count + zeros + 2
                if not val:
                    val = 0
                # add the zeros
                afterDotString = zeros * "0" + afterDotVal
                val = float(str(val) + "." + afterDotString)
        if val:
            if add and result is not None:
                result += val
            else:
                # first number of the sentence (even after a leading
                # "et"/"plus"/"+"), or a number that replaces the previous
                result = val
            add = False

    # a missing or zero result falls back to the normalized text
    if not result:
        return normalize_fr(text, True)

    return result
예제 #4
0
def extractnumber_sv(text):
    """
    Extract a number from a Swedish text string.

    Words are scanned left to right and each one is tried, in this order,
    as a numeric literal, an ordinal ("första" .. "sjätte"), a spoken
    fraction and a cardinal ("en"/"ett" .. "tio"); a cardinal directly
    followed by a spoken fraction is multiplied by it. Written fractions
    such as "2/3" are tried last. If "och" follows the number found
    (directly, or one word later), the word after "och" is added to it
    when that word is a number too.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: the extracted value, or False when no
        non-zero number was found
    """
    ordinal_words = {
        "första": 1,
        "andra": 2,
        "tredje": 3,
        "fjärde": 4,
        "femte": 5,
        "sjätte": 6,
    }
    cardinal_words = {
        "en": 1,
        "ett": 1,
        "två": 2,
        "tre": 3,
        "fyra": 4,
        "fem": 5,
        "sex": 6,
        "sju": 7,
        "åtta": 8,
        "nio": 9,
        "tio": 10,
    }

    tokens = text.split()
    total = len(tokens)  # entries get blanked below, never removed

    awaiting_addend = False  # True once "<number> och" has been consumed
    before_och = False       # value found in front of "och"
    val = False
    pos = 0
    while pos < total:
        tok = tokens[pos]
        if is_numeric(tok):
            val = float(tok)
        elif tok in ordinal_words:
            val = ordinal_words[tok]
        elif is_fractional_sv(tok):
            val = is_fractional_sv(tok)
        elif tok in cardinal_words:
            val = cardinal_words[tok]
            following = tokens[pos + 1] if pos < total - 1 else ""
            factor = is_fractional_sv(following)
            if factor:
                # cardinal followed by a fraction word: multiply
                val = val * factor
                tokens[pos + 1] = ""

        if not val:
            # last resort: a written fraction like "2/3"
            pieces = tok.split('/')
            if look_for_fractions(pieces):
                val = float(pieces[0]) / float(pieces[1])
            elif awaiting_addend:
                # nothing to add after "och": keep what we had
                val = before_och
                break
            else:
                pos += 1
                continue

        tokens[pos] = ""

        if awaiting_addend:
            tokens[pos - 1] = ''  # remove "och"
            val += before_och
            break

        # is there an "och" right after this number, or one word later?
        if pos + 1 < total and tokens[pos + 1] == 'och':
            skip = 2
        elif pos + 2 < total and tokens[pos + 2] == 'och':
            skip = 3
        else:
            break

        awaiting_addend = True
        before_och = val
        val = False
        pos += skip

    return val if val else False
예제 #5
0
def extractnumber_pt(text):
    """
    Extract a number from a Portuguese text string.

    Words are scanned left to right. Number words from ``pt_numbers``,
    digits, numeric literals and written fractions such as "2/3" are added
    to the running result; a spoken fraction multiplies it; a number
    followed by "avos" divides it. The rest of the sentence is parsed
    recursively after "e" (and) and after a decimal marker ("ponto",
    "virgula", "vírgula", ".", ",").

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: the value of the extracted number, or
        False when no number was found

    """
    ands = ["e"]
    decimals = ["ponto", "virgula", u"vírgula", ".", ","]

    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_word = aWords[count + 1] if count + 1 < len(aWords) else None
        next_next_word = (aWords[count + 2]
                          if count + 2 < len(aWords) else None)

        # is current word a number?
        if word in pt_numbers:
            val = pt_numbers[word]
        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)
        elif isFractional_pt(word):
            # a spoken fraction scales what was found so far (or 1)
            if not result:
                result = 1
            result = result * isFractional_pt(word)
            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if val:
            if result is None:
                result = 0
            # handle fractions
            if next_word != "avos":
                result += val
            else:
                result = float(result) / float(val)

        if next_word is None:
            break

        # number word and fraction
        if next_word in ands:
            zeros = 0
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_pt(" ".join(newWords))
            if afterAndVal:
                if result < afterAndVal or result < 20:
                    # read what follows "e" as a decimal part: shift it
                    # below 1, once more for every leading zero
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    for following in newWords:
                        if following == "zero" or following == "0":
                            zeros += 1
                        else:
                            break
                for _ in range(0, zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                newWords = aWords[count + 3:]
                afterAndVal = extractnumber_pt(" ".join(newWords))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        if next_word in decimals:
            zeros = 0
            newWords = aWords[count + 2:]
            for following in newWords:
                if following == "zero" or following == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = extractnumber_pt(" ".join(newWords))
            # "." and "," are also punctuation: only build a decimal when
            # there is a whole integer part and a whole number after the
            # marker. Otherwise keep scanning instead of feeding float()
            # an unparsable string such as "None.5" or "3.False".
            if (result is not None and afterDotVal is not False
                    and float(result).is_integer()
                    and float(afterDotVal).is_integer()):
                afterDotString = zeros * "0" + str(int(afterDotVal))
                result = float(str(int(result)) + "." + afterDotString)
                break
        count += 1

    if result is None:
        return False

    # cast whole floats to int (2.0 -> 2)
    if "." in str(result):
        integer, dec = str(result).split(".")
        if dec == "0":
            result = int(integer)

    return result
예제 #6
0
def extractnumber_it(text):
    """
    Extract a number from an Italian text string.

    This is the Portuguese extractor adapted to Italian. Words are scanned
    left to right. Number words from ``it_numbers``, digits, numeric
    literals, written fractions such as "2/3" and compound number words
    (via ``extractnumber_long_it``) are added to the running result;
    "mila" multiplies what was found so far; a spoken fraction multiplies
    the result, unless it follows "il", in which case it is read as an
    ordinal ("un terzo" is 1/3 but "il terzo" is 3). The rest of the
    sentence is parsed recursively after "e" (and) and after a decimal
    marker ("punto", "virgola", ".", ",").

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: the value of the extracted number, or
        False when no number was found

    """
    ands = ["e"]
    decimals = ["punto", "virgola", ".", ","]

    aWords = text.split()
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_word = aWords[count + 1] if count + 1 < len(aWords) else None
        next_next_word = (aWords[count + 2]
                          if count + 2 < len(aWords) else None)

        # is current word a number?
        if word in it_numbers:
            if word == "mila":
                # "mila" multiplies what came before; with nothing in
                # front of it, it counts once instead of failing on None
                multiplier = 1 if result is None else result
                val = multiplier * it_numbers[word]
                result = 0
            else:
                val = it_numbers[word]

        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)

        elif isFractional_it(word):
            fraction = isFractional_it(word)
            if not result:
                result = 1
            result = result * fraction
            # "un terzo" is 1/3 but "il terzo" is 3
            # (count > 0 keeps index -1 from wrapping to the last word)
            if count > 0 and aWords[count - 1] == "il":
                # round() instead of floor division: 1.0 // 0.2 is 4.0
                # because 0.2 is not exactly representable.
                # NOTE(review): assumes the helper returns 1/n - confirm
                result = round(1.0 / fraction)

            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if not val:
            # look for compound numbers such as ventuno, ventitre,
            # centoventi
            val = extractnumber_long_it(word)

        if val:
            if result is None:
                result = 0
            result += val

        if next_word is None:
            break

        # number word and fraction
        if next_word in ands:
            zeros = 0
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_it(" ".join(newWords))
            if afterAndVal:
                if result < afterAndVal or result < 20:
                    # read what follows "e" as a decimal part: shift it
                    # below 1, once more for every leading zero
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    for following in newWords:
                        if following == "zero" or following == "0":
                            zeros += 1
                        else:
                            break
                for _ in range(0, zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                newWords = aWords[count + 3:]
                afterAndVal = extractnumber_it(" ".join(newWords))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        if next_word in decimals:
            zeros = 0
            newWords = aWords[count + 2:]
            for following in newWords:
                if following == "zero" or following == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = extractnumber_it(" ".join(newWords))
            # "." and "," are also punctuation: only build a decimal when
            # there is a whole integer part and a whole number after the
            # marker. Otherwise keep scanning instead of feeding float()
            # an unparsable string such as "None.5" or "3.False".
            if (result is not None and afterDotVal is not False
                    and float(result).is_integer()
                    and float(afterDotVal).is_integer()):
                afterDotString = zeros * "0" + str(int(afterDotVal))
                result = float(str(int(result)) + "." + afterDotString)
                break
        count += 1

    if result is None:
        return False

    # cast whole floats to int (2.0 -> 2)
    if "." in str(result):
        integer, dec = str(result).split(".")
        if dec == "0":
            result = int(integer)

    return result