def extractnumber_de(text):
    """
    Extract the first number found in a German text string.

    Definite articles are dropped before parsing.  Indefinite articles
    cannot be suppressed in German: 'ein Pferd' means both 'one horse'
    and 'a horse'.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: the extracted number, or False when no
            (non-zero) number was found
    """
    definite_articles = ["der", "die", "das", "des", "den", "dem"]
    tokens = [tok for tok in text.split() if tok not in definite_articles]
    total = len(tokens)  # the list is edited in place but never resized

    awaiting_addend = False  # True once "<number> und" has been seen
    first_operand = False  # the number found before "und"
    value = False
    pos = 0
    while pos < total:
        token = tokens[pos]
        if is_numeric(token):
            value = float(token)
        elif isFractional_de(token):
            value = isFractional_de(token)
        elif isOrdinal_de(token):
            value = isOrdinal_de(token)
        elif token in de_numbers:
            value = de_numbers[token]
            # a number word directly followed by a fraction word multiplies
            following = tokens[pos + 1] if pos < total - 1 else ""
            multiplier = isFractional_de(following)
            if multiplier:
                value = value * multiplier
                tokens[pos + 1] = ""

        if not value:
            # look for fractions like "2/3"
            parts = token.split('/')
            if look_for_fractions(parts):
                value = float(parts[0]) / float(parts[1])
            elif awaiting_addend:
                # nothing to add after "und": keep the first operand
                value = first_operand
                break
            else:
                pos += 1
                continue

        tokens[pos] = ""

        if awaiting_addend:
            tokens[pos - 1] = ''  # remove "und"
            value += first_operand
            break

        # "und" either directly after the number or one token further on
        if pos + 1 < total and tokens[pos + 1] == 'und':
            skip = 2
        elif pos + 2 < total and tokens[pos + 2] == 'und':
            skip = 3
        else:
            break

        awaiting_addend = True
        first_operand = value
        value = False
        pos += skip

    return value if value else False
def extract_number_en(text, short_scale=True, ordinals=False):
    """
    Extract a number from an English text string.

    Handles spoken numbers in long scale and short scale, see
    https://en.wikipedia.org/wiki/Names_of_large_numbers

    Args:
        text (str): the string to extract a number from
        short_scale (bool): use short scale if True, long scale if False
        ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
    Returns:
        (int) or (float) or False: The extracted number or False if no
                                   number was found
    """

    def _normalize(text):
        # Lower-case the text, drop filler words, map exponent wording to
        # "power"/"times" and turn an ordinal next to "power" into the
        # matching cardinal word.
        text = text.lower()
        erases = ["the", "of", "a", "an", "to", "positive", "plus"]
        replaces = {
            "exponentiated": "power",
            "raised": "power",
            "elevated": "power",
            "by": "times"  # scientific notation
        }
        check_duplicates = ["power"]

        # ordinal words for the selected scale, in table order
        if short_scale:
            cards = [SHORT_ORDINAL_STRING_EN[c]
                     for c in SHORT_ORDINAL_STRING_EN.keys()]
        else:
            cards = [LONG_ORDINAL_STRING_EN[c]
                     for c in LONG_ORDINAL_STRING_EN.keys()]

        words = text.split(" ")
        for idx, word in enumerate(words):
            prev_word = words[idx - 1] if idx > 0 else ""
            if word == "power" and prev_word in cards:
                i = cards.index(prev_word) + 1
                # TODO > 20
                if i <= 20:
                    words[idx - 1] = NUM_STRING_EN[i]
            elif prev_word == "power" and word in cards:
                i = cards.index(word) + 1
                # TODO > 20
                if i <= 20:
                    words[idx] = word = NUM_STRING_EN[i]

            if word in erases:
                words[idx] = ""
            elif word in replaces.keys():
                words[idx] = replaces[word]
                if replaces[word] in check_duplicates and \
                        replaces[word] in " ".join(words[:idx]):
                    words[idx] = ""
            if word in check_duplicates and word in " ".join(words[:idx]):
                words[idx] = ""
        return " ".join(words).rstrip().lstrip()

    def _sub_parse(fragment):
        # Parse a fragment with the caller's scale.  Previously the
        # recursive calls dropped short_scale, so long-scale input was
        # re-parsed with short-scale values.  ordinals stays at its
        # default, as before, so a fraction word after " and " is still
        # read as a fraction.
        return extract_number_en(fragment, short_scale=short_scale)

    text = _normalize(text)
    string_num_en = {
        "half": 0.5,
        "halves": 0.5,
        "hundreds": 100,
        "thousands": 1000,
        'millions': 1000000
    }

    for num in NUM_STRING_EN:
        num_string = NUM_STRING_EN[num]
        string_num_en[num_string] = num

    # first, second...
    if ordinals:
        if short_scale:
            for num in SHORT_ORDINAL_STRING_EN:
                num_string = SHORT_ORDINAL_STRING_EN[num]
                string_num_en[num_string] = num
        else:
            for num in LONG_ORDINAL_STRING_EN:
                num_string = LONG_ORDINAL_STRING_EN[num]
                string_num_en[num_string] = num

    # negate next number (-2 = 0 - 2)
    negatives = ["negative", "minus"]

    # sum the next number (twenty two = 20 + 2)
    sums = ['twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy',
            'eighty', 'ninety']

    # multiply the previous number (one hundred = 1 * 100)
    multiplies = ["hundred", "thousand", "hundreds", "thousands", "million",
                  "millions"]

    # split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
    fraction_marker = [" and "]

    # decimal marker ( 1 point 5 = 1 + 0.5)
    decimal_marker = [" point ", " dot "]

    # register the large-number names (and their plurals) of the scale
    scale_names = SHORT_SCALE_EN if short_scale else LONG_SCALE_EN
    for num in scale_names:
        num_string = scale_names[num]
        string_num_en[num_string] = num
        string_num_en[num_string + "s"] = num
        multiplies.append(num_string)
        multiplies.append(num_string + "s")

    # 2 and 3/4
    for c in fraction_marker:
        components = text.split(c)

        if len(components) == 2:
            # ensure first is not a fraction and second is a fraction
            num1 = _sub_parse(components[0])
            num2 = _sub_parse(components[1])
            if num1 is not False and num2 is not False \
                    and num1 >= 1 and 0 < num2 < 1:
                return num1 + num2

    # 2 point 5
    for c in decimal_marker:
        components = text.split(c)
        if len(components) == 2:
            number = _sub_parse(components[0])
            decimal = _sub_parse(components[1])
            if number is not False and decimal is not False:
                # TODO handle number dot number number number
                if "." not in str(decimal):
                    return number + float("0." + str(decimal))

    aWords = text.split()
    aWords = [word for word in aWords if word not in ["the", "a", "an"]]
    val = False
    prev_val = None
    to_sum = []
    for idx, word in enumerate(aWords):

        if not word:
            continue
        prev_word = aWords[idx - 1] if idx > 0 else ""
        next_word = aWords[idx + 1] if idx + 1 < len(aWords) else ""

        # is this word already a number ?
        if is_numeric(word):
            val = float(word)

        # is this word the name of a number ?
        if word in string_num_en:
            val = string_num_en[word]

        # is the prev word a number and should we sum it?
        # twenty two, fifty six
        if prev_word in sums and word in string_num_en:
            if val and val < 10:
                val = prev_val + val

        # is the prev word a number and should we multiply it?
        # twenty hundred, six hundred
        if word in multiplies:
            if not prev_val:
                prev_val = 1
            val = prev_val * val

        # is this a spoken fraction?
        # half cup
        if val is False:
            val = is_fractional_en(word, short_scale=short_scale)

        # 2 fifths
        if not ordinals:
            next_value = is_fractional_en(next_word, short_scale=short_scale)
            if next_value:
                if not val:
                    val = 1
                val = val * next_value

        # is this a negative number?
        if val and prev_word and prev_word in negatives:
            val = 0 - val

        # let's make sure it isn't a fraction
        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        else:
            prev_val = val
            # handle long numbers
            # six hundred sixty six
            # two million five hundred thousand
            if word in multiplies and next_word not in multiplies:
                to_sum.append(val)
                val = 0
                prev_val = 0
            # scientific notation
            elif prev_word == "times" and \
                    word in ["ten", "10"] and next_word == "power":
                power = int(_sub_parse(" ".join(aWords[idx:])))
                val = _sub_parse(" ".join(aWords[:idx]))
                if val:
                    return float(str(val) + "e" + str(power))
            elif prev_word == "times" and \
                    word in ["ten", "10"] and \
                    _sub_parse(" ".join(aWords[idx:])) \
                    and text.endswith("power"):
                power = int(_sub_parse(" ".join(aWords[idx:])))
                val = _sub_parse(" ".join(aWords[:idx]))
                if val:
                    return float(str(val) + "e" + str(power))

    # add the partial sums collected at each multiplier boundary
    if val is not None:
        for v in to_sum:
            val = val + v
    return val
def extractnumber_fr(text):
    """Takes in a French string and extracts a number.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or (str): The number extracted, or the normalized
            text (normalize_fr(text, True)) when no non-zero number was
            found.
    """
    # normalize text, keep articles for ordinals versus fractionals
    text = normalize_fr(text, False)
    # split words by whitespace
    aWords = text.split()
    count = 0
    result = None
    add = False
    while count < len(aWords):
        val = None
        word = aWords[count]
        wordNext = aWords[count + 1] if count < (len(aWords) - 1) else ""
        wordPrev = aWords[count - 1] if count > 0 else ""

        if word in articles_fr:
            count += 1
            continue
        if word in ["et", "plus", "+"]:
            count += 1
            add = True
            continue

        # is current word a numeric number?
        if word.isdigit():
            val = int(word)
            count += 1
        elif is_numeric(word):
            val = float(word)
            count += 1
        elif wordPrev in articles_fr and getOrdinal_fr(word):
            val = getOrdinal_fr(word)
            count += 1
        # is current word the denominator of a fraction?
        elif isFractional_fr(word):
            val = isFractional_fr(word)
            count += 1

        # is current word the numerator of a fraction?
        if val and wordNext:
            valNext = isFractional_fr(wordNext)
            if valNext:
                val = float(val) * valNext
                count += 1

        # Test against None, not falsiness: a literal zero has already
        # been consumed above and must not advance the cursor a second time.
        if val is None:
            count += 1
            # is current word a numeric fraction like "2/3"?
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        # is current word followed by a decimal value?
        if wordNext == "virgule":
            newWords = aWords[count + 1:]
            # count the number of zeros after the decimal sign
            zeros = 0
            for candidate in newWords:
                if candidate == "zéro" or candidate == "0":
                    zeros += 1
                else:
                    break
            whole = val if val else 0
            # Only build a decimal when digits really follow (the comma is
            # also a punctuation sign, and the text may end right after
            # it) and the integer part is a whole number, so the string
            # handed to float() is always a valid literal.
            if zeros < len(newWords) and newWords[zeros].isdigit() \
                    and float(whole).is_integer():
                afterDotVal = newWords[zeros]
                # skip "virgule", the zeros and the digits
                count = count + zeros + 2
                # add the zeros
                afterDotString = zeros * "0" + afterDotVal
                val = float(str(int(whole)) + "." + afterDotString)

        if val:
            # an "et"/"plus" before any number has nothing to add to yet
            if add and result is not None:
                result += val
            else:
                result = val
            add = False

    if not result:
        return normalize_fr(text, True)

    return result
def extractnumber_sv(text):
    """
    Extract the first number found in a Swedish text string.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: the extracted number, or False when no
            (non-zero) number was found
    """
    ordinal_values = {
        "första": 1,
        "andra": 2,
        "tredje": 3,
        "fjärde": 4,
        "femte": 5,
        "sjätte": 6,
    }
    cardinal_values = {
        "en": 1,
        "ett": 1,
        "två": 2,
        "tre": 3,
        "fyra": 4,
        "fem": 5,
        "sex": 6,
        "sju": 7,
        "åtta": 8,
        "nio": 9,
        "tio": 10,
    }

    tokens = text.split()
    total = len(tokens)  # the list is edited in place but never resized
    awaiting_addend = False  # True once "<number> och" has been seen
    first_operand = False  # the number found before "och"
    value = False
    pos = 0
    while pos < total:
        token = tokens[pos]
        if is_numeric(token):
            value = float(token)
        elif token in ordinal_values:
            value = ordinal_values[token]
        elif is_fractional_sv(token):
            value = is_fractional_sv(token)
        else:
            if token in cardinal_values:
                value = cardinal_values[token]
            if value:
                # a number word directly followed by a fraction word
                # multiplies
                following = tokens[pos + 1] if pos < total - 1 else ""
                multiplier = is_fractional_sv(following)
                if multiplier:
                    value = value * multiplier
                    tokens[pos + 1] = ""

        if not value:
            # look for fractions like "2/3"
            parts = token.split('/')
            if look_for_fractions(parts):
                value = float(parts[0]) / float(parts[1])
            elif awaiting_addend:
                # nothing to add after "och": keep the first operand
                value = first_operand
                break
            else:
                pos += 1
                continue

        tokens[pos] = ""

        if awaiting_addend:
            tokens[pos - 1] = ''  # remove "och"
            value += first_operand
            break

        # "och" either directly after the number or one token further on
        if pos + 1 < total and tokens[pos + 1] == 'och':
            skip = 2
        elif pos + 2 < total and tokens[pos + 2] == 'och':
            skip = 3
        else:
            break

        awaiting_addend = True
        first_operand = value
        value = False
        pos += skip

    return value if value else False
def extractnumber_pt(text):
    """
    Extract a number from a Portuguese text string.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: the extracted number (whole-valued
            floats are returned as int), or False when no number was found
    """
    aWords = text.split()
    ands = ["e"]
    # the accented spelling was previously a mis-encoded literal and could
    # never match
    decimals = ["ponto", "virgula", u"vírgula", ".", ","]
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_word = aWords[count + 1] if count + 1 < len(aWords) else None
        next_next_word = aWords[count + 2] \
            if count + 2 < len(aWords) else None

        # is current word a number?
        if word in pt_numbers:
            val = pt_numbers[word]
        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)
        elif isFractional_pt(word):
            if not result:
                result = 1
            result = result * isFractional_pt(word)
            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if val:
            if result is None:
                result = 0
            # handle fractions
            if next_word != "avos":
                result += val
            else:
                result = float(result) / float(val)

        if next_word is None:
            break

        # number word and fraction
        if next_word in ands:
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_pt(" ".join(newWords))
            if afterAndVal:
                zeros = 0
                if result < afterAndVal or result < 20:
                    # read the part after "e" as decimal digits
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    for candidate in newWords:
                        if candidate == "zero" or candidate == "0":
                            zeros += 1
                        else:
                            break
                for _ in range(0, zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                afterAndVal = extractnumber_pt(" ".join(aWords[count + 3:]))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        if next_word in decimals:
            if result is None and \
                    not (word in pt_numbers or is_numeric(word)):
                # separator after a word that is not a number: treat it as
                # plain punctuation and keep scanning
                count += 1
                continue
            newWords = aWords[count + 2:]
            zeros = 0
            for candidate in newWords:
                if candidate == "zero" or candidate == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = extractnumber_pt(" ".join(newWords))
            # result is only None here when the current word is a literal
            # zero, which never reaches the accumulator above
            whole = 0 if result is None else result
            digits = str(afterDotVal)
            # Compose "<whole>.<zeros><digits>" only when both parts are
            # whole numbers; otherwise (nothing parsable after the
            # separator, a fractional tail, a fractional integer part) the
            # string would not be a valid float literal, so the result is
            # left as it is.
            if afterDotVal is not False and digits.isdigit() \
                    and float(whole).is_integer():
                result = float(str(int(whole)) + "." + zeros * "0" + digits)
            break
        count += 1

    if result is None:
        return False

    if "." in str(result):
        integer, dec = str(result).split(".")
        # cast float to int
        if dec == "0":
            result = int(integer)

    return result
def extractnumber_it(text):
    """
    Extract a number from an Italian text string, turning textual numbers
    into integers or fractions.

    Spelling numbers out this way is not usual in Italian but can be
    useful for Mycroft.  This is the Portuguese version adapted to
    Italian.

    Args:
        text (str): the string to extract a number from
    Returns:
        (int) or (float) or False: the extracted number (whole-valued
            floats are returned as int), or False when no number was found
    """
    aWords = text.split()
    ands = ["e"]
    decimals = ["punto", "virgola", ".", ","]
    count = 0
    result = None
    while count < len(aWords):
        val = 0
        word = aWords[count]
        next_word = aWords[count + 1] if count + 1 < len(aWords) else None
        next_next_word = aWords[count + 2] \
            if count + 2 < len(aWords) else None

        # is current word a number?
        if word in it_numbers:
            if word == "mila":
                # "mila" multiplies what was read so far; with nothing
                # before it there is no multiplicand, so use 1 instead of
                # multiplying None
                multiplicand = 1 if result is None else result
                val = multiplicand * it_numbers[word]
                result = 0
            else:
                val = it_numbers[word]

        elif word.isdigit():  # doesn't work with decimals
            val = int(word)
        elif is_numeric(word):
            val = float(word)

        elif isFractional_it(word):
            if not result:
                result = 1
            result = result * isFractional_it(word)
            # "un terzo" is 1/3 but "il terzo" is 3
            # count > 0 keeps index -1 from wrapping to the last word.
            # round() replaces floor division, which can land one too low
            # on inexact floats (1.0 // 0.2 == 4.0).
            # NOTE(review): assumes isFractional_it returns 1/denominator
            # — confirm against its definition.
            if count > 0 and aWords[count - 1] == "il":
                result = float(round(1.0 / isFractional_it(word)))

            count += 1
            continue

        if not val:
            # look for fractions like "2/3"
            aPieces = word.split('/')
            if look_for_fractions(aPieces):
                val = float(aPieces[0]) / float(aPieces[1])

        if not val:
            # look for a compound number such as ventuno, ventitre,
            # centoventi
            val = extractnumber_long_it(word)

        if val:
            if result is None:
                result = 0
            result += val

        if next_word is None:
            break

        # number word and fraction
        if next_word in ands:
            if result is None:
                count += 1
                continue
            newWords = aWords[count + 2:]
            afterAndVal = extractnumber_it(" ".join(newWords))
            if afterAndVal:
                zeros = 0
                if result < afterAndVal or result < 20:
                    # read the part after "e" as decimal digits
                    while afterAndVal > 1:
                        afterAndVal = afterAndVal / 10.0
                    for candidate in newWords:
                        if candidate == "zero" or candidate == "0":
                            zeros += 1
                        else:
                            break
                for _ in range(0, zeros):
                    afterAndVal = afterAndVal / 10.0
                result += afterAndVal
                break
        elif next_next_word is not None:
            if next_next_word in ands:
                afterAndVal = extractnumber_it(" ".join(aWords[count + 3:]))
                if afterAndVal:
                    if result is None:
                        result = 0
                    result += afterAndVal
                    break

        if next_word in decimals:
            if result is None and \
                    not (word in it_numbers or is_numeric(word)):
                # separator after a word that is not a number: treat it as
                # plain punctuation and keep scanning
                count += 1
                continue
            newWords = aWords[count + 2:]
            zeros = 0
            for candidate in newWords:
                if candidate == "zero" or candidate == "0":
                    zeros += 1
                else:
                    break
            afterDotVal = extractnumber_it(" ".join(newWords))
            # result is only None here when the current word is a number
            # that evaluated to zero and so never reached the accumulator
            whole = 0 if result is None else result
            digits = str(afterDotVal)
            # Compose "<whole>.<zeros><digits>" only when both parts are
            # whole numbers; otherwise the string would not be a valid
            # float literal, so the result is left as it is.
            if afterDotVal is not False and digits.isdigit() \
                    and float(whole).is_integer():
                result = float(str(int(whole)) + "." + zeros * "0" + digits)
            break
        count += 1

    if result is None:
        return False

    if "." in str(result):
        integer, dec = str(result).split(".")
        # cast float to int
        if dec == "0":
            result = int(integer)

    return result