def get_threshold(tok, cp_word, date_figures):
    """Return the numeric threshold attached to a comparative word, if any.

    The clause is parsed, a phrase containing ``cp_word`` is selected as its
    grammatical context, and the first number (``CD`` node) located after
    ``cp_word`` in that phrase and not listed in ``date_figures`` is taken as
    the threshold.  If the token right after that number in the clause is a
    key of ``unit_m``, the number is multiplied by the matching value.

    Args:
        tok: list of the tokens of the clause.
        cp_word: the comparative word whose threshold is searched.
        date_figures: numbers of the clause that are dates and must be
            skipped.  Items may be strings or ints; they are compared by
            their string form.

    Returns:
        The threshold as a float, or None when no context or no suitable
        number is found.
    """
    # First, we parse the whole clause
    parse = next(parser.parse(tok))

    # Then we search the grammatical context of cp_word.
    # Best case: a Prepositional Phrase (PP) that starts with cp_word.
    context = None
    for sub in parse.subtrees():
        if sub.label() == "PP" and sub.leaves()[0] == cp_word:
            context = sub

    # Otherwise any Prepositional Phrase (PP), then Nominal Phrase (NP), then
    # Quantifier Phrase (QP) containing cp_word (the last match of the first
    # successful label is kept, as before).
    for label in ("PP", "NP", "QP"):
        if context is not None:
            break
        for candidate in get_subtrees(parse, label):
            if cp_word in candidate.leaves():
                context = candidate

    if context is None:
        return None

    # We look for the first number appearing after cp_word and not being a date
    leaves = context.leaves()
    cp_pos = get_index(leaves, cp_word)  # position of the comp word in the context
    # Callers pass dates either as tokens (strings) or as ints, while the CD
    # leaves are tokens: compare string forms so both kinds are excluded.
    dates = {str(d) for d in date_figures}
    number = None
    for fig in get_nodes(context, "CD"):
        if get_index(leaves, fig) > cp_pos and str(fig) not in dates:
            number = fig
            break

    if number is None:
        return None

    # If that number exists, we check if a unit multiplier is written just after
    k = get_index(tok, number)  # position of the number in the clause
    try:
        mult = unit_m[tok[k + 1].lower()]
    except (KeyError, IndexError, TypeError, AttributeError):
        # no following token, or it is not a known unit word
        mult = 1
    return float(number) * mult
def find_time(ner, parse, parse_d):
    """Extract the time indications of a sentence (or clause).

    Args:
        ner: sequence of (token, tag) pairs; tokens tagged "DATE" are used.
        parse: constituency tree of the sentence (``leaves()`` is used, and
            its "PP" subtrees are fetched with ``get_subtrees``).
        parse_d: dependency structure, only passed to ``find_links``.

    Returns:
        A tuple ``(date_from, date_to, date_than)`` where ``date_from`` and
        ``date_to`` are ints or None and ``date_than`` is the list of years
        (ints) found after the word "than".
    """
    date_from = None
    date_to = None
    date_than = []

    # First, we look at the dates detected by the NER
    dated = [item[0] for item in ner if item[1] == "DATE"]
    pps = get_subtrees(parse, "PP")

    # The dates that are years are put in the list years, the rest stays in res
    years = []
    res = []
    for word in dated:
        if year_format(word):
            years.append(word)
        else:
            res.append(word)

    tok = parse.leaves()

    # If there is a "than" and years placed after it, they go in date_than
    # and they are removed from years
    if 'than' in tok:
        idx = get_index(tok, 'than')
        before_than = []
        for y in years:
            if get_index(tok, y) > idx:
                date_than.append(int(y))
            else:
                before_than.append(y)
        years = before_than

    if len(years) == 1:
        # Only 1 year remains: we look at its context
        y = years[0]
        link = None
        # The context of a year will often be the Prepositional Phrase (PP) it belongs to
        for pp in pps:
            leaves = pp.leaves()
            if y in leaves:
                link = leaves
        # If no PP was found, we look at the words linked to the year in the dependency structure
        if link is None:
            link = find_links(parse_d, y)
        # Depending on the words found in the context of the year, we know its function
        if link != []:
            lower_link = lower_list(link)
            year = int(y)
            if 'in' in lower_link:
                date_from = year
                date_to = year
            # e.g. "Population in Iran since 1960": the year is the DATE_FROM
            elif 'since' in lower_link or 'after' in lower_link:
                date_from = year
            elif 'till' in lower_link or 'before' in lower_link:
                date_to = year
            else:
                date_from = year
                date_to = year
    elif len(years) >= 2:
        # 2 years or more: the minimum and maximum give the time period
        # (all the years are considered, not only the first two)
        numeric_years = [int(y) for y in years]
        date_from = min(numeric_years)
        date_to = max(numeric_years)
    elif res != []:
        # No years but other things tagged as a date:
        # we try to find a duration (like in "over the last 8 decades")
        duration = 0
        lowered = lower_list(res)
        fig = []
        # We need to have the word "last"
        # NOTE(review): the original indentation was lost; the whole duration
        # computation is kept under this condition, as the comment above
        # suggests - confirm against the original layout.
        if "last" in res:
            # First, we try to find numbers in the words tagged as dates
            # (they are not years here)
            for r in res:
                try:
                    fig.append(int(r))
                except (TypeError, ValueError):
                    pass
            # If no number was found, we try to find a PP containing all the words of the date
            if fig == []:
                pp_date = None
                for pp in pps:
                    leaves = lower_list(pp.leaves())
                    if all(w in leaves for w in lowered):
                        pp_date = leaves
                # and if such a PP was found, we try to find numbers in it
                if pp_date is not None and 'last' in pp_date:
                    for a in pp_date:
                        try:
                            fig.append(int(a))
                        except (TypeError, ValueError):
                            pass
            # If no single number was found, it will be 1
            n = fig[0] if len(fig) == 1 else 1
            # Now we try to find duration words; dur[0] is used as a length and
            # dur[1] as a singular ('s') / plural ('p') marker
            dur = get_duration(res)
            if dur is not None:
                # "8 decades" will be 80 while "5 centuries" will be 500.
                # Without a number, a plural word defaults to 5 units:
                # over the last decades = 50, over the last 3 decades = 30,
                # over the last decade = 10
                if n == 1:
                    if dur[1] == 's':
                        duration = dur[0]
                    elif dur[1] == 'p':
                        duration = dur[0] * 5
                else:
                    duration = dur[0] * n
            if duration > 0:
                date_to = NOW
                date_from = NOW - duration

    return (date_from, date_to, date_than)
def find_areas(sent):
    """Find the areas mentioned in a sentence and sort them by role.

    Each area detected by ``find_areas_in_list`` is classified as "THAN"
    (placed after the word "than"), "TO" or "IN" (the default), then stored
    as ``[official_name, kind]`` in the matching list.

    Args:
        sent: the raw sentence (string).

    Returns:
        A tuple ``(areas_in, areas_to, areas_than)`` of lists.
    """
    # NOTE(review): the first value returned by replacement() is not used and
    # the parse is built from the raw sentence - confirm this is intended.
    _, tok = replacement(sent)
    parse = next(parser.raw_parse(sent))
    areas = find_areas_in_list(tok)
    pps = get_subtrees(parse, "PP")

    areas_in = []
    areas_to = []
    areas_than = []
    buckets = {"IN": areas_in, "TO": areas_to, "THAN": areas_than}

    # Position of the "than"; if there is none, we put it after all the words
    # (so the "THAN" condition is never met)
    if 'than' in tok:
        than_pos = get_index(tok, "than")
    else:
        than_pos = len(tok) + 10

    for area in areas:
        name = area[0]  # name of the area, written as in the sentence
        kind = area[1]  # country or region
        form = area[2]  # adjective 'a' or name 'n'

        if get_index(tok, name) > than_pos:
            classification = "THAN"
        else:
            # Looking at the PP (context) of the area: the last PP holding
            # every word of the name
            context = None
            words = name.split(" ")
            for pp in pps:
                leaves = pp.leaves()
                if all(w in leaves for w in words):
                    context = leaves

            if context is None:
                classification = "IN"
            elif 'to' in context or 'into' in context or 'towards' in context:
                # words that would indicate a category "TO"
                classification = "TO"
            elif 'between' in context and 'and' in context:
                if first_word(name, 'and', tok) == 'and':
                    classification = "TO"
                else:
                    classification = "IN"
            else:
                # most of the time, the default case is the category "IN"
                classification = "IN"

        # Before adding the area to its list, we change its name to the
        # official writing, the same as in the dictionaries: "INDIA", "india"
        # or "INDia" all become "India". The list stays empty if nothing matches.
        name_f = []
        wanted = name.lower()
        if form == 'a':
            for adj in demo_list:
                if adj.lower() == wanted:
                    name_f = [demo_dict[adj], kind]
        elif form == 'n':
            if kind == 'country':
                for country in country_list:
                    if country.lower() == wanted:
                        name_f = [country, kind]
            elif kind == 'region':
                for region in region_list:
                    if region.lower() == wanted:
                        name_f = [region, kind]

        buckets[classification].append(name_f)

    return (areas_in, areas_to, areas_than)
def _agg_region_names(in_area, message):
    """Return the names of in_area; raise Exception(message) if one is a country."""
    if any(area[1] == 'country' for area in in_area):
        raise Exception(message)
    return [area[0] for area in in_area]


def _agg_single_country(in_area):
    """Return the only country of in_area (None if empty); reject regions and lists."""
    if len(in_area) > 1 or (len(in_area) == 1 and in_area[0][1] == 'region'):
        raise Exception("Error 5 : Too many area mentioned")
    return in_area[0][0] if len(in_area) == 1 else None


def _agg_time_period(in_time, to_time):
    """Return [in_time, to_time] when they form a real period, else None."""
    if in_time is not None and to_time is not None and in_time != to_time:
        return [in_time, to_time]
    return None


def _agg_word_threshold(clause, word):
    """Return the threshold linked to a threshold-only word, or raise Error 2."""
    threshold = get_threshold(clause, word, [])
    if threshold is None:
        raise Exception("Error 2 : No threshold found")
    return threshold


def find_aggregators(parse, parse_d, returned, agg_words):
    """Detect the comparisons and the aggregation expressed in a sentence.

    Args:
        parse: constituency tree of the whole sentence.
        parse_d: dependency structure of the sentence (not used here).
        returned: return type of the query ("Value", "Agr_Area" or
            "Agr_Time"); "Value" is treated as "Agr_Area".
        agg_words: identifier words detected in the sentence.

    Returns:
        A tuple ``(comparisons, aggreg)``:
        1) ``comparisons`` holds one ``[comp_type, sens, V, V1, V2]`` entry
           per clause (V1/V2 are the elements of each side of the
           comparison, V the elements common to both);
        2) ``aggreg`` is ``[sens_sup, n_sup]`` when a superlative was found,
           else None.

    Raises:
        Exception: with a message "Error N : ..." when the sentence cannot
            be interpreted (lone "than", missing threshold, too many areas
            or times, clause mismatch, ...).
    """
    tok = parse.leaves()
    ner = ner_tagger.tag(tok)
    pos = pos_tagger.tag(tok)
    dep = list(dep_parser.parse(tok))[0]

    # We store the numbers in the sentence that are dates, as it is useful
    # when looking for a threshold
    figures = date_figures(ner, pos, dep)

    # When a comparison or aggregation is in the sentence, the user normally
    # wants a list of something. If no word specifies the type of the list,
    # the return is "Value" by default: we temporarily treat it as a list of
    # countries (example query: "Highest GDPs in the world").
    if returned == "Value":
        returned = "Agr_Area"

    ## Comparative words
    # Some comparative words are "threshold-only" and do not require "than"
    th_words = ["over", "under", "below", "above"]
    th_inf = ["under", "below"]
    # We only keep those that are linked to a threshold (these words can
    # appear in other contexts)
    th = [
        t for t in catch_words(tok, th_words)
        if get_threshold(tok, t, figures) is not None
    ]

    # The other comparative words (comp words) require a structure with
    # "than". Some have to be listed, most are recognized thanks to the
    # comparison tags of the POS tagging.
    cp_words = ["superior", "inferior"]
    cp_inf = ["less", "lower", "inferior", "poorer"]
    comp_ = (get_nodes(parse, "RBR") + get_nodes(parse, "JJR")
             + catch_words(tok, cp_words))

    # We only keep the comp words followed by a "than" and we reorder the
    # words at the same time, adding the threshold words in the common list
    comp = []
    pending = None  # last comp word seen and still waiting for its "than"
    for t in tok:
        if t in comp_:
            pending = t
        elif t in th:
            # A threshold word found after a comp word but before a potential
            # "than": the comp word is dropped, as it would create nested
            # comparisons
            pending = None
            comp.append(t)
        elif t == "than":
            if pending is None:
                # a "than" without a comp word before
                raise Exception("Error 0 : than alone")
            comp.append(pending)
            pending = None

    ## Comparisons
    # We cut the sentence in clauses; each clause must contain only one
    # comparison (often there is just one clause)
    comparisons = []
    n_comp = len(comp)
    clauses = cut_in_clause(tok, comp, cut_words)[0]

    if n_comp > 0:
        if len(clauses) != n_comp:
            raise Exception("Error 9 : number of words and clauses")
        if not all(word in clause for word, clause in zip(comp, clauses)):
            raise Exception("Error 1 : problem with clauses")

        # Everything is okay: we now treat each clause separately
        for clause, word in zip(clauses, comp):
            # We parse the clause alone, so the result can differ from the
            # parsing of the whole sentence
            clause_sent = " ".join(clause)
            clause_parse = next(parser.parse(clause))
            clause_dep = list(dep_parser.parse(clause))[0]
            clause_ner = ner_tagger.tag(clause)

            areas = find_areas(clause_sent)
            times = find_time(clause_ner, clause_parse, clause_dep)
            in_time, to_time, than_time = times[0], times[1], times[2]
            in_area, than_area = areas[0], areas[2]

            comp_type = None  # the comparator (a threshold, another country/year, ...)
            sens = 'sup'  # "more than" ('sup') or "less than" ('inf')
            V1 = {}  # elements of Value1 (before "than")
            V2 = {}  # elements of Value2 (after "than")
            V = {}  # elements belonging to both values
            # Example: "Countries with more population than Germany in 2010"
            # -> everything is compared at the year 2010

            # Compared in lower case, like th_inf / cp_inf below, so that a
            # capitalised threshold word is not mistaken for a comp word
            is_threshold_word = word.lower() in th_words

            # Countries list
            if returned == 'Agr_Area':
                if is_threshold_word:
                    if word.lower() in th_inf:
                        sens = "inf"
                    threshold = _agg_word_threshold(clause, word)
                    comp_type = "Threshold"
                    V2["THRESHOLD"] = threshold
                    # As we compare values, we cannot have a time series
                    if in_time is not None and in_time == to_time:
                        V["TIME"] = in_time
                    # As the user wants a list of countries, he cannot specify
                    # a country, only regions ("What countries in Asia ...")
                    V["AREA"] = _agg_region_names(
                        in_area, "Error 3 : Country was mentioned")
                else:
                    # The comparative word must belong to a "than" structure
                    if 'than' not in clause:
                        raise Exception("Error 7 : comparison without 'than'")
                    if word.lower() in cp_inf:
                        sens = "inf"
                    idx = get_index(clause, "than")  # useful to fill V1 & V2

                    # Locations: a country can be mentioned as the comparator
                    if len(than_area) == 1:
                        if than_area[0][1] != "country":
                            raise Exception("Error 4 : Comparison with a region")
                        V2["AREA"] = than_area[0][0]
                        comp_type = "Country"
                    elif len(than_area) > 1:
                        raise Exception("Error 5 : Too many area mentioned")
                    # It is also possible to mention a region, as before
                    V["AREA"] = _agg_region_names(
                        in_area, "Error 3 : Country mentioned")

                    # Time indicators: if two dates are found on both sides
                    # of "than", the first goes in V1 and the other in V2
                    has_than_time = False
                    if (len(than_time) == 1 and in_time is not None
                            and get_index(clause, str(in_time)) < idx):
                        V1["TIME"] = in_time
                        V2["TIME"] = than_time[0]
                        has_than_time = True
                        if comp_type is None:
                            comp_type = "Two"
                    # Else, the year is general (goes in V). If no date is
                    # given, nothing is stored for now.
                    if not has_than_time:
                        if len(than_time) == 1:
                            V["TIME"] = than_time[0]
                        elif in_time is not None and in_time == to_time:
                            V["TIME"] = in_time

                    # If the type of comparison is still unknown, we look for
                    # a threshold; otherwise two different values are compared
                    if comp_type is None:
                        thres = get_threshold(clause, 'than', than_time)
                        if thres is not None:
                            comp_type = "Threshold"
                            V2["THRESHOLD"] = thres
                    if comp_type is None:
                        comp_type = "Two"

            # Years list
            elif returned == 'Agr_Time':
                if is_threshold_word:
                    if word.lower() in th_inf:
                        sens = "inf"
                    threshold = _agg_word_threshold(clause, word)
                    comp_type = "Threshold"
                    V2["THRESHOLD"] = threshold
                    # With a list of years, time indicators can only be a
                    # time period (more than one year)
                    V["TIME"] = _agg_time_period(in_time, to_time)
                    # And conversely, the location can only be one country
                    V["AREA"] = _agg_single_country(in_area)
                else:
                    if 'than' not in clause:
                        raise Exception("Error 7 : comparison without 'than'")
                    if word.lower() in cp_inf:
                        sens = "inf"

                    # Countries: two countries on both sides of "than" go in
                    # V1 & V2, else a single country goes in V
                    if len(than_area) == 1:
                        if than_area[0][1] != "country":
                            raise Exception("Error 4 : Comparison with a region")
                        if len(in_area) == 1 and in_area[0][1] == "country":
                            V2["AREA"] = than_area[0][0]
                            V1["AREA"] = in_area[0][0]
                            comp_type = "Two"
                        elif len(in_area) == 0:
                            V["AREA"] = than_area[0][0]
                        else:
                            raise Exception("Error 5 : Too many area mentioned")
                    elif len(than_area) > 1:
                        raise Exception("Error 5 : Too many area mentioned")
                    else:
                        V["AREA"] = _agg_single_country(in_area)

                    # Times: a specific year can be given as the comparator
                    if len(than_time) == 1:
                        V2["TIME"] = than_time[0]
                        comp_type = "Time"
                    elif len(than_time) > 1:
                        raise Exception("Error 8 : Too many times mentioned")
                    # Else, we accept only a time period
                    V["TIME"] = _agg_time_period(in_time, to_time)

                    # If nothing, we do as before and look for a threshold
                    if comp_type is None:
                        thres = get_threshold(clause, 'than', than_time)
                        if thres is not None:
                            comp_type = "Threshold"
                            V2["THRESHOLD"] = thres
                    if comp_type is None:
                        comp_type = "Two"

            # We gather everything for that clause
            comparisons.append([comp_type, sens, V, V1, V2])

    ## Superlative words
    # Aggregation (superlative) words are mostly found with their specific
    # tag; nonetheless, some have to be listed
    sp_words = ["top", "minimum", "maximum"]
    sup = (get_nodes(parse, "RBS") + get_nodes(parse, "JJS")
           + catch_words(tok, sp_words))

    ## Aggregations
    aggreg = None
    sup_neg = ["least", "lowest", "worst", "minimum"]
    # plural form of the words that could be linked to the aggregation
    agg_plural = ["areas", "countries", "places", "states", "nations", "years"]

    if sup:
        # Sense of the aggregation (max or min)
        if any(s.lower() in sup_neg for s in sup):
            sens_sup = 'inf'
        else:
            sens_sup = 'sup'

        # Number of items to display: we look for numerical values in the
        # Nominal Phrases (NP) containing a superlative word or a word linked
        # to it (the last number met is kept)
        n_sup = 1
        nps = get_subtrees(parse, "NP")
        for s in sup + agg_words:
            for phrase in nps:
                leaves = phrase.leaves()
                if s in leaves:
                    for a in leaves:
                        try:
                            n_sup = int(a)
                        except (TypeError, ValueError):
                            pass

        # If no number was found, a plural form gives a default of 10 items
        if n_sup == 1 and any(w.lower() in agg_plural for w in agg_words):
            n_sup = 10

        aggreg = [sens_sup, n_sup]

    # Finally, we return all the information found
    return (comparisons, aggreg)
def _scan_identifiers(parse, parse_d, agg_words):
    """Look for the first area/time identifier among the tokens of parse.

    The identifier found (if any) is appended to agg_words. Returns
    (returned, count): the return type it implies ("Value" if none) and
    whether it is linked to the word "number" (like in "number of countries").
    """
    returned = "Value"
    count = False
    for token in parse.leaves():
        lowered = token.lower()
        if lowered in area_identifier:
            returned = "Agr_Area"
        elif lowered in time_identifier:
            returned = "Agr_Time"
        else:
            continue
        agg_words.append(token)
        if "number" in lower_list(find_links(parse_d, lowered)):
            count = True
        # only the first identifier is considered
        break
    return returned, count


def type_of_sentence(parse, parse_d):
    """Determine the kind of sentence and what the query should return.

    Args:
        parse: constituency tree of the sentence.
        parse_d: dependency structure, only passed to ``find_links``.

    Returns:
        A tuple ``(type, returned, count, agg_words)``:
        - type: "NP" or "WH";
        - returned: "Value", "Agr_Area" or "Agr_Time";
        - count: True/False when it could be decided, else None;
        - agg_words: the identifier words detected in the sentence.
    """
    sentence_type = "NP"  # type of sentence (NP or WH)
    returned = "Value"  # return type (Value, Agr_Area, Agr_Time)
    count = None  # is a count asked (True, False)
    agg_words = []  # store the identifiers detected in the sentence

    # Determines if the sentence contains a WH-structure
    wh = is_wh([sub.label() for sub in parse.subtrees()])

    # The first case treated is the NP one (which is the default), because a
    # WH word can appear in a totally NP sentence
    # ("Number of countries in |which| GDP is above ...")
    if parse[0].label() == "NP":
        # NP sentence: we look for an area or time identifier and check if
        # it is connected to the word "number"
        returned, count = _scan_identifiers(parse, parse_d, agg_words)

    elif wh:
        # Wh-Questions
        sentence_type = "WH"
        # Capture all the possible WH-words of the sentence. If there are
        # several, only the first of this list is considered
        # (example: "What are the countries where ..." -> "What")
        wh_words = (get_nodes(parse, "WRB") + get_nodes(parse, "WP")
                    + get_nodes(parse, "WDT"))
        first_wh = wh_words[0].lower() if len(wh_words) > 0 else None

        if first_wh == "how":
            # Generally, "how" is followed by an adjective (forming a WHADJP)
            # and only "how many" / "how much" can ask for something else
            # than a value ("how big", "how rich" ... stay "Value").
            # With several WHADJP, the sentence is not analysed further.
            adjp = get_subtrees(parse, "WHADJP")
            if len(adjp) == 1:
                jj = get_nodes(adjp[0], "JJ")
                if 'many' in jj or 'much' in jj:
                    # We check if this is part of a WHNP
                    # ("How many |something| does ...") holding an area/time
                    # identifier ("how many countries", "how much time" ...)
                    try:
                        whnp = get_subtrees(parse, "WHNP")[0]
                    except IndexError:
                        whnp = None
                    if whnp is not None:
                        for leaf in whnp.leaves():
                            if leaf.lower() in area_identifier:
                                returned = "Agr_Area"
                                agg_words.append(leaf)
                                count = True
                            elif leaf.lower() in time_identifier:
                                returned = "Agr_Time"
                                agg_words.append(leaf)
                                count = True

        elif first_wh in ("what", "which"):
            # We check if a specific word is linked to the WH-word
            # ("Which countries ...", "What are the places ...") or not
            # ("What is the GDP of ...")
            for linked in find_links(parse_d, first_wh):
                if linked.lower() in area_identifier:
                    returned = "Agr_Area"
                    agg_words.append(linked)
                    count = False
                elif linked.lower() in time_identifier:
                    returned = "Agr_Time"
                    agg_words.append(linked)
                    count = False

        # For the other WH-words, the meaning is directly expressed in the word
        elif first_wh == "when":
            returned = "Agr_Time"
            count = False
        elif first_wh in ("where", "who"):
            returned = "Agr_Area"
            count = False

    elif get_subtrees(parse, "SQ") != []:
        # Yes/No questions (not treated at the moment)
        pass

    else:
        # Other sentences (order, verbal phrase...): we still check if there
        # is an area/time identifier (and a count)
        returned, count = _scan_identifiers(parse, parse_d, agg_words)

    return (sentence_type, returned, count, agg_words)