def _is_past_participle_verb(verb):
    """Tell whether ``verb`` is written in its past-participle form.

    The token's lemma must appear both in ``SLoader.get_verb_list()`` and in
    ``SLoader.get_past_participle_list()``; the participle stored for that
    lemma is then compared with the lower-cased token text.

    :param verb: token exposing ``lemma_`` and ``lower_``.
    :return: ``True`` when the stored participle equals ``verb.lower_``,
        ``False`` otherwise (including when the lemma is unknown).
    """
    lemma = verb.lemma_
    is_known = (lemma in SLoader.get_verb_list()
                and lemma in SLoader.get_past_participle_list())
    if not is_known:
        return False
    return SLoader.get_past_participle_list()[lemma] == verb.lower_
def _process_internal(question):
    """Turn the body of a "Which ..." question into a cloze statement.

    The text of ``question`` is wrapped as ``"Which <text> ?"``, parsed again
    with the full spaCy pipeline, and the first sentence is dispatched to the
    handler module matching its question type. When no handler matches, the
    leading "Which" is replaced by ``@placeholder`` and the last token by a
    full stop.

    :param question: spaCy ``Span`` to rewrite.
    :return: the rewritten sentence as a single space-joined string.
    """
    assert isinstance(question, spacy.tokens.span.Span)
    if len(question) < 2:
        return str(question) + "@placeholder"

    rebuilt = "Which " + str(question) + " ?"
    parse = SLoader.get_full_spacy_nlp()
    sentence = list(parse(rebuilt).sents)[0]
    kind = get_question_type(sentence)

    # Checked in this order; the first matching type wins.
    handlers = (
        (QType.WHICH_OF, which_of),
        # No examples in the entire SQuAD dataset.
        (QType.IN_WHICH_OF, in_which_of),
        (QType.WHICH_NOUN, which_noun),
        (QType.WHICH_BE, which_be),
        # No examples in the entire SQuAD dataset.
        (QType.IN_WHICH_NOUN, in_which_noun),
        (QType.WHICH_VERB, which_verb),
    )
    for candidate, module in handlers:
        if kind == candidate:
            return module.process(sentence)

    # Fallback: drop "Which", put the placeholder first.
    words = ["@placeholder"] + [str(token) for token in sentence[1:]]
    words[-1] = '.'  # Replace "?" with "."
    return ' '.join(words)
def process(question):
    """Move a leading "In ..." phrase to the end, then rewrite the question.

    ``_split_question`` separates the question into the skipped leading
    phrase (a list of strings) and the remaining span. The remainder is
    stripped of trailing punctuation and capitalised, the skipped phrase is
    appended after a comma with its first word forced to ``"in"``, and the
    rebuilt text is re-parsed and handed to ``_process_internal``.

    :param question: spaCy ``Span`` holding at least three tokens.
    :return: whatever ``_process_internal`` returns for the rebuilt span.
    """
    assert isinstance(question, spacy.tokens.span.Span)
    assert len(question) >= 3

    skipped, question = _split_question(question)
    assert isinstance(skipped, list)
    assert isinstance(question, spacy.tokens.span.Span)

    if skipped:
        skipped[0] = "in"  # Not "In".

    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]

    words = [str(token) for token in question]
    # The remainder can be empty once punctuation is stripped; indexing
    # words[0] unguarded would raise IndexError in that case.
    if words and not words[0].isupper():
        words[0] = words[0].capitalize()

    # Swap main question and "In ...".
    rebuilt = ' '.join(words) + ", " + ' '.join(skipped) + " ?"

    nlp = SLoader.get_full_spacy_nlp()
    doc = nlp(rebuilt)
    # Convert to spaCy Span (not Doc).
    return _process_internal(doc[0:len(doc)])
def _process_with_verb(question): to_remove = set() to_past = set() insert_before = {} verb = question[0] if verb.lower_ == "does" or verb.lower_ == "do": to_remove.add(verb) elif verb.lower_ == "did": # Put main verb to past tense. to_remove.add(verb) to_past.add(verb.head) elif verb.lemma_ == "be": insert_before[verb.head] = verb to_remove.add(verb) out = [] for token in question: if token in insert_before: out.append(insert_before[token].text) if token in to_remove: continue if token in to_past: past_tenses = SLoader.get_past_tense_list() out.append(past_tenses.get(token.text, token.text)) else: out.append(token.text) return deepcopy(out)
def _process_internal(question):
    """Turn the body of a "What ..." question into a cloze statement.

    The text of ``question`` is wrapped as ``"What <text> ?"``, parsed again
    with the full spaCy pipeline, and the first sentence is dispatched to the
    handler module matching its question type. When no handler matches, the
    leading "What" is replaced by ``@placeholder`` and the last token by a
    full stop.

    :param question: spaCy ``Span`` to rewrite.
    :return: the rewritten sentence as a single space-joined string.
    """
    assert isinstance(question, spacy.tokens.span.Span)
    if len(question) < 2:
        return str(question) + "@placeholder"

    rebuilt = "What " + str(question) + " ?"
    parse = SLoader.get_full_spacy_nlp()
    sentence = list(parse(rebuilt).sents)[0]
    kind = get_question_type(sentence)

    # Checked in this order; the first matching type wins.
    handlers = (
        (QType.WHAT_BE, what_be),
        (QType.WHAT_DO, what_do),
        (QType.IN_WHAT, in_what),
        (QType.WHAT_NOUN, what_noun),
        (QType.WHAT_VERB, what_verb),
    )
    for candidate, module in handlers:
        if kind == candidate:
            return module.process(sentence)

    # Fallback: drop "What", put the placeholder first.
    words = ["@placeholder"] + [str(token) for token in sentence[1:]]
    words[-1] = '.'  # Replace "?" with "."
    return ' '.join(words)
def process(question):
    """Rewrite a question into ``"@placeholder ( long ) ... ."`` form.

    Trailing punctuation is stripped, auxiliaries are analysed over the whole
    question, and the first two tokens are then dropped before the output is
    assembled:

    * "do" auxiliaries are removed; for "did" the head verb is put into the
      past tense via ``SLoader.get_past_tense_list()``.
    * "have" auxiliaries are moved right before their head, or, when
      ``_has_aux_pass`` reports a passive auxiliary, right before the first
      child of the head that is an ``auxpass`` dependency or reads "been".
    * other auxiliaries (e.g. "be") are left in place.

    :param question: spaCy ``Span`` holding at least three tokens.
    :return: the rewritten sentence, or ``"@placeholder"`` when two or fewer
        tokens remain after stripping punctuation.
    """
    assert isinstance(question, spacy.tokens.span.Span)
    assert len(question) >= 3

    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]
    if len(question) <= 2:
        return "@placeholder"

    moved = {}       # token -> auxiliary emitted right before it
    dropped = set()  # auxiliaries removed from their original slot
    past = set()     # verbs to emit in the past tense

    for tok in question:
        if not (tok.pos_ == "VERB" and tok.dep_.lower() == "aux"):
            continue
        if tok.lemma_ == "do":
            if tok.lower_ == "did":
                past.add(tok.head)
            dropped.add(tok)
            continue
        if not (tok.lemma_ == "have" and tok.head):
            # "be" and any other auxiliary: seems ok to let it as it is.
            continue
        if not _has_aux_pass(tok.head):
            moved[tok.head] = tok
            dropped.add(tok)
            continue
        for child in tok.head.children:
            if child.dep_.lower() == "auxpass" or child.lower_ == "been":
                moved[child] = tok
                dropped.add(tok)
                break

    # Skip the two leading question words (the original comment names them
    # "How much", while the emitted tag is "long").
    words = []
    for tok in question[2:]:
        if tok in moved:
            words.append(moved[tok].text)
        if tok in dropped:
            continue
        if tok in past:
            irregular = SLoader.get_past_tense_list()
            words.append(irregular.get(tok.lower_, tok.text))
        else:
            words.append(tok.text)

    # The sentence always starts with "@placeholder", which capitalisation
    # leaves untouched, so it is joined as is.
    return ' '.join(["@placeholder", "(", "long", ")"] + words + ["."])
def _process_with_verb(question): # Try to remove aux verbs. # E.g. In which state did Jordan played the most of his games? if len(question) >= 1 and question[0].pos_ == "VERB": to_remove = set() to_past = set() insert_before = {} insert_at_end = [] # in this order. verb = question[0] if verb.lower_ == "does" or verb.lower_ == "do": # In which location do students of the School of Architecture # of Notre Dame spend their 3rd year? # Do not bother with 1st/3rd person. # Leave it as it is. to_remove.add(verb) elif verb.lower_ == "did": # Put main verb to past tense. to_remove.add(verb) if verb != verb.head: to_past.add(verb.head) elif verb.lemma_ == "be": if verb.head and verb.dep_ in ["aux", "auxpass"] and ( verb.head != verb and verb.head.pos_ == "VERB"): # In which season was online voting introduced? insert_before[verb.head] = verb to_remove.add(verb) else: # In which direction is Puerto Rico from the island of # Saint-Barthélemy? to_remove.add(verb) insert_at_end.append(verb) out = [] for token in question: if token in insert_before: out.append(insert_before[token].text) if token in to_remove: continue if token in to_past: past_tenses = SLoader.get_past_tense_list() out.append(past_tenses.get(token.text, token.text)) else: out.append(token.text) for token in insert_at_end: out.append(token.text) return deepcopy(out) return [str(x) for x in question]
def process(question):
    """Rewrite a "When do/does/did ..." question as ``"... in @placeholder ."``.

    Trailing punctuation is stripped and the first two tokens are removed.
    When the auxiliary (second token) is "did" and its head is a different
    token tagged ``VERB``, that head is replaced by its past tense from
    ``SLoader.get_past_tense_list()`` (falling back to its lower-cased
    text). The first output word is capitalised unless it is all upper-case.

    :param question: spaCy ``Span`` holding at least three tokens.
    :return: the rewritten sentence, or ``"@placeholder"`` when two or fewer
        tokens remain after stripping punctuation.
    """
    assert isinstance(question, spacy.tokens.span.Span)
    assert len(question) >= 3

    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]
    if len(question) <= 2:
        return "@placeholder"

    aux = question[1]    # Do/does/did ...
    rest = question[2:]  # Remove "When do/does/did" tokens.

    replacement = {}
    has_main_verb = aux.head and aux.head != aux and aux.head.pos_ == "VERB"
    # Present tense needs no change; only "did" shifts the main verb.
    if has_main_verb and aux.lower_ == "did":
        base = aux.head.lower_
        irregular = SLoader.get_past_tense_list()
        replacement[aux.head] = irregular.get(base, base)

    words = [replacement[tok] if tok in replacement else tok.text
             for tok in rest]
    words += ["in", "@placeholder", "."]
    if not words[0].isupper():
        words[0] = words[0].capitalize()
    return ' '.join(words)
def process(question):
    """Rewrite a "How do/does/did ..." question as a cloze statement.

    The result has the shape ``"<statement> <prep> @placeholder ."`` where
    ``<prep>`` is "as" for a few verbs (see ``PREP``) and "by" otherwise.
    "How" and the auxiliary are removed. For anything other than "do" /
    "does" the main verb is put into the past tense using
    ``SLoader.get_past_tense_list()``. The main verb is the auxiliary's head
    when it has a distinct one, otherwise the first following token tagged
    ``VERB``.

    :param question: spaCy ``Span`` holding at least three tokens.
    :return: the value of ``_to_capitalize`` applied to the output words, or
        ``str(question) + " @placeholder ."`` when two or fewer tokens remain
        after stripping trailing punctuation.
    """
    assert isinstance(question, spacy.tokens.span.Span)
    assert len(question) >= 3

    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]
    if len(question) <= 2:
        return str(question) + " @placeholder ."

    PREP = {
        'describe': 'as',
        'identify': 'as',
        'view': 'as',
        'define': 'as',
        'rate': 'as',
        'compare': 'as',
        'credit': 'as'  # Was credited as ...
    }

    def finish(words, governing_verb):
        # Shared tail: preposition chosen from the verb lemma, then the slot.
        words.append(PREP.get(governing_verb.lemma_, "by"))
        words.append("@placeholder")
        words.append(".")
        return _to_capitalize(words)

    def with_past_tense(tokens, main_verb):
        # Copy the token texts, swapping the main verb for its past tense.
        out = []
        for token in tokens:
            if token == main_verb:
                past_tense = SLoader.get_past_tense_list()
                out.append(past_tense.get(token.lower_, token.text))
            else:
                out.append(token.text)
        return out

    question = question[1:]  # Remove "How".
    verb = question[0]
    tail = question[1:]      # Everything after the auxiliary.

    if verb.lower_ in ["do", "does"]:
        return finish([str(x) for x in tail], verb.head)

    # assert(verb.lower_ == "did")
    if verb.head and verb.head != verb:
        main_verb = verb.head
    else:
        # Mostly spaCy "wrong" dependency trees.
        # Search for the first verb.
        # No examples found.
        main_verb = None
        for token in tail:
            if token.pos_ == "VERB":
                main_verb = token
                break

    if main_verb is not None:
        return finish(with_past_tense(tail, main_verb), main_verb)

    # No verb is found. Do nothing (but remove "did").
    # How did Descartes' distinguish types of existence?
    # How did Top 40 radio what ifmusic change during this era?
    # How did the actual sales of the G4's compare to the sales expectations?
    return finish([str(x) for x in tail], verb.head)
def process(question):
    """Rewrite a "How <measure> <verb> ..." question as a cloze statement.

    The measure phrase that follows "How" (the first token after "How" plus
    the following tokens it is an ancestor of) is collected and re-emitted
    as ``@placeholder ( <measure> )``. Where that marker is placed depends
    on the verb that follows the measure phrase:

    * lemma "do": the auxiliary is dropped and the marker goes at the end;
      for "did" the head verb is put into the past tense.
    * another verb with a distinct head: the verb is moved next to its head
      and the marker follows the head.
    * otherwise: the verb and the marker are inserted after the subject's
      subtree, or at the end when no subject child is found.

    When too few tokens remain to find a verb, a short
    ``"@placeholder <measure> ... ."`` string is returned instead.

    :param question: spaCy ``Span`` holding at least three tokens.
    :return: the rewritten sentence as a single space-joined string.
    """
    assert (isinstance(question, spacy.tokens.span.Span))
    assert (len(question) >= 3)
    # Drop trailing punctuation (e.g. the final "?").
    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]
    if len(question) <= 2:
        return str(question) + " @placeholder ."
    # Skip HOW X VERB? (X)
    # How far from each other were the motors in Gramme's demonstrations?
    # => far from each other
    question = question[1:]  # Remove "How"
    advj = question[0]
    measure = []
    # Consume the leading tokens that belong to the measure phrase.
    while len(question) >= 1:
        token = question[0]
        if token == advj or advj.is_ancestor(token):
            measure.append(token.text)
            question = question[1:]
        else:
            break
    assert (len(measure) >= 1)
    measure = ' '.join(measure)
    if len(question) <= 1:
        # How lond did the creation of Red Book CD - DA standard take?
        # long => lond => spaCy error.
        out = "@placeholder " + measure
        if len(question) > 0:
            out = out + " " + str(question)
        return out + " ."
    assert (isinstance(measure, str))
    verb = question[0]
    if verb.pos_ != "VERB":
        # Caused by a spaCy wrong dependency tree.
        # How [far] away was the plant located from the epicenter?
        # Extend @measure until the first verb.
        while len(question) >= 1:
            token = question[0]
            if token.pos_ != "VERB":
                measure += (" " + token.text)
                question = question[1:]
            else:
                break
        if len(question) <= 1:
            # How far back to San Diego's roots in the arts and theater
            # sector go?
            out = "@placeholder " + measure
            if len(question) > 0:
                out = out + " " + str(question)
            return out + " ."
    assert (isinstance(measure, str))
    assert (len(question) > 1)
    assert (question[0].pos_ == "VERB")
    verb = question[0]
    if verb.lemma_ == "do":
        if verb.head == verb:
            # How often do temperatures on the coastal plain of NC drop below
            # freezing at night?
            # Insert @placeholder at the end.
            question = [str(x) for x in question[1:]]
            if len(question) >= 1 and not question[0].isupper():
                question[0] = question[0].capitalize()
            question.append("@placeholder")
            question.append("(")
            question.append(measure)
            question.append(")")
            question.append(".")
            return ' '.join(question)
        else:
            # Insert @placeholder at the end.
            # How far did the Arctic tern chick travel?
            # Correct main verb tense.
            main_verb = verb.head
            out = []
            for token in question[1:]:
                if token == main_verb and verb.lower_ == "did":
                    # To past.
                    past_tense = SLoader.get_past_tense_list()
                    out.append(past_tense.get(token.lemma_, token.text))
                else:
                    out.append(token.text)
            question = out
            if len(question) >= 1 and not question[0].isupper():
                question[0] = question[0].capitalize()
            question.append("@placeholder")
            question.append("(")
            question.append(measure)
            question.append(")")
            question.append(".")
            return ' '.join(question)
    if verb.head != verb:
        # How high had cotton revenues risen by the time of the American
        # Civil War?
        main_verb = verb.head
        # Locate the main verb inside the remaining tokens.
        index = -1
        for i in range(0, len(question)):
            if question[i] == main_verb:
                index = i
                break
        assert (index > 0 and index < len(question))
        # The moved verb goes before the main verb, or before the token just
        # ahead of it when _is_aux_verb accepts that token.
        insert_before = main_verb
        if index >= 1 and _is_aux_verb(question[index - 1], main_verb):
            insert_before = question[index - 1]
        out = []
        for token in question[1:]:
            if token == insert_before:
                out.append(verb.text)
            out.append(token.text)
            if token == main_verb:
                out.append("@placeholder")
                out.append("(")
                out.append(measure)
                out.append(")")
        out.append(".")
        question = out
        if len(question) >= 1 and not question[0].isupper():
            question[0] = question[0].capitalize()
        return ' '.join(question)
    # Look for the subject (as a child of the verb).
    # How old are most of the native language speakers in northern Catalonia?
    subj = None
    for child in verb.children:
        if child.dep_.lower() in ["nsubj", "nsubjpass"]:
            subj = child
            break
    if subj is None:
        # How large in square kilometers is Greater Hyderabad?
        # How simple is the process of transformation?
        # Insert verb and @placeholder at the end.
        question = [str(x) for x in question[1:]]
        question.append(verb.text)
        question.append("@placeholder")
        question.append("(")
        question.append(measure)
        question.append(")")
        question.append(".")
        if len(question) >= 1 and not question[0].isupper():
            question[0] = question[0].capitalize()
        return ' '.join(question)
    # Pick the token of the subject's subtree with the largest character
    # offset; the verb and the marker are emitted right after it.
    insert_after = subj
    for child in subj.subtree:
        if child.idx > insert_after.idx:
            insert_after = child
    out = []
    for token in question[1:]:
        out.append(token.text)
        if token == insert_after:
            out.append(verb.text)
            out.append("@placeholder")
            out.append("(")
            out.append(measure)
            out.append(")")
    out.append(".")
    question = out
    if len(question) >= 1 and not question[0].isupper():
        question[0] = question[0].capitalize()
    return ' '.join(question)
def process(question):
    """Rewrite an "On what ..." question as ``"... on @placeholder ( ... ) ."``.

    The first two tokens are removed, then the tokens up to the first verb
    are collected as the skipped phrase and re-emitted in parentheses after
    the placeholder. The leading verb of the remainder is then handled:

    * "do" / "does": removed.
    * "did": removed, and the main verb is put into the past tense using
      ``SLoader.get_past_tense_list()``.
    * lemma "be" or any other leading verb: moved right before the main
      verb, or to the end of the sentence when no main verb is found.

    :param question: spaCy ``Span`` holding at least four tokens.
    :return: the rewritten sentence, or ``"@placeholder"`` when three or
        fewer tokens remain after stripping trailing punctuation.
    """
    assert (isinstance(question, spacy.tokens.span.Span))
    assert (len(question) >= 4)
    # Drop trailing punctuation (e.g. the final "?").
    while len(question) >= 1 and question[-1].is_punct:
        question = question[:-1]
    if len(question) <= 3:
        return "@placeholder"
    on = question[0]
    head = on.head
    question = question[2:]
    skipped = None
    if head != on:
        # On what magazine was she the cover model?
        skipped = []
        # Consume tokens governed by the head of the first token, stopping at
        # the first verb.
        while len(question) >= 1:
            if question[0].pos_ == "VERB":
                break
            if head.is_ancestor(question[0]) or head == question[0]:
                skipped.append(question[0].text)
                question = question[1:]
            else:
                break
    else:
        # Stop at the first verb.
        # On what devices can video games be used?
        # On what was the Philip Glass opera based?
        # On what occasions are š and ž replaced with sh and zh?
        # On what film was videoconferencing widely used?
        # On what was the mitrailleuse mounted?
        # On what do plants depend in their environment?
        index = 0
        for token in question:
            if token.pos_ == "VERB":
                break
            index += 1
        if index < len(question):
            skipped = [str(x) for x in question[0:index]]
            question = question[index:]
        else:
            # No verb found.
            # Does not happen in the entire SQuAD.
            skipped = []
    assert (isinstance(skipped, list))
    if len(question) == 0:
        # Does not happen in the entire SQuAD.
        question = [str(x) for x in question]
        question = ["On", "@placeholder"] + question
        question.append(".")
        question = ' '.join(question)
        return question
    to_remove = set()
    insert_before = {}
    to_past = set()
    insert_at_end = []  # In this order.
    verb = question[0]
    if verb.pos_ == "VERB":
        # 99.99% of the cases.
        if verb.lower_ in ["do", "does"]:
            # Just remove.
            to_remove.add(verb)
        elif verb.lower_ == "did":
            to_remove.add(verb)
            if verb.head != verb and verb.head.pos_ == "VERB":
                # Normal case.
                to_past.add(verb.head)
            else:
                # 99% a spaCy tagging error.
                # Search for the first verb.
                main_verb = None
                for token in question[1:]:
                    if token.pos_ == "VERB":
                        # Very unlikely since it seems like
                        # a spaCy tagging problem.
                        main_verb = token
                        break
                    if token.lemma_ in SLoader.get_verb_list() and (
                            token in verb.children or token == verb.head):
                        # Some verbs are tagged as nouns.
                        # On what did a rescue helicopter crash with no "
                        # survivors?
                        main_verb = token
                        break
                if main_verb is not None:
                    to_past.add(main_verb)
                else:
                    # Do nothing.
                    pass
        elif verb.lemma_ == "be":
            # On what date were the Belavezha Accords signed?
            # On what year was the USSR dissolved?
            to_remove.add(verb)
            if verb.head != verb and verb.head.pos_ == "VERB":
                insert_before[verb.head] = verb
            else:
                # Search for the first verb.
                main_verb = None
                for token in question[1:]:
                    if _is_past_participle_verb(token):
                        main_verb = token
                        break
                    if token.pos_ == "VERB":
                        # Another verb entry. <VERB1> <VERB2>
                        break
                if main_verb is not None:
                    # On what day and month was Spectre released to the
                    # Chinese market released?
                    # On what date is Twilight Princess HD scheduled
                    # for Australian release?
                    insert_before[main_verb] = verb
                else:
                    # No main verb linked to "be" was found.
                    # On what magazine was she the cover model?
                    # On what day was the funeral of Donda West?
                    # On what season was Kristy Lee Cook a contestant on
                    # American Idol?
                    insert_at_end.append(verb)
        else:
            # On what part of newer iPods can you find the buttons?
            # On what devices can video games be used?
            # On what day would most of the games televised on the ESPN
            # networks be played?
            # On what day would AFL games be shown on NFL Network?
            # Search for the first verb.
            to_remove.add(verb)
            main_verb = None
            for token in question[1:]:
                if token.pos_ == "VERB":
                    main_verb = token
                    break
            if main_verb is not None:
                insert_before[main_verb] = verb
            else:
                insert_at_end.append(verb)
    # Emit the remaining tokens, applying the moves, removals and tense
    # changes recorded above.
    out = []
    for token in question:
        if token in insert_before:
            out.append(insert_before[token].text)
        if token in to_remove:
            continue
        if token in to_past:
            past_tense = SLoader.get_past_tense_list()
            out.append(past_tense.get(token.lower_, token.text))
        else:
            out.append(token.text)
    for token in insert_at_end:
        out.append(token.text)
    question = out + ["on", "@placeholder"]
    if len(skipped) >= 1:
        question.append("(")
        question.append(' '.join(skipped))
        question.append(")")
    question.append(".")
    if len(question) >= 1 and not question[0].isupper():
        question[0] = question[0].capitalize()
    return ' '.join(question)
def split_in_sentences(text):
    """Split ``text`` into sentences with the full spaCy pipeline.

    :param text: the string to segment.
    :return: list holding the items of the parsed document's ``sents``.
    """
    assert isinstance(text, str)
    parsed = SLoader.get_full_spacy_nlp()(text)
    return [sentence for sentence in parsed.sents]