def test_tenses(self):
    """Assert that en.tenses() recognizes tense/person/number of verb forms."""
    # Tense of "am": first person singular present, both as the explicit
    # constant and as its "1sg" string alias.
    self.assertTrue(en.PRESENT_1ST_PERSON_SINGULAR in en.tenses("am"))
    self.assertTrue("1sg" in en.tenses("am"))
    self.assertTrue("1sg" in en.tenses("will"))
    # Trailing "-" in the alias marks a negated form.
    self.assertTrue("2sg-" in en.tenses("won't"))
    # BUG FIX: "g" is not a valid tense alias; the sibling test for the same
    # word asserts the present-participle alias "part".
    self.assertTrue("part" in en.tenses("imaginarifying"))
    # BUG FIX: Python 2 print statement -> Python 3 print() call.
    print("pattern.en.tenses()")
def tenseses(self, strs):
    """Return a coarse tense label for *strs*: 'PRESENT', 'PAST' or 'FUTURE'.

    pattern.en.tenses() returns a list of (tense, person, number, mood,
    aspect) tuples; we stringify it once and look for the tense keyword.
    """
    tense_info = str(tenses(strs))
    if 'present' in tense_info:
        return 'PRESENT'
    # BUG FIX: the original tested `'past' in tenses(strs)`, i.e. string
    # membership against a list of tuples, which is always False and made
    # every non-present verb fall through to 'FUTURE'.
    if 'past' in tense_info:
        return 'PAST'
    return 'FUTURE'
def test_tenses(self):
    """Assert that en.tenses() recognizes tense/person/number of verb forms."""
    # Tense of "am": explicit (tense, person, number) tuple and "1sg" alias.
    self.assertTrue((en.PRESENT, 1, en.SINGULAR) in en.tenses("am"))
    self.assertTrue("1sg" in en.tenses("am"))
    self.assertTrue("1sg" in en.tenses("will"))
    # Trailing "-" in the alias marks a negated form.
    self.assertTrue("2sg-" in en.tenses("won't"))
    # Unknown-but-regular verbs are analyzed morphologically:
    # "imaginarifying" is a present participle ("part" alias).
    self.assertTrue("part" in en.tenses("imaginarifying"))
    # BUG FIX: Python 2 print statement -> Python 3 print() call.
    print("pattern.en.tenses()")
def mangle_agreement(correct_sentence):
    """Return variants of *correct_sentence* containing a subject-verb
    agreement error.

    Every verb token (tag VB*) is swapped, one at a time, for each other
    inflection in its lexeme.  Two kinds of swaps are skipped because they
    would NOT create an error: forms with the exact same tense analysis,
    and the "n't"-contracted negation of the original form.

    Caveats (per the original author's notes): a change of verb form is
    assumed to always be wrong, which is not strictly true for sentences
    without a tense-fixing context; downstream models are expected to use
    confidence thresholds to absorb that noise.
    """
    doc = nlp(correct_sentence)
    mangled = []
    for idx, token in enumerate(doc):
        if not token.tag_.startswith('VB'):
            continue
        original = doc[idx].text
        for candidate in lexeme(doc[idx].text):
            if candidate == original:
                continue  # same surface form, skip it
            same_tense = tenses(candidate) == tenses(token.text)
            is_negation = (candidate.startswith(token.text)
                           and candidate.endswith("n't"))
            if same_tense or is_negation:
                continue  # would not introduce an agreement error
            sentence = str(doc[:idx]) + " {} ".format(candidate) + str(
                doc[idx + 1:])
            # str(span) concatenation inserts a space before commas; undo it.
            mangled.append(sentence.replace(' ,', ','))
    return mangled
# Builds per-word substitution candidates for *sentence*.
# NOTE(review): source was flattened onto one line; code kept byte-identical.
# - _Stem() appears to return target words plus NER-derived person/org
#   strings -- assumes person_taggers/org_taggers are single strings, not
#   lists; TODO confirm against _Stem().
# - Words matching the person tag get pronoun alternatives ["He", "She"];
#   org tags get ["They"] or ["It"] depending on plurality.
# - Remaining target words get top-N candidates re-ranked by kenlm_topn();
#   when tenses(word) yields an analysis, every candidate is conjugated to
#   that tense (3rd person) via pattern.en.conjugate.
def _interface(sentence,edblist): target_words, word_pre, person_taggers, org_taggers = _Stem(sentence, edblist) token_list =[] #import pdb; pdb.set_trace() #print "word_pre:", word_pre if len(word_pre) > 0: word_pre[0] = word_pre[0][0].upper() + word_pre[0][1:] #import pdb; pdb.set_trace() for word in word_pre: #import pdb; pdb.set_trace() tokens = {} #if word == "He": # is a person, subject? # tokens[word] = ["He", "She"] if word.strip().lower() == person_taggers.strip().lower(): tokens[word] = [word, "He", "She"] #tokens[word] = [ "They"] elif word.strip().lower() == org_taggers.strip().lower(): if _isplural(org_taggers.strip().split()[-1]) or (org_taggers.strip().split()[-1] == 'they'): tokens[word] = [word, "They"] else: tokens[word] = [word, "It"] #tokens[word] = [ "It"] # pass else: if lmtzr.lemmatize(word) not in target_words: token_list.append(word) else: r_sent = [] candidates = Generate_candidates_topN(word,sentence,19,edblist) for i in range(len(candidates)): r_sent.append(candidates[i] + "@" + sentence.replace(word,candidates[i])) sub_top10 = kenlm_topn(r_sent,9,sentence) if lmtzr.lemmatize(word) not in sub_top10: sub_top10.insert(0,word) if len(tenses(word)) > 0: _sub_top10 = [] for w in sub_top10: _sub_top10.append(conjugate(w, tenses(word)[0][0], 3)) tokens[word] = _sub_top10 else: tokens[word] = sub_top10 if tokens: token_list.append(tokens) return token_list
def _transform_word(self, word, pos, less, more):
    """transforms a word to be less less and more more

    :param word: word to transform
    :type word: str
    :param pos: part of speech of the word
    :type pos: str
    :param less: list of 'less' words
    :type less: list
    :param more: list of 'more' words
    :type more: list
    :returns: transformed word
    :rtype: str
    """
    new_word = self._get_similar_word(word, less, more)
    new_pos = en.tag(new_word)[0][1]
    # Reject replacements that change the coarse POS, or that are no-ops.
    if (pos[:2] != new_pos[:2]) or word == new_word:
        return word
    # handle noun
    if pos.startswith('NN'):
        # pluralization: make the replacement match the original's number
        if pos.endswith('S') and not new_pos.endswith('S'):
            new_word = en.pluralize(new_word)
        elif not pos.endswith('S') and new_pos.endswith('S'):
            new_word = en.singularize(new_word)
        # capitalization: preserve the original word's case
        if word[0].isupper():
            new_word = new_word[0].upper() + new_word[1:]
        else:
            new_word = new_word.lower()
    # handle verb
    elif pos.startswith('VB'):
        word_tenses = en.tenses(word)
        # BUG FIX: en.tenses() returns [] for forms it cannot analyze; the
        # original unconditionally indexed [0] and raised IndexError.  When
        # no analysis exists, keep the replacement unconjugated.
        if word_tenses:
            tense, person, number = word_tenses[0][:3]
            # conjugation
            conjugated = en.conjugate(new_word,
                                      tense=tense,
                                      person=person,
                                      number=number,
                                      parse=False)
            if conjugated is not None:
                new_word = conjugated
    # remove underscores for joint words
    new_word = new_word.replace('_', ' ')
    return new_word
def run_postprocessing(s, rules, all_args):
    """Apply comma-separated post-processing *rules* to string *s*.

    Supported rules: 'lower' (lowercase), 'tense-<i>' (conjugate s to the
    tense of the i-th argument's head verb, preferring tenses in the order
    given by PATTERN_TENSES), or any key of POS_TO_PATTERN (conjugate to a
    fixed form).
    """
    rule_list = rules.split(',')
    for rule in rule_list:
        if rule == 'lower':
            s = s.lower()
        elif rule.startswith('tense-'):
            ind = int(rule[6:])
            orig_vb = all_args[ind]
            if " " in orig_vb:
                # BUG FIX: the original computed orig_vb.split()[0] but
                # discarded the result; the head verb of the phrase was
                # never actually extracted.
                orig_vb = orig_vb.split()[0]
            found_tenses = patten.tenses(orig_vb)
            for tense in PATTERN_TENSES:  # Prioritize by PATTERN_TENSES
                if tense in found_tenses:
                    break
            else:  # Default to first tense
                tense = PATTERN_TENSES[0]
            if " " in s:
                # Conjugate only the head verb of a multi-word phrase.
                s_verb = s.split()[0]
                s_verb_conj = patten.conjugate(s_verb, tense)
                s = " ".join([s_verb_conj] + s.split()[1:])
            else:
                s = patten.conjugate(s, tense)
        elif rule in POS_TO_PATTERN:
            s = patten.conjugate(s, POS_TO_PATTERN[rule])
    return s
def verb_fom(word: str) -> str:
    """Swap a verb between present and past tense.

    Returns *word* unchanged when pattern.en cannot analyze it, when it has
    neither a present nor a past reading, or when conjugation fails.
    """
    # Step 1: check if the word is in present tense or past.
    tense_list = tenses(word)
    if tense_list is None or len(tense_list) == 0:
        return word
    present_tense = True
    final_tense = None
    for info in tense_list:
        if info[0] == 'present':
            present_tense = True
            final_tense = info
            break
        if info[0] == 'past':
            present_tense = False
            final_tense = info
            break
    # BUG FIX: the original left final_tense unbound (NameError) when the
    # analysis contained neither a present nor a past reading (e.g. only
    # 'infinitive' or 'future' tuples).
    if final_tense is None:
        return word
    tense_string = "past" if present_tense else "present"
    # Step 2: conjugate into the opposite tense, keeping person and number.
    new_word = conjugate(word, tense=tense_string, person=final_tense[1],
                         number=final_tense[2], negated=False)
    # Step 3: return the new word, falling back to the input on failure.
    return new_word if new_word is not None else word
# Builds a synonym-frequency thesaurus from a corpus file: nouns are matched
# in number (plural vs. singular) and verbs are conjugated to the word's
# first tense analysis before counting.
# NOTE(review): flattened source kept byte-identical.
# - word.decode("ascii") implies Python 2 byte strings; under Python 3 this
#   would raise AttributeError -- TODO confirm target interpreter.
# - `parse`, `wn`, `config`, `_is_title`, `_add_mappings` come from the
#   enclosing module.
def make_thesaurus(file_path): """ Returns dict of counters 'thesaurus', where thesaurus[word] = { synonym1: 4, syn2: 8, syn3: 1, ... } """ thesaurus = defaultdict(lambda: Counter()) with open(file_path, "r") as f: for line in f: # Ignore repeated book title headers if _is_title(line): continue parsed = parse(line) for tagged_word in parsed.split()[0]: word = tagged_word[0].strip().lower() pos = tagged_word[1][0] # get pos for word # Reject non-ASCII characters try: word = word.decode("ascii") except (UnicodeDecodeError, UnicodeEncodeError): continue # Reject whitespace character if re.match("^[\s]*$", word): continue # Increment word count of word w thesaurus[word].update([word]) # Retrieve syn = synonym[w], add to thesaurus[syn] for syn in wn.get_synonyms(word): syn = syn.name().split(".")[0] # if noun, add plural form if word is plural, else add singular if pos == "N": if word == pluralize(word): thesaurus[pluralize(syn)].update([word]) else: thesaurus[syn].update([word]) # if verb, conjugate synonyms to the right form before adding them to thes elif pos == "V": word_tenses = tenses(word) if word_tenses: thesaurus[conjugate(syn, tense=word_tenses[0][0])].update([word]) else: thesaurus[syn].update([word]) else: thesaurus[syn].update([word]) # Update thesaurus with mappings, if map_file exists file_path = file_path.replace(config.CORPUS_FOLDER, config.MAPPING_FOLDER) map_file = file_path.replace(config.CORP_TAG, config.MAP_TAG) thesaurus = _add_mappings(map_file, thesaurus) return thesaurus
def find_verb_form(original_form, original_lemma, new_lemma):
    """
    Figure out original tense of the verb, then apply that tense to new_lemma
    There might be more than one, let's keep it simple and just apply the
    first one.

    Note: original_lemma is unused but kept for interface compatibility.
    """
    possible_conjugations = tenses(original_form)
    # BUG FIX: the original indexed [1] when more than one analysis existed,
    # contradicting the documented "apply the first one" intent, and raised
    # IndexError when tenses() returned an empty list.
    if not possible_conjugations:
        return new_lemma
    return conjugate(new_lemma, possible_conjugations[0])
def get_verb_reduction(verb, tag):
    """Given string of existing verb, returns its corresponding reduction

    That's the verb itself if its lemma is in the top100, else its hash.
    """
    base = lemma(verb.lower())
    # Well-known verbs (grammar literals or top-100 list) stay verbatim.
    if base in literals.verbs or base in top100.verbs:
        return verb.upper()
    # Otherwise reduce to "<TAG>_<sha256 of the verb's tense analysis>".
    digest = sha256(str(tenses(verb)).encode('utf_8')).hexdigest()
    return tag + '_' + digest
# Builds a synset-keyed thesaurus using Lesk word-sense disambiguation over a
# sliding window of 2*WINDOW tokens.  Verbs are only counted in present-tense
# forms; nouns are normalized to the synset head's number.
# NOTE(review): flattened source kept byte-identical.
# - word.decode("ascii") implies Python 2 byte strings -- TODO confirm.
# - Membership tests like "inf" in word_tenses rely on pattern.en's Tenses
#   list supporting alias lookup.
def make_thesaurus_lesk(file_path): """ Returns dict of counters 'thesaurus', where thesaurus[synset] = { word1: 4, word2: 8, word3: 1, ... } """ thesaurus = defaultdict(lambda: Counter()) with open(file_path, "r") as f: f = f.read().split() for i, word_and_tag in enumerate(f): word, tag = word_and_tag.rsplit("_", 1) # Reject non-ASCII characters try: word = word.decode("ascii") except (UnicodeDecodeError, UnicodeEncodeError): continue # look at a window of 9 words each time lesk is called window = [i - WINDOW, i + WINDOW] if i < WINDOW: window = [i, i + 2 * WINDOW] elif i >= len(f) - WINDOW: window = [i - 2 * WINDOW, i] synset = lesk.my_lesk(f[window[0] : window[1]], word) # if lesk can decide on a meaning for that word, add # that meaning, i.e., that synset, to thesaurus if not synset: continue # if word is verb, only add present tense to thesaurus if tag[0] == "V": word_tenses = tenses(word.lower()) if "inf" in word_tenses or "1sg" in word_tenses or "2sg" in word_tenses or "3sg" in word_tenses: thesaurus[str(synset)].update([word.lower()]) elif tag[0] == "N": synset_name = synset.name().split(".")[0] if synset_name == pluralize(synset_name): thesaurus[str(synset)].update([pluralize(word.lower())]) else: thesaurus[str(synset)].update([singularize(word.lower())]) else: thesaurus[str(synset)].update([word.lower()]) # Update thesaurus with mappings, if map_file exists file_path = file_path.replace(config.CORPUS_FOLDER, config.MAPPING_FOLDER) map_file = file_path.replace(config.CORP_TAG, config.MAP_TAG) thesaurus = _add_mappings(map_file, thesaurus) return thesaurus
def Bullet_Replace(old_word, new_word, bullet, POS_tag):
    '''
    Replace old_word in bullet with new_word, using POS_tag to make the
    inflection of the replacement match the original word's form.
    '''
    if POS_tag == 'VBD':
        # Past tense: 3rd person singular past vs. plural past.
        alias = '3sgp' if '3sgp' in tenses(old_word) else 'ppl'
        new_word = conjugate(new_word, alias)
    elif POS_tag == 'VBG':
        new_word = conjugate(new_word, 'part')   # gerund/present participle
    elif POS_tag == 'VBN':
        new_word = conjugate(new_word, 'ppart')  # past participle
    elif POS_tag == 'VBP':
        # Present non-3rd-person: 1st vs. 2nd person singular.
        alias = '1sg' if '1sg' in tenses(old_word) else '2sg'
        new_word = conjugate(new_word, alias)
    elif POS_tag == 'VBZ':
        alias = '3sg' if '3sg' in tenses(old_word) else 'pl'
        new_word = conjugate(new_word, alias)
    elif POS_tag in ['NNS', 'NNPS']:
        # Plural nouns: pluralize the replacement.
        new_word = pluralize(new_word)
    # Preserve leading capitalization of the word being replaced.
    if old_word[0] != old_word[0].lower():
        new_word = new_word[0].upper() + new_word[1:]
    return (bullet.replace(old_word, new_word))
# Filters candidate conjugations for a distractor exercise: drops negated
# forms, removes forms whose tense clashes with the verb's POS tag, then
# returns 3 random distractors plus the original token.
# NOTE(review): flattened source kept byte-identical.
# - list.remove() raises ValueError when omitir_contraccion(verbo['token'])
#   is not among the remaining candidates -- TODO confirm the caller
#   guarantees membership.
# - `PAST not in tenses(x)` relies on pattern.en's Tenses alias matching.
def filtrar_conjugaciones(verbo, conjugaciones): conjugaciones = [x for x in conjugaciones if "n't" not in x and "not" not in x] if len(conjugaciones) <= 4: return conjugaciones else: if verbo['pos_tag'] in POS_TAGS_PRESENTE: conjugaciones = [x for x in conjugaciones if PAST not in tenses(x)] elif verbo['pos_tag'] in POS_TAGS_PASADO: conjugaciones = [x for x in conjugaciones if is_not_present(x)] if len(conjugaciones) >= 4: conjugaciones.remove(omitir_contraccion(verbo['token'])) conjugaciones = random.sample(conjugaciones, 3) conjugaciones.append(verbo['token']) return conjugaciones
def get_questions(self):
    """Generate a Who/When/Where or auxiliary-fronted question for the
    sentence held by this object (subject NP + verb phrase)."""
    z = self.getText()
    (subj, vp) = (z['NP'][0], z['VP'][0])
    from pattern.en import lexeme, lemma, tenses
    import nltk, re
    tagged = nltk.pos_tag(nltk.word_tokenize(subj + " " + vp))
    verb = ""
    sense = supersense(subj)
    # Person subjects -> "Who ...?"
    if (sense[0][2][-6:] == 'person' or sense[0][1] == 'PRP'):
        return ("Who " + vp + "?")
    # Time expressions or 4-digit years -> "When ...?"
    elif (sense[0][2][-4:] == 'time' or re.match(r"[1|2]\d\d\d", subj)):
        return ("When " + vp + "?")
    # Locations introduced by a locative preposition -> "Where ...?"
    # BUG FIX: the original tested `... .lower in [...]`, comparing the bound
    # method object itself (always False, making this branch dead code);
    # it must be called: .lower().
    elif (sense[0][2][-8:] == 'location' and
          ('PP' in z and
           z['PP'].split()[0].lower() in ["on", "in", "at", "over", "to"])):
        return ("Where " + vp + "?")
    aux = ["Will", "Shall", "May", "Might", "Can", "Could", "Must",
           "Should", "Would", "Do", "Does", "Did"]
    for i in reversed(tagged):
        if (i[1][0] == 'V'):
            verb = i[0]
            # Forms of "to be" front directly ("Is he ...?").
            if ((u'' + verb) in lexeme("is")):
                return (verb.capitalize() + " " + subj.lower() +
                        vp[len(verb):] + "?")
            else:
                # Otherwise pick an auxiliary whose tense matches the verb.
                # NOTE(review): tenses(verb)[0] raises IndexError for forms
                # pattern cannot analyze -- unchanged from the original.
                for x in aux:
                    if (tenses(x)[0] == tenses(verb)[0]):
                        return (x + " " + subj.lower() + " " + lemma(verb) +
                                vp[len(verb):] + "?")
def do_q(self, parse_by_structure, verb_index, np_index):
    """Build a do-support question from a tokenized sentence.

    Mutates *parse_by_structure* in place (verb reduced to bare present,
    auxiliary inserted, final token replaced by '?') and returns the joined
    question string.
    """
    verb = parse_by_structure[verb_index]
    # First analysis: (tense, person, number, mood, aspect).
    tense, person = tenses(verb)[0][:2]
    # Reduce the main verb to its bare present form.
    parse_by_structure[verb_index] = str(
        conjugate(verb, tense="present", person=1))
    # Choose the agreeing auxiliary.
    if tense == 'past':
        auxiliary = "did"
    elif tense == 'present' and person == 3:
        auxiliary = "does"
    else:
        auxiliary = "do"
    parse_by_structure.insert(np_index, auxiliary)
    parse_by_structure[-1] = "?"
    return " ".join(parse_by_structure)
def transform_word(word, pos, word_original):
    """Inflect the first token of *word* to match POS tag *pos*.

    Comparatives/superlatives/plurals are only applied when re-tagging shows
    the token is not already in that form.  Verb tags are conjugated with
    pattern.en; a KeyError from any inflector leaves the token unchanged.
    *word_original* decides 1st vs. 2nd person for VBP.
    """

    def _inflect(token):
        # Adjective/adverb comparative.
        if pos in ('JJR', 'RBR'):
            if nltk.pos_tag([token])[0][1] in ('JJR', 'RBR'):
                return token
            return comparative(token)
        # Adjective/adverb superlative.
        if pos in ('JJS', 'RBS'):
            if nltk.pos_tag([token])[0][1] in ('JJS', 'RBS'):
                return token
            return superlative(token)
        # Plural nouns.
        if pos in ('NNS', 'NNPS'):
            if nltk.pos_tag([token])[0][1] in ('NNS', 'NNPS'):
                return token
            return pluralize(token)
        # Verb forms.
        if pos == 'VBD':
            return conjugate(token, 'p')
        if pos == 'VBG':
            return conjugate(token, 'part')
        if pos == 'VBN':
            return conjugate(token, 'ppart')
        if pos == 'VBP':
            if (PRESENT, 1, SG) in tenses(word_original):
                return conjugate(token, '1sg')
            return conjugate(token, '2sg')
        if pos == 'VBZ':
            return conjugate(token, '3sg')
        return token

    result = list()
    for i, token in enumerate(word.split(' ')):
        if i == 0:
            try:
                result.append(_inflect(token))
            except KeyError:
                result.append(token)
        else:
            # Only the first token of a multi-word expression is inflected.
            result.append(token)
    return ' '.join(result)
def match_all_inflections(source_word, target_words, pos):
    """Inflect *target_words* to match every analysis of *source_word*.

    Verbs: conjugate each target into every (tense, person, number, mood,
    aspect) reading of the source; returns a set.  Nouns: return both the
    singular and plural of each target.  Adjectives/adverbs: unchanged.
    """
    if pos == wn.VERB:
        matched = set()
        for tense, person, number, mood, aspect in tenses(source_word):
            matched.update(
                conjugate(w, tense=tense, person=person, number=number,
                          mood=mood, aspect=aspect)
                for w in target_words)
        return matched
    if pos == wn.NOUN:
        singulars = [singularize(w) for w in target_words]
        plurals = [pluralize(w) for w in target_words]
        return singulars + plurals
    # pos == "ADJ" or pos == "ADV": no inflection needed.
    return target_words
def change_tense(token):
    """Return a conjugation of *token* in the same tense but a different
    person (first of persons 1-3 that yields a new form).

    Falls back to the input unchanged when pattern.en has no analysis for
    it, or when every person's conjugation equals the input.
    """
    analyses = tenses(token)
    if not analyses:
        return token  # unknown to the conjugator; leave untouched
    current_tense = analyses[0][0]
    fallback = []
    for person in (1, 2, 3):
        candidate = conjugate(token, tense=current_tense, person=person)
        if candidate != token:
            return candidate
        fallback.append(candidate)
    # Every person agreed with the input; return the first form.
    return fallback[0]
# Conjugates a verb phrase to agree with a pronoun subject 'i' or 'you';
# any other subject returns the phrase untouched.
# NOTE(review): kept byte-identical.  The tense filters look swapped: for
# np == ['i'] it keeps analyses containing 2 yet conjugates with person=1,
# and vice versa for 'you'; also `2 in a` matches ANY tuple field (person
# or number), and the trailing [0] raises IndexError when no analysis
# matches -- TODO confirm intended behavior before changing.
def fix_vp(np, vp): verb = detokenizer.detokenize(vp) tnss = tenses(verb) if np == ['i']: tns = [a for a in tnss if 2 in a][0] return [ conjugate(verb, tense=tns[0], person=1, number=tns[2], mood=tns[3], aspect=tns[4]) ] if np == ['you']: tns = [a for a in tnss if 1 in a][0] return [ conjugate(verb, tense=tns[0], person=2, number=tns[2], mood=tns[3], aspect=tns[4]) ] return vp
# Demo of pattern.en inflection helpers: articles, pluralization, verb
# conjugation, quantification, n-grams and parsing.
# BUG FIX: Python 2 print statements -> Python 3 print() calls.
print(article('hour'))
print(referenced('university'))
print(referenced('hour'))

# singularity
print(pluralize('child'))
print(singularize('wolves'))
print()

print(lexeme('run'))
print(lemma('running'))
print(conjugate('purred', '3sg'))
print(PAST in tenses('purred'))  # 'p' in tenses() also works.
print((PAST, 1, PL) in tenses('purred'))

print('Quantification')
print(quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken']))
print(quantify('carrot', amount=90))
print(quantify({'carrot': 100, 'parrot': 20}))

print('ngrams')
print(ngrams("I am eating a pizza.", n=2))

# parse
s = parse('I eat pizza with a fork.')
pprint(s)
# Iterates the MNLI training set and, for hypotheses parsed as a simple
# NP + VP sentence with a transitive verb, writes subject/object-inverted
# and passivized hypothesis variants to four TSV files.
# NOTE(review): flattened source kept byte-identical.
# - `if vp_head[0][-2:]:` is truthy for any verb of length >= 1; it looks
#   like a dropped comparison (perhaps `== 'ed'`) -- TODO confirm intent.
#   It is only reached when en.tenses() returns no analysis for the verb.
# - Tense is otherwise taken from the first en.tenses() analysis.
def loop(self, debug=False): w_inv_orig = self.tsv('inv_orig.tsv') w_inv_trsf = self.tsv('inv_trsf.tsv') w_pass_orig = self.tsv('pass_orig.tsv') w_pass_trsf = self.tsv('pass_trsf.tsv') self.lines = open(mnli_train).readlines() already_seen = set() self.dicts = [] n = 0 for i, line in enumerate(self.lines): j = json.loads(line) self.dicts.append(j) if i % 10000 == 0: print('%d out of %d' % (i, len(self.lines))) if debug and i == 10000: break if j['genre'] == 'telephone': continue tree = j['hyptree'] = nltk.tree.Tree.fromstring(j['sentence2_parse']) ss = [x for x in tree.subtrees() if x.label() == 'S'] for s in ss[:1]: if len(s) < 2: # Not a full NP + VP sentence continue subj_head = self.get_np_head(s[0]) if subj_head is None: continue subject_number = self.get_np_number(s[0]) k = 1 while (s[k].label() not in (u'VP', u'SBAR', u'ADJP')) and (k < len(s) - 1): k+=1 if k == len(s) - 1: continue #iterate through top level branches to find VP vp_head = self.get_vp_head(s[k]) if vp_head[0] is None: continue subj = ' '.join(s[0].flatten()) arguments = tuple(x.label() for x in s[1][1:]) if (arguments != ('NP',) or en.lemma(vp_head[0]) in ['be', 'have']): continue direct_object = ' '.join(s[1][1].flatten()) object_number = self.get_np_number(s[1][1]) if object_number is None: # Personal pronoun, very complex NP, or parse error continue lookup = en.tenses(vp_head[0]) if len(lookup) == 0: if vp_head[0][-2:]: tense = en.PAST else: tense = en.PRESENT else: if en.tenses(vp_head[0])[0][0] == u'past': tense = en.PAST else: tense = en.PRESENT subjobj_rev_hyp = ' '.join([ upper_first(direct_object), #keep tense en.conjugate(vp_head[0], number=object_number, tense = tense), lower_first(subj)]) + '.' passive_hyp_same_meaning = ' '.join([ upper_first(direct_object), self.passivize_vp(s[k], object_number), lower_first(subj)]) + '.' passive_hyp_inverted = ' '.join([ subj, self.passivize_vp(s[k], subject_number), direct_object]) + '.' 
if j['gold_label'] == 'entailment': self.mnli_row(w_inv_orig, 1000000 + n, j['sentence1'], subjobj_rev_hyp, 'neutral') self.mnli_row(w_inv_trsf, 1000000 + n, j['sentence2'], subjobj_rev_hyp, 'neutral') self.mnli_row(w_pass_orig, 1000000 + n, j['sentence1'], passive_hyp_same_meaning, j['gold_label']) self.mnli_row(w_pass_trsf, 1000000 + n, j['sentence2'], passive_hyp_inverted, 'neutral') self.mnli_row(w_pass_trsf, 2000000 + n, j['sentence2'], passive_hyp_same_meaning, 'entailment') n += 1
# Demo of pattern.en inflection helpers: articles, pluralization, verb
# conjugation, quantification, n-grams and parsing.
# BUG FIX: Python 2 print statements -> Python 3 print() calls.
print(article('university'))
print(article('hour'))
print(referenced('university'))
print(referenced('hour'))

# singularity
print(pluralize('child'))
print(singularize('wolves'))
print()

print(lexeme('run'))
print(lemma('running'))
print(conjugate('purred', '3sg'))
print(PAST in tenses('purred'))  # 'p' in tenses() also works.
print((PAST, 1, PL) in tenses('purred'))

print('Quantification')
print(quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken']))
print(quantify('carrot', amount=90))
print(quantify({'carrot': 100, 'parrot': 20}))

print('ngrams')
print(ngrams("I am eating a pizza.", n=2))

# parse
s = parse('I eat pizza with a fork.')
pprint(s)
def test_tenses(self):
    """Assert tense of "am" (constant tuple and its "1sg" alias)."""
    self.assertTrue(en.PRESENT_1ST_PERSON_SINGULAR in en.tenses("am"))
    self.assertTrue("1sg" in en.tenses("am"))
    # BUG FIX: Python 2 print statement -> Python 3 print() call.
    print("pattern.en.tenses()")
# The comparative() and superlative() commands give the comparative/superlative
# form of an adjective. Words with three or more syllables are simply preceded
# by "more" or "most".
# BUG FIX: Python 2 print statements -> Python 3 print() calls.
for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]:
    print(word, "=>", comparative(word), "=>", superlative(word))

print()
print()

# VERB CONJUGATION
# ----------------
# The lexeme() command returns a list of all possible verb inflections.
# The lemma() command returns the base form (infinitive) of a verb.
print("lexeme:", lexeme("be"))
print("lemma:", lemma("was"))

# The conjugate() command inflects a verb to another tense.
# The tense can be given as a constant, e.g.
# INFINITIVE, PRESENT_1ST_PERSON_SINGULAR PRESENT_PLURAL, PAST_PARTICIPLE, ...
# or as an abbreviated alias: inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp,
# 3sgp, ppl, ppart.
print(conjugate("being", tense="1sg", negated=False))
# Prefer the full constants for code that will be reused/shared.

# The tenses() command returns a list of all tenses for the given verb form.
# For example: tenses("are") => ['present 2nd person singular', 'present plural']
# You can then check if a tense constant is in the list.
# This will also work with aliases, even though they are not explicitly in
# the list.
from pattern.en import PRESENT_PLURAL
print(tenses("are"))
print(PRESENT_PLURAL in tenses("are"))
print("pl" in tenses("are"))
def is_not_present(conjugacion):
    """True when *conjugacion* has no present-tense reading for any of
    persons 1-3 (per pattern.en's tense analysis)."""
    return not any((PRESENT, persona) in tenses(conjugacion)
                   for persona in (1, 2, 3))
# Conjugates w1 into (an analysis of) the tense of w2.
# NOTE(review): kept byte-identical.  `count(...)` presumably returns
# countable items derived from w2's tense labels, and the sort key
# itemgetter(2) expects elements with at least 3 fields -- verify against
# the `count` helper actually imported in this module before relying on it.
def makeSameTense(self, w1, w2): tense = count([i[0] for i in tenses(w2)], stopwords=True) tense = sorted(tense, key=operator.itemgetter(2)) return verbs.conjugate(w1, tense[0])
# Flags passive voice (via a spaCy matcher), progressive-tense verb clauses
# (via pattern.en.tenses aspect column), and adverb usage in *text*, printing
# the findings.  The author notes many false positives.
# NOTE(review): flattened source kept byte-identical.  The stray
# "-- Stephen King" fragment is the tail of the adverbs quote in the comment
# preceding it; it is not executable and was left in place.
def process_text(text): """ Uses NLP to get passive voice sentences, sentences with adverbs, and progressive tense sentences. Works ok so far, but not great -- lots of false positives. """ doc = nlp(text) sents = list(doc.sents) print("Number of Sentences = ", len(sents)) # gives match_id, start, end matches = matcher(doc) df = pd.DataFrame(matches, columns=['id', 'start', 'end']) df.drop_duplicates(inplace=True) print(df.shape[0], 'passive sentences detected') # print out passive phrases for i, r in df.iterrows(): print(doc[r['start']:r['end']]) # progressive tense detection # get verb phrase from sentences and check if progressive verb_clause_pattern = r'<VERB>+<ADV>*<PART>*<VERB>*<PART>*' progressive_sentences, progressive_verb_clauses = [], [] adverb_sentences, adverbs = [], [] for s in sents: verb_clauses = list(textacy.extract.pos_regex_matches(s, verb_clause_pattern)) for v in verb_clauses: if len(v) > 1: # need to have some helper verbs to have a problem verb_tenses = pd.DataFrame(list(tenses(v.text))) if 'progressive' in verb_tenses.iloc[:, -1].tolist(): # last column progressive_sentences.append(s) progressive_verb_clauses.append(v) # adverb detection # "...the road to hell is paved with adverbs..." 
-- Stephen King pos = np.array([w.pos_ for w in s]) adverb_idxs = np.argwhere(pos == 'ADV').flatten() if len(adverb_idxs) != 0: adverb_sentences.append(s) adverbs.append([s[a] for a in adverb_idxs]) # print out sentences with adverbs and the list of adverbs print('\n\n\nADVERBS') print(len(adverb_sentences), 'sentences detected with adverbs\n') if len(adverb_sentences) > 0: for i, adv in enumerate(adverb_sentences): print(adv) print('has adverbs:', adverbs[i], '\n') # print out sentences with possible progressive verb clauses print('\n\n\nPROGESSIVE TENSES') print(len(progressive_sentences), 'sentences detected with progressive verb clauses\n') if len(progressive_sentences) > 0: for i, adv in enumerate(progressive_sentences): print(adv) print('has progressive verb clause(s):', progressive_verb_clauses[i], '\n')
# Splits a sentence containing a participial (acl) clause on its subject
# into two simpler sentences, using a Stanford-dependency node list.  The
# copula for the first sentence is produced by conjugating "was" to the
# tense of the root verb (3rd person) via pattern.en.
# NOTE(review): flattened source kept byte-identical; Python 2 print at the
# tail.  The duplicated `if (root in nd) and ('nsubj' ...)` test (first one
# is `pass`) is preserved as-is.
def simp_parti_sent(tokens, node_list): """ # the original tokens in the sent tokens = StanfordTokenizer().tokenize(sent) tokens.insert(0, '') result = list(eng_parser.raw_parse(sent))[0] root = result.root['word'] #w = result.tree() #print "parse_tree:", w #TODO: use the tree structure, check again node_list = [] # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]]) for node in result.nodes.items(): node_list.append(base.get_triples(node)) #node_list[base.get_triples[0]] = base.get_triples(node) """ #start_time = time.time() root = "" root_ind = node_list[0][4]['root'][0] for nd in node_list: if root_ind == nd[0]: root=nd[1] """ taggers = [] for nd in node_list[1:]: taggers.append((nd[1], nd[2])) """ strs = "" #split_ind = 0 for nd in node_list[1:]: #import pdb; pdb.set_trace() #print(nd) if (root in nd) and ('nsubj' in nd[4].keys()): pass if (root in nd) and ('nsubj' in nd[4].keys()): #print "conj: ", nd #print "conj node: ", nd[4]['conj'] #import pdb; pdb.set_trace() nsubj_ind = nd[4]['nsubj'][0] nsubj_dict = {} nsubj_compound_list = [] for _nd in node_list: #BUG if nsubj_ind == _nd[0]: nsubj_dict = _nd[4] if ('compound' in nsubj_dict.keys()): nsubj_compound_list = nsubj_dict['compound'] break #import pdb; pdb.set_trace() if ('acl' in nsubj_dict.keys()): #[NOTICE]: connect the nsubj + acl as 1st # And the 1st end in the PUNC #import pdb; pdb.set_trace() acl_ind = nsubj_dict['acl'][0] #[NOTICE]: end the 1st sentence at the 'punc' place after acl_ind # this assumation is wrong """ for punc in PUNCTUATION: if punc in tokens[acl_ind:]: split_ind = tokens[acl_ind:].index(punc) break """ #subj = tokens[nsubj_ind] #import pdb; pdb.set_trace() nsubj = "" for i in nsubj_compound_list: nsubj = nsubj + " " + tokens[i] nsubj = nsubj + " " + tokens[nsubj_ind] nsubj = nsubj[0].upper() + nsubj[1:] + " " #tokens.insert(1, upper_first_char(subj)) """ person_taggers = [] org_taggers = [] # replace the nsubj with "he/she" for token, title in taggers: if token in 
nsubj: if title == 'PERSON': person_taggers.append(token) elif title == 'ORGANIZATION': org_taggers.append(token) else: org_taggers.append(token) """ #import pdb; pdb.set_trace() #verb = "be" verb = conjugate("was", tenses(root)[0][0], 3) root_ind = tokens.index(root) advmod_ind = 0 for _nd in node_list[1:]: if acl_ind == _nd[0]: acl_dict = _nd[4] break if ('advmod' in acl_dict.keys()): advmod_ind = acl_dict['advmod'][0] if advmod_ind == 0: _str1 = tokens[acl_ind:root_ind] else: if advmod_ind > acl_ind: _str1 = tokens[acl_ind:root_ind] else: _str1 = tokens[advmod_ind:root_ind] if len(_str1) > 0 and _str1[-1] in PUNCTUATION: _str1[-1] = '' #str1 = base.upper_first_char(nsubj) + " " + verb + " " str1 = nsubj + " " + verb + " " str1 = str1 + ' '.join(_str1) #print "1st sent: ", str1 # upper the 1st char in 2nd sent #import pdb; pdb.set_trace() _strs = tokens[root_ind:] _str2 = " ".join(_strs) """ if len(person_taggers) > 0: str2 = "He" + " " + ' '.join(_str2) # 'he' will be replaced with 'he/she' elif len(org_taggers) > 0: if base.isplural(org_taggers[-1]): str2 = "They" + " " + ' '.join(_str2) else: str2 = "It" + " " + ' '.join(_str2) else: str2 = nsubj + ' '.join(_str2) """ nsubj = nsubj.strip() _nsubj = nsubj[0].upper() + nsubj[1:] if _nsubj == 'I' or _nsubj == 'He' or _nsubj == 'She': str2 = _nsubj + " " + _str2 else: #sent2 = _nsubj + " " + _str2 #nsubj2 = base.replace_nsubj(sent2, nsubj) #str2 = nsubj2 + _str2 str2 = _nsubj + " " + _str2 #w = _w + ' ' #str2 = base.upper_first_char(nsubj) + " " + ' '.join(_str2) #print "2nd sent: ", str2 strs = str1 + ' . ' + str2 #return strs #if ('acl' in nsubj_dict.keys()): #end_time = time.time() #during_time = end_time - start_time #print "The time of parti function: ", during_time return strs
# Splits a sentence containing an appositive (appos) or passive-auxiliary
# construction on its subject into two simpler sentences, using a Stanford
# dependency node list.  The copula for the appositive branch is produced by
# conjugating "was" to the root verb's tense (3rd person) via pattern.en.
# NOTE(review): flattened source kept byte-identical; Python 2 prints at the
# tails.  The `tokens[root_ind] > split_ind` comparisons compare a token
# string to an int index (py2-only ordering) -- TODO confirm on Python 2.
def simp_appos_sent(tokens, node_list): """ strs = "" # the original tokens in the sent tokens = StanfordTokenizer().tokenize(sent) tokens.insert(0, '') result = list(eng_parser.raw_parse(sent))[0] root = result.root['word'] #w = result.tree() #print "parse_tree:", w #TODO: use the tree structure, check again node_list = [] # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]]) for node in result.nodes.items(): node_list.append(base.get_triples(node)) #node_list[base.get_triples[0]] = base.get_triples(node) """ start_time = time.time() root = "" root_ind = node_list[0][4]['root'][0] for nd in node_list: if root_ind == nd[0]: root=nd[1] """ taggers = [] for nd in node_list[1:]: taggers.append((nd[1], nd[2])) """ strs = "" #split_ind = 0 for nd in node_list[1:]: #import pdb; pdb.set_trace() #print(nd) if (root in nd) and ('nsubj' in nd[4].keys() or ('nsubjpass' in nd[4].keys())): pass if (root in nd) and ('nsubj' in nd[4].keys() or ('nsubjpass' in nd[4].keys())): #print "conj: ", nd #print "conj node: ", nd[4]['conj'] nsubj = "" nsubj_ind = 0 nsubj_nmod_ind = 0 nsubj_dict = {} #import pdb; pdb.set_trace() if ('nsubj' in nd[4].keys()): nsubj_ind = nd[4]['nsubj'][0] nsubj_compound_list = [] #nsubj_nmod_ind = 0 for _nd in node_list[1:]: #BUG if nsubj_ind == _nd[0]: nsubj_dict = _nd[4] if ('compound' in nsubj_dict.keys()): nsubj_compound_list = nsubj_dict['compound'] #break if ('nmod' in nsubj_dict.keys()): nsubj_nmod_ind=nsubj_dict['nmod'][0] #import pdb; pdb.set_trace() for i in nsubj_compound_list: nsubj = nsubj + " " + tokens[i] cop_ind = 0 for _nd in node_list[1:]: if (root in _nd) and ('cop' in _nd[4].keys()): cop_ind = _nd[4]['cop'][0] # get the nsubj #import pdb; pdb.set_trace() auxpass_ind = 0 if ('nsubjpass' in nd[4].keys()): nsubj_ind = nd[4]['nsubjpass'][0] for _nd in node_list: if root_ind == _nd[0] and ('auxpass' in _nd[4].keys()): auxpass_ind = nd[4]['auxpass'][0] if nsubj_nmod_ind != 0: #BUG here nsubj = " 
".join(tokens[nsubj_ind:nsubj_nmod_ind+1]) else: nsubj = nsubj + " " + tokens[nsubj_ind] nsubj = nsubj.strip() nsubj = nsubj[0].upper() + nsubj[1:] + " " """ person_taggers = [] org_taggers = [] # replace the nsubj with "he/she" for token, title in taggers: if token in nsubj: if title == 'PERSON': person_taggers.append(token) elif title == 'ORGANIZATION': org_taggers.append(token) else: org_taggers.append(token) """ #import pdb; pdb.set_trace() if len(nsubj_dict)>0 and ('appos' in nsubj_dict.keys()): #[NOTICE]: connect the nsubj + acl as 1st #import pdb; pdb.set_trace() appos_ind = nsubj_dict['appos'][0] #verb = "is" verb = conjugate("was", tenses(root)[0][0], 3) #verb = base.update_vb_conjugation(verb, root) #nsubj = base.upper_first_char(tokens[nsubj_ind]) #[NOTICE]: remove the ',' after the nsubj if tokens[nsubj_ind + 1] in PUNCTUATION: tokens[nsubj_ind + 1] = '' #tokens.insert(nsubj_ind + 1, verb) root_ind = tokens.index(root) # SO bad solution, if the root isnot a 'verb' split_ind = 0 if ',' in tokens: split_ind = tokens.index(',') for nd in node_list[1:]: #BUG if nsubj_ind == nd[0]: nsubj_dict = _nd[4] if ('compound' in nsubj_dict.keys()): nsubj_compound_list = nsubj_dict['compound'] #break if ('nmod' in nsubj_dict.keys()): nsubj_nmod_ind=nsubj_dict['nmod'][0] #import pdb; pdb.set_trace() if tokens[root_ind] > split_ind: if nsubj_nmod_ind != 0 and cop_ind !=0: _str1 = tokens[split_ind:cop_ind] else: _str1 = tokens[nsubj_ind+1:split_ind] tokens[split_ind] = '' if len(_str1) > 0 and (_str1[-1] in PUNCTUATION): _str1[-1] = '' if len(_str1) >0 and (_str1[0] in PUNCTUATION): _str1[0] = '' str1 = nsubj + " " + verb + ' '.join(_str1) if nsubj_nmod_ind != 0 and cop_ind !=0: _strs = tokens[cop_ind:] else: _strs = tokens[split_ind:] _str2 = " ".join(_strs) """ if len(person_taggers) > 0: str2 = "He" + " " + ' '.join(_str2) # 'he' will be replaced with 'he/she' elif len(org_taggers) > 0: if base.isplural(org_taggers[-1]): str2 = "They" + " " + ' '.join(_str2) else: str2 
= "It" + " " + ' '.join(_str2) else: str2 = nsubj + ' '.join(_str2) """ nsubj = nsubj.strip() _nsubj = nsubj[0].upper() + nsubj[1:] if _nsubj == 'I' or _nsubj == 'He' or _nsubj == 'She': str2 = _nsubj + _str2 else: #sent2 = _nsubj + " " + _str2 #nsubj2 = base.replace_nsubj(sent2, nsubj) #str2 = nsubj2 + _str2 str2 = _nsubj + " " + _str2 else: #import pdb; pdb.set_trace() _str1 = tokens[nsubj_ind+1:root_ind] if len(_str1) > 0 and _str1[-1] in PUNCTUATION: _str1[-1] = '' str1 = nsubj + ' '.join(_str1) #print "1st sent: ", str1 # upper the 1st char in 2nd sent _strs = tokens[root_ind:] _str2 = " ".join(_strs) """ if len(person_taggers) > 0: str2 = "He" + " " + ' '.join(_str2) # 'he' will be replaced with 'he/she' elif len(org_taggers) > 0: if base.isplural(org_taggers.split()[-1]): str2 = "They" + " " + ' '.join(_str2) else: str2 = "It" + " " + ' '.join(_str2) else: str2 = nsubj + ' '.join(_str2) """ #w = _w + ' ' #str2 = nsubj + ' '.join(_str2) #print "2nd sent: ", str2 nsubj = nsubj.strip() _nsubj = nsubj[0].upper() + nsubj[1:] if _nsubj == 'I' or _nsubj == 'He' or _nsubj == 'She': str2 = _nsubj + _str2 else: #sent2 = _nsubj + " " + _str2 #nsubj2 = base.replace_nsubj(sent2, nsubj) #str2 = nsubj2 + _str2 str2 = _nsubj + " " + _str2 strs = str1 + ' . 
' + str2 #import pdb; pdb.set_trace() end_time = time.time() during_time = end_time - start_time print "The time of appos function: ", during_time return strs if auxpass_ind > 0: split_ind = 0 if ',' in tokens: split_ind = tokens.index(',') if split_ind == 0: return strs #import pdb; pdb.set_trace() #verb = conjugate("be", tenses(root)[0][0], 3) verb = tokens[auxpass_ind] if tokens[root_ind] > split_ind: _str1 = tokens[nsubj_ind+1:auxpass_ind] if len(_str1) > 0 and _str1[-1] in PUNCTUATION: _str1[-1] = '' if len(_str1) > 0 and _str1[0] in PUNCTUATION: _str1[0] = '' str1 = nsubj + " " + verb + ' '.join(_str1) #print "1st sent: ", str1 # upper the 1st char in 2nd sent _strs = tokens[auxpass_ind:] _str2 = " ".join(_strs) nsubj = nsubj.strip() _nsubj = nsubj[0].upper() + nsubj[1:] if _nsubj == 'I' or _nsubj == 'He' or _nsubj == 'She': str2 = _nsubj + _str2 else: #sent2 = _nsubj + " " + _str2 #nsubj2 = base.replace_nsubj(sent2, nsubj) #str2 = nsubj2 + _str2 str2 = _nsubj + " " + _str2 strs = str1 + ' . ' + str2 return strs #import pdb; pdb.set_trace() end_time = time.time() during_time = end_time - start_time print "The time of appos function: ", during_time return strs
def pass_act_detect(doc):
    """Detect whether the first sentence of `doc` is in the passive voice.

    Parses `doc` with the module-level spaCy pipeline `nlp` and inspects the
    dependency tree of the first sentence only (the original implementation
    returned from inside the sentence loop, so later sentences were never
    examined; that behavior is preserved).

    Returns:
        True  if a passive subject is found (passive sentence),
        False if not (active sentence),
        None  if `doc` parses to no sentences (falsy, same as original).

    Note: the previous version also extracted verbs, agents, adverbs, etc.
    whose results were never used by the boolean result; that dead code
    could raise IndexError via ``en.tenses(word.text)[0][0]`` for ROOT
    verbs pattern.en does not recognize, and re-parsed ``'. .'`` on every
    sentence.  Only the passive-subject detection is kept.
    """
    parse = nlp(doc)
    for sent in parse.sents:
        subjpass = ''
        subj = ''
        for word in sent:
            # Direct passive subject of the main verb.
            if word.dep_ == 'nsubjpass' and word.head.dep_ == 'ROOT':
                subjpass = ''.join(
                    w.text_with_ws.lower() if w.tag_ not in ('NNP', 'NNPS')
                    else w.text_with_ws for w in word.subtree).strip()
            # Nominal subject; counts as passive when it hangs off an
            # auxpass whose head is the ROOT.
            if word.dep_ == 'nsubj':
                subj = ''.join(
                    w.text_with_ws.lower() if w.tag_ not in ('NNP', 'NNPS')
                    else w.text_with_ws for w in word.subtree).strip()
                if word.head.dep_ == 'auxpass' and word.head.head.dep_ == 'ROOT':
                    subjpass = subj
            # A passive auxiliary on the ROOT implies the last-seen subject
            # is the passive subject (unless one was already found).
            if word.dep_ == 'auxpass' and word.head.dep_ == 'ROOT':
                if not subjpass:
                    subjpass = subj
        if subjpass == '':
            return False  # active
        return True  # passive
# The lemma() function returns the base form (infinitive) of a verb. print("lexeme: %s" % lexeme("be")) print("lemma: %s" % lemma("was")) print("") # The conjugate() function inflects a verb to another tense. # You can supply: # - tense : INFINITIVE, PRESENT, PAST, # - person: 1, 2, 3 or None, # - number: SINGULAR, PLURAL, # - mood : INDICATIVE, IMPERATIVE, # - aspect: IMPERFECTIVE, PROGRESSIVE. # The tense can also be given as an abbreviated alias, e.g., # inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart. from pattern.en import PRESENT, SINGULAR print(conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False)) print(conjugate("being", tense="1sg", negated=False)) print("") # Prefer the full constants for code that will be reused/shared. # The tenses() function returns a list of all tenses for the given verb form. # Each tense is a tuple of (tense, person, number, mood, aspect). # For example: tenses("are") => [('present', 2, 'plural', 'indicative', 'imperfective'), ...] # You can then check if a tense constant is in the list. # This will also work with aliases, even though they are not explicitly in the list. from pattern.en import PRESENT, PLURAL print(tenses("are")) print((PRESENT, 1, PLURAL) in tenses("are")) print("pl" in tenses("are"))
Sentence, Word, Chunk, PNPChunk, modality, wordnet, ADJECTIVE #indefinite article print referenced('university') print referenced('hour') # pluralization and singularization print pluralize('child') print singularize('wolves') # comparative and superlative print comparative('bad') print superlative('bad') # verb conjugation print lexeme('purr') print lemma('purring') print conjugate('purred', '3sg') # he / she / it print 'p' in tenses('purred') # By alias. print PAST in tenses('purred') print(PAST, 1, PL) in tenses('purred') # rule-based conjugation print 'google' in verbs.infinitives print 'googled' in verbs.inflections print conjugate('googled', tense=PARTICIPLE, parse=False) print conjugate('googled', tense=PARTICIPLE, parse=True) # quantification print number("seventy-five point two") # "seventy-five point two" => 75.2 print numerals(2.245, round=2) # 2.245 => "two point twenty-five" print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken']) print quantify({'carrot': 100, 'parrot': 20}) print quantify('carrot', amount=1000) # spelling print suggest("parot")
def simp_adverb_sent(_tokens, node_list):
    """Split a sentence containing an adverbial clause into two sentences.

    Args:
        _tokens:   token list for the sentence (index 0 is a padding entry,
                   so dependency indices are 1-based) — TODO confirm padding
                   convention against the caller.
        node_list: dependency nodes; each node is a tuple
                   (index, word, tag, rel, {dep_relation: [child_indices]}),
                   with node_list[0] holding the 'root' pointer.

    Returns:
        A string with the two rebuilt sentences joined by ' . ', or ""
        when the sentence cannot be split (no comma, or the adverbial
        clause has a different subject).
    """
    tokens = list(_tokens)
    strs = ""
    # A comma is used as the split point; bail out if there is none.
    if COMMA not in tokens:
        return strs
    # Locate the root verb.
    root = ""
    root_ind = node_list[0][4]['root'][0]
    for nd in node_list:
        if root_ind == nd[0]:
            root=nd[1]
    for nd in node_list[1:]:
        if (root in nd) and ('advcl' in nd[4].keys() or 'xcomp' in nd[4].keys()):
            pass  # NOTE(review): no-op branch kept from the original.
        # Only the root node carrying an advcl/xcomp/advmod child is handled.
        if (root in nd) and ('advcl' in nd[4].keys() or 'xcomp' in nd[4].keys() or 'advmod' in nd[4].keys()):
            nsubj = ""
            nsubj_ind = 0
            det_ind = 0
            # --- Rebuild the subject phrase: amod + compound (+ det) + head noun.
            if ('nsubj' in nd[4].keys()):
                nsubj_ind = nd[4]['nsubj'][0]
                nsubj_dict = {}
                nsubj_compound_list = []
                amod_list = []
                det_ind = 0
                for _nd in node_list:
                    if (nsubj_ind == _nd[0]):
                        nsubj_dict = _nd[4]
                        if ('amod' in nsubj_dict.keys()):
                            amod_list = nsubj_dict['amod']
                        if ('compound' in nsubj_dict.keys()):
                            nsubj_compound_list = nsubj_dict['compound']
                        if ('det' in nsubj_dict.keys()):
                            det_ind = nsubj_dict['det'][0]
                for j in amod_list:
                    nsubj = nsubj + " " + tokens[j]
                for i in nsubj_compound_list:
                    nsubj = nsubj + " " + tokens[i]
                if det_ind > 0:
                    nsubj = tokens[det_ind] + " " + nsubj + " " + tokens[nsubj_ind]
                else:
                    nsubj = nsubj + " " + tokens[nsubj_ind]
                nsubj = nsubj.strip()
                nsubj = nsubj[0].upper() + nsubj[1:] + " "
            if ('cop' in nd[4].keys()):
                cop_ind = nd[4]['cop'][0]  # NOTE(review): computed but unused.
            # Passive subject: keep only determiner + noun.
            if ('nsubjpass' in nd[4].keys()):
                nsubj_ind = nd[4]['nsubjpass'][0]
                for _nd in node_list:
                    if (nsubj_ind == _nd[0]):
                        if ('det' in _nd[4].keys()):
                            det_ind = _nd[4]['det'][0]
                        nsubj = tokens[det_ind] + " " + tokens[nsubj_ind]
                        nsubj = nsubj.strip()
            """
            person_taggers = []
            org_taggers = []
            # replace the nsubj with "he/she"
            for token, title in eng_tagger.tag(tokens):
                if token.lower() in nsubj.lower().split():
                    if token == 'the' or token == 'The':
                        continue
                    if title == 'PERSON':
                        person_taggers.append(token)
                    elif title == 'ORGANIZATION':
                        org_taggers.append(token)
                    else:
                        org_taggers.append(token)
            """
            # --- Case 1: adverbial clause (advcl) hanging off the root.
            advcl_dict = {}
            advcl_tag = ""
            if ('advcl' in nd[4].keys()):
                advcl_ind = nd[4]['advcl'][0]
                # Re-conjugate the clause verb to match the root's tense;
                # tenses(root)[1] is used when [0] is the bare infinitive.
                if len(tenses(root))>0:
                    if tenses(root)[0][0] == 'infinitive':
                        tokens[advcl_ind] = conjugate(tokens[advcl_ind], tenses(root)[1][0], 3)
                    else:
                        tokens[advcl_ind] = conjugate(tokens[advcl_ind], tenses(root)[0][0], 3)
                #TODO: update the tense of the advcl_ind
                for _nd in node_list[1:]:
                    #BUG
                    if advcl_ind == _nd[0]:
                        advcl_dict = _nd[4]
                        advcl_tag = _nd[2]
                        break
                # If the advcl has its own (different) subject it is not a
                # shared-subject adverbial — give up.
                advcl_dobj_ind = 0
                for _nd in node_list[1:]:
                    if advcl_ind == _nd[0]:
                        if 'nsubj' in _nd[4].keys():
                            advcl_nsubj_ind = _nd[4]['nsubj'][0]
                            if tokens[advcl_nsubj_ind].lower() not in nsubj:
                                return strs
                        if 'dobj' in _nd[4].keys():
                            advcl_dobj_ind = _nd[4]['dobj'][0]
                # Copula for the first sentence, agreeing with the root tense
                # and the subject's number ('they' -> 2nd conj slot = plural).
                verb = 'was'
                if len(tenses(root)) > 0:
                    if nsubj.strip().lower() == 'they':
                        if tenses(root)[0][0] == 'infinitive':
                            verb = conjugate(verb, tenses(root)[1][0], 2)
                        else:
                            verb = conjugate(verb, tenses(root)[0][0], 2)
                    else:
                        if tenses(root)[0][0] == 'infinitive':
                            verb = conjugate(verb, tenses(root)[1][0], 3)
                        else:
                            verb = conjugate(verb, tenses(root)[0][0], 3)
                # TODO, the tense
                if advcl_tag == 'VBN':
                    if len(nsubj)>0:
                        nsubj = nsubj[0].upper() + nsubj[1:] + " "
                if advcl_tag == 'VBG':
                    if len(nsubj)>0:
                        nsubj = nsubj[0].upper() + nsubj[1:] + " "
                # ASSUMES ',' is the split point — not always right (see TODO above).
                split_ind = tokens.index(COMMA)
                # First half: up to the comma, or through the clause's dobj.
                if advcl_dobj_ind > split_ind:
                    tokens[split_ind] = ""
                    _str1 = tokens[split_ind:advcl_dobj_ind+1]
                else:
                    _str1 = tokens[:(split_ind)]
                if _str1[-1] in PUNCTUATION:
                    _str1[-1] = ''
                """
                str1 = ""
                if advcl_tag == 'VBN':
                    str1 = nsubj + ' '.join(_str1)
                if advcl_tag == 'VBG':
                    str1 = ' '.join(_str1)
                """
                _str1_ = ' '.join(_str1)
                nsubj = ' '.join(nsubj.split())
                str1 = ""
                # NOTE(review): `word + ' '` can never be an element of
                # .split() output (split strips spaces), so this test is
                # always False and the else branch always runs — verify intent.
                if nsubj.lower() + ' ' in _str1_.lower().split():
                    str1 = _str1_
                else:
                    if advcl_tag == 'VBN':
                        str1 = nsubj + " " + verb + " " + _str1_
                    else:
                        str1 = nsubj + " " + _str1_
                # Second half: from the root (or after the comma), dropping a
                # leading relative pronoun, prefixed with the subject.
                _str2 = ""
                str2 = ""
                if split_ind < nsubj_ind:
                    _strs = tokens[root_ind:]
                    if ('which' == _strs[0].lower()) or ('who' == _strs[0].lower()):
                        _strs = tokens[split_ind+2:]
                    _str2 = " ".join(_strs)
                    """
                    if len(nsubj)>0:
                        if (('it' not in nsubj.lower()) or ('They' not in nsubj.lower())):
                            str2 = nsubj + " " + ' '.join(_str2)
                    else:
                        if len(person_taggers) > 0:
                            str2 = "He" + " " + ' '.join(_str2)
                        elif len(org_taggers) > 0:
                            if base.isplural(org_taggers[-1]) or (org_taggers[-1].lower() == 'they'):
                                str2 = "They" + " " + ' '.join(_str2)
                            else:
                                str2 = "It" + " " + ' '.join(_str2)
                        else:
                            str2 = ' '.join(_str2)
                    """
                    nsubj = nsubj.strip()
                    _nsubj = nsubj[0].upper() + nsubj[1:]
                    if _nsubj == 'I' or _nsubj == 'She' or _nsubj == 'He':
                        str2 = _nsubj + " " + _str2
                    else:
                        str2 = _nsubj + " " + _str2
                else:
                    if advcl_dobj_ind > split_ind:
                        _strs = tokens[advcl_dobj_ind+1:]
                        if _strs[0] in PUNCTUATION:
                            _strs[0] = ''
                    else:
                        _strs = tokens[split_ind+1:]
                        # NOTE(review): `_str2` is still "" here; this probably
                        # meant to test `_strs[0]` — always False as written.
                        if len(_str2)>0 and (('which' == _str2[0].lower()) or ('who' == _str2[0].lower())):
                            _strs = tokens[split_ind+2:]
                    _str2 = " ".join(_strs)
                    nsubj = nsubj.strip()
                    _nsubj = nsubj[0].upper() + nsubj[1:]
                    if _nsubj == 'I' or _nsubj == 'She' or _nsubj == 'He':
                        str2 = _nsubj + " " + _str2
                    else:
                        str2 = _nsubj + " " + _str2
                if str1:
                    strs = str1 + ' . ' + str2
                else:
                    strs = str2
                return strs
            # --- Case 2: open clausal complement (xcomp) hanging off the root.
            xcomp_ind = 0
            if ('xcomp' in nd[4].keys()):
                xcomp_ind = nd[4]['xcomp'][0]
                if len(tenses(root))>0:
                    tokens[xcomp_ind] = conjugate(tokens[xcomp_ind], tenses(root)[0][0], 3)
                for _nd in node_list:
                    #BUG
                    if xcomp_ind == _nd[0]:
                        xcomp_dict = _nd[4]
                        xcomp_tag = _nd[2]
                        break
                verb = 'was'
                if len(tenses(root)) > 0:
                    if nsubj.strip().lower() == 'they':
                        verb = conjugate(verb, tenses(root)[0][0], 2)
                    else:
                        verb = conjugate(verb, tenses(root)[0][0], 3)
                # TODO
                if xcomp_tag == 'VBN':
                    nsubj = nsubj[0].upper() + nsubj[1:] + " "
                if xcomp_tag == 'VBG':
                    nsubj = nsubj[0].upper() + nsubj[1:] + " "
                split_ind = tokens.index(COMMA)
                _str1 = tokens[:(split_ind)]
                if _str1[-1] in PUNCTUATION:
                    _str1[-1] = ''
                str1 = ""
                nsubj = ' '.join(nsubj.split())
                _str1_ = ' '.join(_str1)
                # Skip the subject prefix when it already appears in the clause.
                if nsubj.lower() + ' ' in _str1_.lower():
                    str1 = _str1_
                else:
                    # NOTE(review): tests advcl_tag (from Case 1, "" here),
                    # not xcomp_tag — possibly a copy/paste slip; confirm.
                    if advcl_tag == 'VBN':
                        str1 = nsubj + " " + verb + " " + _str1_
                    else:
                        str1 = nsubj + " " + _str1_
                """
                #elif xcomp_tag == 'VBG':
                if nsubj.lower() in _str1_.lower():
                    str1 = _str1_
                else:
                    str1 = nsubj + _str1_
                """
                _str2 = ""
                str2 = ""
                if nsubj_ind < split_ind:
                    _strs = tokens[split_ind+1:]
                    if ('which' == _strs[0].lower()) or ('who' == _strs[0].lower()):
                        _strs = tokens[split_ind+2:]
                    _str2 = " ".join(_strs)
                    #TODO: update the tense
                    """
                    if len(nsubj)>0:
                        if (('it' not in nsubj.lower()) or ('they' not in nsubj.lower())):
                            str2 = nsubj + " " + ' '.join(_str2)
                    else:
                        if len(person_taggers) > 0:
                            str2 = "He" + " " + ' '.join(_str2)
                        elif len(org_taggers) > 0:
                            if base.isplural(org_taggers[-1]) or (org_taggers[-1].lower() == 'they'):
                                str2 = "They" + " " + ' '.join(_str2)
                            else:
                                str2 = "It" + " " + ' '.join(_str2)
                        else:
                            str2 = ' '.join(_str2)
                    """
                    nsubj = nsubj.strip()
                    _nsubj = nsubj[0].upper() + nsubj[1:]
                    if _nsubj == 'I' or _nsubj == 'He' or _nsubj == 'She':
                        str2 = _nsubj + " " + _str2
                    else:
                        str2 = _nsubj + " " + _str2
                else:
                    str2 = base.upper_first_char(nsubj) + " " + ' '.join(tokens[split_ind+2:])
                if str1:
                    if str2:
                        strs = str1 + ' . ' + str2
                    else:
                        strs = str1 + ' . '
                else:
                    strs = str2 + ' . '
                return strs
            # --- Case 3: adverbial modifier (advmod) hanging off the root.
            advmod_ind = 0
            if ('advmod' in nd[4].keys()):
                advmod_ind = nd[4]['advmod'][0]
                for _nd in node_list:
                    #BUG
                    if advmod_ind == _nd[0]:
                        advmod_dict = _nd[4]
                        advmod_tag = _nd[2]
                        break
                verb = 'was'
                if len(tenses(root)) > 0:
                    if nsubj.strip().lower() == 'they':
                        verb = conjugate(verb, tenses(root)[0][0], 2)
                    else:
                        verb = conjugate(verb, tenses(root)[0][0], 3)
                # TODO
                if nsubj:
                    nsubj = nsubj.strip()
                    nsubj = nsubj[0].upper() + nsubj[1:]
                split_ind = tokens.index(COMMA)
                _str1 = tokens[:(split_ind)]
                if _str1[-1] in PUNCTUATION:
                    _str1[-1] = ''
                str1 = ""
                nsubj = ' '.join(nsubj.split())
                _str1_ = ' '.join(_str1)
                if nsubj.lower() + ' ' in _str1_.lower():
                    str1 = _str1_
                else:
                    str1 = nsubj + " " + verb + " " + _str1_.lower()
                """
                if nsubj.lower() in _str1_.lower():
                    str1 = _str1_
                else:
                    str1 = nsubj + _str1_
                """
                _str2 = ""
                str2 = ""
                if nsubj_ind < split_ind:
                    _strs = tokens[split_ind+1:]
                    if ('which' == _strs[0].lower()) or ('who' == _strs[0].lower()):
                        _strs = tokens[split_ind+2:]
                    _str2 = " ".join(_strs)
                    nsubj = nsubj.strip()
                    if nsubj:
                        _nsubj = nsubj[0].upper() + nsubj[1:]
                        if _nsubj == 'I' or _nsubj == 'He' or _nsubj == 'She':
                            str2 = _nsubj + " " + _str2
                        else:
                            str2 = _nsubj + " " + _str2
                else:
                    str2 = base.upper_first_char(nsubj) + " " + ' '.join(tokens[split_ind+2:])
                if str1:
                    if str2:
                        strs = str1 + ' . ' + str2
                    else:
                        strs = str1 + ' . '
                else:
                    strs = str2 + ' . '
                return strs
    return strs
def pass2act(doc, rec=False):
    """Convert passive-voice sentences in `doc` to active voice.

    Each sentence is dependency-parsed with the module-level spaCy pipeline
    `nlp`; sentences that are not passive, or whose agent ("by ...") is
    missing, are copied through unchanged.  For passive sentences the agent
    becomes the subject, auxiliaries are re-derived, and the main verb is
    re-conjugated with pattern.en.

    Args:
        doc: input text (one or more sentences).
        rec: True when called recursively on an embedded clause
             (xcomp/ccomp/conj); suppresses punctuation capture and
             re-capitalization.

    Returns:
        The rewritten document as a single string.

    Fix: the `have`-auxiliary branch previously used `num == en.PLURAL ...`,
    a no-op comparison; it now assigns `num` as intended, so "have" after a
    modal conjugates as plural.
    """
    parse = nlp(doc)
    newdoc = ''
    for sent in parse.sents:
        # Parts of the sentence to capture:
        subjpass = ''
        subj = ''
        verb = ''
        verbaspect = ''
        verbtense = ''
        adverb = {'bef': '', 'aft': ''}
        part = ''
        prep = ''
        agent = ''
        aplural = False
        advcltree = None
        aux = list(list(nlp('. .').sents)[0])  # start with 2 'null' elements
        xcomp = ''
        punc = '.'
        # Analyse the dependency tree:
        for word in sent:
            if word.dep_ == 'advcl':
                if word.head.dep_ in ('ROOT', 'auxpass'):
                    advcltree = word.subtree
            if word.dep_ == 'nsubjpass':
                if word.head.dep_ == 'ROOT':
                    subjpass = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP', 'NNPS') else w.text_with_ws for w in word.subtree).strip()
            if word.dep_ == 'nsubj':
                subj = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP', 'NNPS') else w.text_with_ws for w in word.subtree).strip()
                if word.head.dep_ == 'auxpass':
                    if word.head.head.dep_ == 'ROOT':
                        subjpass = subj
            if word.dep_ in ('advmod', 'npadvmod', 'oprd'):
                if word.head.dep_ == 'ROOT':
                    # Adverbs before the main verb stay before it; later ones after.
                    if verb == '':
                        adverb['bef'] = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP', 'NNPS') else w.text_with_ws for w in word.subtree).strip()
                    else:
                        adverb['aft'] = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP', 'NNPS') else w.text_with_ws for w in word.subtree).strip()
            if word.dep_ == 'auxpass':
                if word.head.dep_ == 'ROOT':
                    if not subjpass:
                        subjpass = subj
            if word.dep_ in ('aux', 'auxpass', 'neg'):
                if word.head.dep_ == 'ROOT':
                    aux += [word]
            if word.dep_ == 'ROOT':
                verb = word.text
                # Map the POS tag to a pattern.en tense/aspect.
                if word.tag_ == 'VB':
                    verbtense = en.INFINITIVE
                elif word.tag_ == 'VBD':
                    verbtense = en.PAST
                elif word.tag_ == 'VBG':
                    verbtense = en.PRESENT
                    verbaspect = en.PROGRESSIVE
                elif word.tag_ == 'VBN':
                    verbtense = en.PAST
                else:
                    verbtense = en.tenses(word.text)[0][0]
            if word.dep_ == 'prt':
                if word.head.dep_ == 'ROOT':
                    part = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP', 'NNPS') else w.text_with_ws for w in word.subtree).strip()
            if word.dep_ == 'prep':
                if word.head.dep_ == 'ROOT':
                    prep = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP', 'NNPS') else w.text_with_ws for w in word.subtree).strip()
            if word.dep_.endswith('obj'):
                if word.head.dep_ == 'agent':
                    if word.head.head.dep_ == 'ROOT':
                        agent = ''.join(w.text + ', ' if w.dep_ == 'appos' else (w.text_with_ws.lower() if w.tag_ not in ('NNP', 'NNPS') else w.text_with_ws) for w in word.subtree).strip()
                        aplural = word.tag_ in ('NNS', 'NNPS')
            if word.dep_ in ('xcomp', 'ccomp', 'conj'):
                if word.head.dep_ == 'ROOT':
                    xcomp = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP', 'NNPS') else w.text_with_ws for w in word.subtree).strip()
                    # Recursively convert the embedded clause, preserving a
                    # leading "that" if the recursion dropped it.
                    that = xcomp.startswith('that')
                    xcomp = pass2act(xcomp, True).strip(' .')
                    if not xcomp.startswith('that') and that:
                        xcomp = 'that ' + xcomp
            if word.dep_ == 'punct' and not rec:
                if word.text != '"':
                    punc = word.text
        # Not passive: copy the sentence through unchanged.
        if subjpass == '':
            newdoc += str(sent) + ' '
            continue
        # No explicit agent ("by ..."): cannot invert, copy through.
        if agent == '':
            newdoc += str(sent) + ' '
            continue
        # Invert pronouns (e.g. "him" -> "he") for the new roles.
        agent = nouninv(agent)
        subjpass = nouninv(subjpass)
        # Rebuild the auxiliary chain with correct agreement.
        auxstr = ''
        num = en.SINGULAR if not aplural or agent in ('he', 'she') else en.PLURAL
        aux.append(aux[0])
        verbaspect = None
        for (pp, p, a, n) in zip(aux, aux[1:], aux[2:], aux[3:]):
            if a.lemma_ == '.':
                continue
            if a.lemma_ == 'not':
                if p.lemma_ == 'be':
                    if n.lemma_ == 'be':
                        verbtense = en.tenses(a.text)[0][0]
                        auxstr += en.conjugate('be', tense=en.tenses(p.text)[0][0], number=num) + ' '
                        verbaspect = en.PROGRESSIVE
                    else:
                        # Negation without progressive needs do-support.
                        auxstr += en.conjugate('do', tense=en.tenses(p.text)[0][0], number=num) + ' '
                        verbtense = en.INFINITIVE
                auxstr += 'not '
            elif a.lemma_ == 'be':
                if p.lemma_ == 'be':
                    verbtense = en.tenses(a.text)[0][0]
                    auxstr += en.conjugate('be', tense=en.tenses(a.text)[0][0], number=num) + ' '
                    verbaspect = en.PROGRESSIVE
                elif p.tag_ == 'MD':
                    verbtense = en.INFINITIVE
            elif a.lemma_ == 'have':
                # BUG FIX: was `num == en.PLURAL ...` (a discarded comparison);
                # after a modal, "have" takes the plural (base) form.
                num = en.PLURAL if p.tag_ == 'MD' else num
                auxstr += en.conjugate('have', tense=en.tenses(a.text)[0][0], number=num) + ' '
                if n.lemma_ == 'be':
                    verbaspect = en.PROGRESSIVE
                    verbtense = en.tenses(n.text)[0][0]
            else:
                auxstr += a.text_with_ws
        auxstr = auxstr.lower().strip()
        # Conjugate the main verb for its new active-voice slot.
        if verbaspect:
            verb = en.conjugate(verb, tense=verbtense, aspect=verbaspect)
        else:
            verb = en.conjugate(verb, tense=verbtense)
        # Rebuild the adverbial clause, matching the main verb's tense.
        advcl = ''
        if advcltree:
            for w in advcltree:
                if w.pos_ == 'VERB' and en.tenses(w.text)[0][4] == en.PROGRESSIVE:
                    advcl += 'which ' + en.conjugate(w.text, tense=en.tenses(verb)[0][0]) + ' '
                else:
                    advcl += w.text_with_ws
        newsent = ' '.join(list(filter(None, [agent, auxstr, adverb['bef'], verb, part, subjpass, adverb['aft'], advcl, prep, xcomp]))) + punc
        if not rec:
            newsent = newsent[0].upper() + newsent[1:]
        newdoc += newsent + ' '
    return newdoc
# The lexeme() function returns a list of all possible verb inflections. # The lemma() function returns the base form (infinitive) of a verb. print "lexeme:", lexeme("be") print "lemma:", lemma("was") print # The conjugate() function inflects a verb to another tense. # You can supply: # - tense : INFINITIVE, PRESENT, PAST, # - person: 1, 2, 3 or None, # - number: SINGULAR, PLURAL, # - mood : INDICATIVE, IMPERATIVE, # - aspect: IMPERFECTIVE, PROGRESSIVE. # The tense can also be given as an abbreviated alias, e.g., # inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart. from pattern.en import PRESENT, SINGULAR print conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False) print conjugate("being", tense="1sg", negated=False) print # Prefer the full constants for code that will be reused/shared. # The tenses() function returns a list of all tenses for the given verb form. # Each tense is a tuple of (tense, person, number, mood, aspect). # For example: tenses("are") => [('present', 2, 'plural', 'indicative', 'imperfective'), ...] # You can then check if a tense constant is in the list. # This will also work with aliases, even though they are not explicitly in the list. from pattern.en import PRESENT, PLURAL print tenses("are") print (PRESENT, 1, PLURAL) in tenses("are") print "pl" in tenses("are")
# The conjugate() function inflects a verb to another tense. # You can supply: # - tense : INFINITIVE, PRESENT, PAST, # - person: 1, 2, 3 or None, # - number: SINGULAR, PLURAL, # - mood : INDICATIVE, IMPERATIVE, # - aspect: IMPERFECTIVE, PROGRESSIVE. # The tense can also be given as an abbreviated alias, e.g., # inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart. from pattern.en import PRESENT, SINGULAR print conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False) print conjugate("being", tense="1sg", negated=False) print # Prefer the full constants for code that will be reused/shared. # The tenses() function returns a list of all tenses for the given verb form. # Each tense is a tuple of (tense, person, number, mood, aspect). # For example: tenses("are") => [('present', 2, 'plural', 'indicative', 'imperfective'), ...] # You can then check if a tense constant is in the list. # This will also work with aliases, even though they are not explicitly in the list. from pattern.en import PRESENT, PLURAL print tenses("are") print(PRESENT, 1, PLURAL) in tenses("are") print "pl" in tenses("are")
def simp_passive_sent(tokens, node_list):
    """Rewrite a passive sentence in active voice using its dependency parse.

    Args:
        tokens:    token list for the sentence (dependency indices are
                   1-based; index 0 is padding — TODO confirm against caller).
        node_list: dependency nodes; each node is a tuple
                   (index, word, tag, rel, {dep_relation: [child_indices]}),
                   with node_list[0] holding the 'root' pointer.

    Returns:
        The active-voice sentence as a string ending in " .", or "" when the
        sentence is not passive or has no "by"-marked agent.
    """
    # Pronoun inversions for the role swap (object form <-> subject form).
    dict1 = {
        'me': 'I',
        'him': 'He',
        'her': 'She',
        'them': 'They',
        'i': 'me',
        'he': 'him',
        'she': 'her',
        'they': 'them'
    }
    strs = ""
    """
    # the original tokens in the sent
    print(sent)
    tokens = StanfordTokenizer().tokenize(str(sent))
    tokens.insert(0, '')
    result = list(eng_parser.raw_parse(sent))[0]
    root = result.root['word']
    #w = result.tree()
    #print "parse_tree:", w
    #TODO: use the tree structure, check again
    node_list = []
    # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    for node in result.nodes.items():
        node_list.append(base.get_triples(node))
        #node_list[base.get_triples[0]] = base.get_triples(node)
    """
    # Locate the root verb.
    root = ""
    root_ind = node_list[0][4]['root'][0]
    for nd in node_list:
        if root_ind == nd[0]:
            root=nd[1]
    for nd in node_list[1:]:
        # A passive nominal subject.
        if (root in nd) and ('nsubjpass' in nd[4].keys()):
            pass  # NOTE(review): no-op branch kept from the original.
        if (root in nd) and ('nsubjpass' in nd[4].keys()):
            # --- Rebuild the passive subject phrase (becomes the new object).
            nsubjpass_ind = nd[4]['nsubjpass'][0]
            det_ind = 0
            for _nd in node_list:
                if nsubjpass_ind == _nd[0]:
                    if ('det' in _nd[4].keys()):
                        det_ind = _nd[4]['det'][0]
            nsubjpass = tokens[nsubjpass_ind]
            """
            amod_list = ""
            if len(amod_ind_list) > 0:
                for i in amod_ind_list:
                    amod_list = amod_list + " " + tokens[i]
                nsubjpass = amod_list + " " + nsubjpass
            """
            # Prepend adjectival modifiers gathered by the helper.
            amod_list = base.get_dependency_list(tokens, node_list, nsubjpass_ind)
            nsubjpass = amod_list + " " + nsubjpass
            # Determiner, or pronoun inversion when it is a bare pronoun.
            if det_ind:
                nsubjpass = tokens[det_ind] + " " + nsubjpass
            elif str(nsubjpass.lower().strip()) in dict1:
                nsubjpass = dict1[str(nsubjpass.lower().strip())]
            else:
                pass
            auxpass_ind = 0
            if ('auxpass' in nd[4].keys()):
                auxpass_ind = nd[4]['auxpass'][0]
            # --- Find the agent ("by X") to use as the new subject.
            subj = ""
            if ('nmod' in nd[4].keys()):
                nmod_ind_list = nd[4]['nmod']
                case_ind = 0
                case_ind_2 = 0
                for nmod_ind in nmod_ind_list:
                    _case_ind = 0
                    # NOTE(review): this inner loop reuses the outer loop
                    # variable `nd`, clobbering it — verify this is intended.
                    for nd in node_list[1:]:
                        if nmod_ind == nd[0]:
                            if ('case' in nd[4].keys()):
                                _case_ind = nd[4]['case'][0]
                            break
                    # Check whether the agent is explicitly stated using "by".
                    if _case_ind > 0:
                        if tokens[_case_ind] == 'by':
                            case_ind = _case_ind
                            break
                        else:
                            case_ind_2 = _case_ind
                # No "by"-agent: this passive cannot be inverted.
                if case_ind == 0:
                    return strs
                nmod_dict = {}
                for _nd in node_list[1:]:
                    #BUG
                    if nmod_ind == _nd[0]:
                        nmod_dict = _nd[4]
                        break
                # Build the new subject: compounds (+ det) + head noun,
                # with pronoun inversion for bare pronouns.
                det_ind = 0
                nsubj_compound_list = []
                if ('det' in nmod_dict):
                    det_ind = nmod_dict['det'][0]
                if ('compound' in nmod_dict):
                    nsubj_compound_list = nmod_dict['compound']
                for i in nsubj_compound_list:
                    subj = subj + " " + tokens[i]
                if det_ind:
                    subj = base.upper_first_char(tokens[det_ind]) + " " + subj + tokens[nmod_ind]
                elif tokens[nmod_ind] in dict1:
                    subj = dict1[tokens[nmod_ind]]
                else:
                    subj = subj + " " + tokens[nmod_ind]
                # Conjugate the root verb: tense from the passive auxiliary
                # when present; 'they' takes the plural (2nd) conjugation slot.
                verb = root
                if len(tenses(root)) > 0:
                    if auxpass_ind != 0:
                        if subj.strip().lower() == 'they':
                            verb = conjugate(root, tenses(tokens[auxpass_ind])[0][0], 2)
                        else:
                            verb = conjugate(root, tenses(tokens[auxpass_ind])[0][0], 3)
                    else:
                        if subj.strip().lower() == 'they':
                            verb = conjugate(root, tenses(root)[0][0], 2)
                        else:
                            verb = conjugate(root, tenses(root)[0][0], 3)
                # Assemble: subject + verb + (lowercased) old subject
                # (+ any non-"by" case phrase).
                if case_ind_2 > 0:
                    _case_str = " ".join(tokens[case_ind_2:case_ind])
                    strs = subj + " " + verb + " " + nsubjpass.lower() + " " + _case_str + " ."
                else:
                    strs = subj + " " + verb + " " + nsubjpass.lower() + " ."
                return strs
    """
    #[NOTICE]: remove the ',' after the nsubj
    if tokens[nsubj_ind + 1] in PUNCTUATION:
        tokens[nsubj_ind + 1] = ''
    tokens.insert(nsubj_ind + 1, verb)
    #root_ind = tokens.index(root)
    #_str1 = tokens[nsubj_ind:root_ind]
    if _str1[-1] in PUNCTUATION:
        _str1[-1] = ''
    str1 = ' '.join(_str1)
    #print "1st sent: ", str1
    # upper the 1st char in 2nd sent
    _str2 = tokens[root_ind:]
    #w = _w + ' '
    str2 = upper_first_char(subj) + " " + ' '.join(_str2)
    #print "2nd sent: ", str2
    """
    #strs = str1 + ' . ' + str2
    #return strs
    return strs