def PVD(document):
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    sentence = sent_tokenize(document)
    word = 'se'
    lemma = 'definir'
    for i in range(len(sentence)):
        # print('--------------------')
        pattern = list()
        postaglist = list()
        tokens = nltk.word_tokenize(sentence[i])
        tag = pos_tagger.tag(tokens)
        for t in tag:
            if ('se' in tokens):
                pos = tokens.index('se')
                front = tokens[pos + 1:pos + 2]
                tag = pos_tagger.tag(front)
                doc = nlp(t[0])
                lemlist = [tok.lemma_ for tok in doc]
                # lem = ''.join(lemlist)
                # lemmas_list.append(lem)
                # print(lemma, '-', lemlist)
                if ('definir' in lemlist or 'entender' in lemlist
                        or 'denominar' in lemlist):
                    # print(sentence[i])
                    front = tokens[pos + 2:pos + 5]
                if (t[1] == 'PUNCT'):
                    pos = tokens.index(t[0])
                    print(t[0], pos, tag[pos + 1])
                # if (t[1] == 'AUX'):
def pos_tagger_lemma(document, listterms):
    print('Definition via POS tagger and lemma, searching 3-, 2- and 1-grams')
    text = str()
    definiendums = list()
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    for i in document:
        if (len(i) > 1):
            tag = pos_tagger.tag(i.split(' '))
            for t in tag:
                if (t[1] == 'VERB'):
                    doc = nlp(t[0])
                    for tok in doc:
                        l = tok.lemma_
                        if (l == 'ser'):
                            text = i
                            indverb = i.index(t[0])
                            front = i[indverb:]
                            back = i[:indverb + len(t[0]) + 1]
                            tagfront = pos_tagger.tag(front.split(' '))
                            tagback = pos_tagger.tag(back.split(' '))
                            definiendum_definition(t[0], text, listterms)
                elif (t[1] == 'NOUN' and t[0] != '=RRB='):
                    text = i
                    if (len(t[0]) > 1):
                        # definiendum_definition(t[0], text, listterms)
                        pass
    return (text)
def check_triples_by_pos(triples):
    pos_tagger = CoreNLPParser(url='http://39.98.186.125:9000', tagtype='pos')
    ret_triples = []
    for triple in triples:
        source = triple[0]
        relation = triple[1]
        target = triple[2]
        source_pos = ",".join([e[1] for e in pos_tagger.tag(source.split(" "))])
        relation_pos = ",".join([e[1] for e in pos_tagger.tag(relation.split(" "))])
        target_pos = ",".join([e[1] for e in pos_tagger.tag(target.split(" "))])
        if "VB" in source_pos or "VB" in target_pos:
            continue
        if "NN" not in source_pos or "NN" not in target_pos:
            continue
        if "NN" in relation_pos:
            if " at" in relation.lower():
                relation = "at"
            elif "of" not in relation.split(" ") and len(relation.split(" ")) > 1:
                continue
        ret_triples.append([source, relation, target])
    return ret_triples
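# A minimal usage sketch (not from the original source): it assumes the CoreNLP
# server hard-coded in check_triples_by_pos above is reachable, and the sample
# triples below are made up for illustration.
if __name__ == '__main__':
    sample_triples = [
        ["the cat", "sat on", "the mat"],   # both arguments are noun phrases, so this may be kept
        ["running", "is", "fun"],           # likely dropped: an argument tagged as a verb
    ]
    for source, relation, target in check_triples_by_pos(sample_triples):
        print(source, '|', relation, '|', target)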
def tagging(file):
    qstn = []
    qstn_ner = []
    qstn_pos = []
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    f = open(file)
    for line in f:
        qstn.append(line)
        # Saving POS tagging to list
        qstn_pos.append(pos_tagger.tag(line.split()))
        # Saving NER to list
        qstn_ner.append(ner_tagger.tag(line.split()))
    return qstn, qstn_ner, qstn_pos
def pos_tag_text(text):
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None

    # print("ORIGINAL TEXT ---------------", text)
    # tagged_text = tag(text)
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    tagged_text = pos_tagger.tag(text.split())
    # tagged_text = nltk.pos_tag(text)
    # print("TAGGED TEXT ---------", tagged_text)
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in tagged_text]
    return tagged_lower_text
def pruebas():
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    tag = pos_tagger.tag(
        'tengo que ir Por el contrato de compra y venta uno de los contratantes se obliga a entregar una cosa determinada y el otro a pagar por ella un precio cierto, en dinero o signo que lo represente'
        .split(' '))
    doc = nlp('considerará')
    lemlist = [tok.lemma_ for tok in doc]
    print(lemlist)
def scorer(title, speaker):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    pos_tag_list = list(pos_tagger.tag(title[0].split()))
    s = 0
    for i in pos_tag_list:
        if 'NN' in i[1] or 'NP' in i[1]:
            s += 1
    return -s + abs(title[1] - speaker[1]) * 0.3
def verb_stats(data):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    verb_count = 0
    for _, value in data.items():
        pos = list(pos_tagger.tag(value.split()))
        for _, second in pos:
            if second.startswith("V"):
                verb_count += 1
    print(verb_count)
def get_probable_title(titles):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    score = []
    for title in titles:
        pos_tag_list = list(pos_tagger.tag(title[0].split()))
        s = 0
        for i in pos_tag_list:
            if 'NN' in i[1] or 'NP' in i[1]:
                s += 1
        score.append((s, len(title[0]), title[0], title[1]))
    return max(score)
def get_speaker_salutaion(text, persons, speakers):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    text = text.split()
    pos_tag_list = list(pos_tagger.tag(text))
    for i in range(len(pos_tag_list) - 1):
        # A proper noun that is not itself a known person, immediately followed by a
        # known person, is treated as a salutation and prepended to the speaker entry.
        if pos_tag_list[i][1] == 'NNP' and pos_tag_list[i][0] not in persons:
            if pos_tag_list[i + 1][0] in persons:
                for idx, speak in enumerate(speakers):
                    if pos_tag_list[i + 1][0] in speak:
                        speakers[idx] = pos_tag_list[i][0] + ' ' + speak
                        break
    return speakers
def get_stanford_pos_tags(line):
    """ Get part of speech tags using the Stanford POS tagger """
    st_pos = CoreNLPParser(url="http://localhost:9000", tagtype="pos")
    tokenized_line = cnf.TOKENIZER.tokenize(line)
    line_tagged_initial = st_pos.tag(tokenized_line)
    line_tagged_output = []
    for item in line_tagged_initial:
        line_tagged_output.append((item[0], item[1]))
    return line_tagged_output
def get_entities(tweets):
    entities = []
    for atweet in tweets:
        for sent in atweet:
            pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
            sent = list(pos_tagger.tag(normalize(sent)))
            # sent = pos_tag(normalize(sent))
            trees = ne_chunk(sent)
            for tree in trees:
                if hasattr(tree, 'label'):
                    if tree.label() in labels:
                        entities.append(' '.join(
                            [child[0].lower() for child in tree]))
    return entities
def sfNERTagger(rawText):
    '''(sf = stanford) get the raw text from a file and convert that to a list
    of tuples of each word with a Stanford annotated NER-tag'''
    parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    tupleList = list(parser.tag(rawText.split()))
    # convert list of tuples to list of lists, so we can change tags we don't need
    NERList = [list(tuple) for tuple in tupleList]
    # change tags we don't need
    for item in NERList:
        if item[1] == 'COUNTRY':
            item[1] = 'COU'
        elif item[1] == 'PERSON':
            item[1] = 'PER'
        elif item[1] == 'CITY':
            item[1] = 'CIT'
        elif item[1] == 'ORGANIZATION':
            item[1] = 'ORG'
        else:
            item[1] = ''
    return NERList
def ed_rip(word: str):
    NV = False
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    nlpinfo = nlp(word.lower())
    ripword = nlpinfo.sentences[0].words[0].lemma
    # Recapitalization
    if re.search('^[A-Z]', word) != None:
        ripword = ripword.capitalize()
    # Return information needed to determine NV Passive.
    riptoken = nltk.word_tokenize(ripword)
    riptag = pos_tagger.tag(riptoken)[0][1]
    print(riptoken, riptag)
    if riptag.startswith('V') is True:
        NV = False
    elif riptag.startswith('N') is True:
        NV = True
    return (ripword, NV)
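# Hedged usage sketch (not from the original source). ed_rip above relies on a
# module-level `nlp`; here it is assumed to be a stanza English pipeline, and a
# CoreNLP server is assumed to be running on localhost:9000. stanza.download('en')
# must have been run once beforehand.
import stanza

nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma')
print(ed_rip('Baked'))  # expected shape: (lemma with original capitalization, noun-vs-verb flag)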
def punctuation_funct(document, listterms):
    print('Definition by punctuation [:], term followed by a comma and ending in comma + verb')
    text = str()
    definiendum = str()
    definiendums = list()
    for i in document:
        for j in listterms:
            term = j[:-1]
            if (len(i) > 1):
                if (term + ':' in i):
                    ind = i.index(':')
                    after = i[ind + 1:]
                    if (len(after) > 1 and term not in definiendums):
                        definiendum = term
                        definition = after
                        definiendums.append(definiendum)
                        print(definiendum, '---->', definition)
                elif (term + ',' in i):
                    indterm = i.index(term)
                    if (',' in i[indterm + len(term):indterm + len(term) + 1]):
                        # print('-')
                        front = i[indterm:-1]
                        pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
                        tag = pos_tagger.tag(i.split(' '))
                        for t in tag:
                            if (t[1] == 'VERB'):
                                # print(front)
                                if (t[0] in i):
                                    # print(t[0])
                                    indverb = i.index(t[0])
                                    if (i[indverb - 2] == ','):
                                        definiendum = term
                                        definition = i[indterm + len(term) + 1:indverb]
                                        if (len(definiendum) > 1 and len(definition) > 1
                                                and definiendum not in definiendums):
                                            definiendums.append(definiendum)
                                            print(definiendum, '---->', definition)
def test(sent_pool, t):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    for s in sent_pool:
        result = NV_Passive(pos_tagger.tag(word_tokenize(s)))
        print(result)
        if t == 'f':
            if result[1] == False and result[2] == False:
                print('False test passed.')
            else:
                print('▇▇False test failed.')
        elif t == 'p':
            if result[1] == True and result[2] == False:
                print('Passive test passed.')
            else:
                print('▇▇Passive test failed.')
        elif t == 't':
            if result[1] == True and result[2] == True:
                print('True test passed.')
            else:
                print('▇▇True test failed.')
def get_stanford_named_entities(line):
    """ Get named entities from the Stanford NER tagger """
    entity_item = ""
    entity_list = []
    previous_type = ""
    st_ner = CoreNLPParser(url="http://localhost:9000", tagtype="ner")
    line_tagged_ner = st_ner.tag(line.split())
    line_tagged = []
    # Tag the input using the tagger
    # The tagger returns in the order (entity, type). Change the
    # order to be consistent with the other taggers (type, entity)
    for item_ner in line_tagged_ner:
        if item_ner[1] != "O":
            line_tagged.append((item_ner[1], item_ner[0]))
    # Consolidate multi-word entities
    if len(line_tagged) == 1:
        entity_list = line_tagged
    elif len(line_tagged) > 1:
        for index, item in enumerate(line_tagged):
            if item[0] == previous_type:
                if item[0] != "O":
                    entity_item += " " + item[1]
            else:
                if item[0] != "O":
                    entity_item = item[1]
                else:
                    entity_list.append((previous_type, entity_item))
                    entity_item = ""
            if index == (len(line_tagged) - 1):
                entity_list.append((previous_type, entity_item))
            previous_type = item[0]
    return entity_list
def find_title_mentioned(title_lines):
    titles = []
    for line in title_lines:
        pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        pos_tag_list = list(pos_tagger.tag(line[0].split()))
        m = len(pos_tag_list)

        # Candidate titles introduced by "on"
        preposition_list = []
        for j in range(m):
            if pos_tag_list[j][0] == 'on':
                preposition_list.append(j)
        preposition_list.append(m)
        preposition_combinations = list(combinations(preposition_list, 2))
        for j in preposition_combinations:
            word_list = [w[0] for w in pos_tag_list]
            title = ' '.join(word_list[j[0] + 1:j[1] + 1])
            x = 0
            for k in range(len(title)):
                if title[k].lower().isalpha() == True:
                    x = k
                    break
            title = title[x:]  # strip leading non-alphabetic characters
            titles.append((title, line[1]))

        # Candidate titles introduced by "is"
        preposition_list = []
        for j in range(m):
            if pos_tag_list[j][0] == 'is':
                preposition_list.append(j)
        preposition_list.append(m)
        preposition_combinations = list(combinations(preposition_list, 2))
        for j in preposition_combinations:
            word_list = [w[0] for w in pos_tag_list]
            title = ' '.join(word_list[j[0] + 1:j[1] + 1])
            x = 0
            for k in range(len(title)):
                if title[k].lower().isalpha() == True:
                    x = k
                    break
            title = title[x:]
            titles.append((title, line[1]))
    return titles
def en_ner(self):
    ner_tagger = CoreNLPParser(url=DEFAULT_LOCAL_ADDRESS + ":" + DEFAULT_EN_NER_PORT,
                               tagtype='ner')
    for line in self.textMap.keys():
        taggedText = ner_tagger.tag((line.split()))
        try:
            for text, value in taggedText:
                if value in ['PERSON']:
                    self.textMap[line][self.PER_KEY] += 1
                if value in ['LOCATION']:
                    self.textMap[line][self.LOC_KEY] += 1
                if value in ['ORGANIZATION']:
                    self.textMap[line][self.ORG_KEY] += 1
                if value in ['TITLE']:
                    self.textMap[line][self.TIT_KEY] += 1
                if value in ['NUMBER']:
                    continue
        except Exception as e:
            print("Unable to annotate " + str(line))
            print(e)
            return e
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 16 00:28:43 2020

@author: mingxi
"""

from nltk.parse import CoreNLPParser

pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')

# extract POS tags for the base corpus
base_text = open('/Users/mingxi/Desktop/TEMP/DISS/Grammar/Base/Base_all.txt', 'r').read()
base_pos = []
for i in base_text.split('.'):
    base_pos.append(list(pos_tagger.tag((i + '.').split())))

base_pos2 = []
for i in base_pos:
    if i[0][0] != '.':
        for j in i:
            base_pos2.append(j[1])

out = open('/Users/mingxi/Desktop/TEMP/DISS/Grammar/base_pos.txt', 'w')
out.write('\n'.join(base_pos2))
out.close()
def question_pipeline(question):
    lemmatizer = WordNetLemmatizer()
    porter = PorterStemmer()
    # Stanford CoreNLP is expected to be running at localhost:9000
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    corpus_dict = {}
    count = 0
    sent_text = question
    tokenized_text = nltk.word_tokenize(sent_text)
    question_types = ['who', 'when', 'where', 'Who', 'When', 'Where']
    type_of_question = [i for i in question_types if i in tokenized_text]
    lemma = [lemmatizer.lemmatize(word) for word in tokenized_text]
    stemmed = [porter.stem(word) for word in tokenized_text]  # Stemming the words
    # POS tagging the words to extract POS features
    tagged = nltk.pos_tag(tokenized_text)
    parse, = dep_parser.raw_parse(question)
    # Dependency parsing to extract parse-tree-based patterns as features
    dependency_parse = list(parse.triples())
    # LESK to extract the best sense of a word
    best_sense = [lesk(question, word) for word in tokenized_text]
    # tokenized_text_ner = nltk.word_tokenize(sent_text)  # Tokenizing sentences into words
    ner_tag = ner_tagger.tag(tokenized_text)
    head_list = []
    striped_sentence = sent_text.strip(" '\"")
    if striped_sentence != "":
        dependency_parser = dep_parser.raw_parse(striped_sentence)
        parsetree = list(dependency_parser)[0]
        head_word = ""
        head_word = [k["word"] for k in parsetree.nodes.values() if k["head"] == 0][0]
        if head_word != "":
            head_list.append([head_word])
        else:
            for i, pp in enumerate(tagged):
                if pp[1].startswith("VB"):
                    head_list.append([tokenized_text[i]])
                    break
            if head_word == "":
                for i, pp in enumerate(tagged):
                    if pp[1].startswith("NN"):
                        head_list.append([tokenized_text[i]])
                        break
    else:
        head_list.append([""])
    synonym_list = []
    hypernym_list = []
    hyponym_list = []
    meronym_list = []
    holonym_list = []
    for t in tokenized_text:
        best_sense = lesk(sent_text, t)  # LESK to extract the best sense of a word
        if best_sense is not None:
            this_synonym = t
            if best_sense.lemmas()[0].name() != t:
                this_synonym = best_sense.lemmas()[0].name()
            synonym_list.append(this_synonym)
            if best_sense.hypernyms() != []:
                hypernym_list.append(best_sense.hypernyms()[0].lemmas()[0].name())
            if best_sense.hyponyms() != []:
                hyponym_list.append(best_sense.hyponyms()[0].lemmas()[0].name())
            if best_sense.part_meronyms() != []:
                meronym_list.append(best_sense.part_meronyms()[0].lemmas()[0].name())
            if best_sense.part_holonyms() != []:
                holonym_list.append(best_sense.part_holonyms()[0].lemmas()[0].name())
        else:
            synonym_list.append(t)
    count = count + 1
    corpus_dict[count] = {}
    corpus_dict[count]["sentence"] = sent_text
    corpus_dict[count]["type_of_question"] = type_of_question
    corpus_dict[count]["tokenized_text"] = tokenized_text
    corpus_dict[count]["lemma"] = lemma
    corpus_dict[count]["stemmed"] = stemmed
    corpus_dict[count]["tagged"] = tagged
    corpus_dict[count]["dependency_parse"] = dependency_parse
    corpus_dict[count]["synonyms"] = synonym_list
    corpus_dict[count]["hypernyms"] = hypernym_list
    corpus_dict[count]["hyponyms"] = hyponym_list
    corpus_dict[count]["meronyms"] = meronym_list
    corpus_dict[count]["holonyms"] = holonym_list
    corpus_dict[count]["ner_tag"] = dict(ner_tag)
    corpus_dict[count]["head_word"] = head_list[0]
    return corpus_dict
# To run this you have to connect to the CoreNLP server API:
# go to the dir stanford-corenlp-full-2018-02-27 and type the two lines below
# in a terminal as one line:
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
#     -preload tokenize,ssplit,pos,lemma,ner,parse,depparse -status_port 9000 -port 9000 -timeout 15000 &

from nltk.parse import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9000')
list(parser.parse(doc))      # for a sentence-tokenized doc
list(parser.raw_parse(doc))  # for non-tokenized docs

# on a tokenized list of words
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
list(pos_tagger.tag(doc))

ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
list(ner_tagger.tag(doc))

from nltk.parse.corenlp import CoreNLPDependencyParser

dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
list(dep_parser.parse(doc))
def extractFeatures():
    stop_words = stopwords.words('english') + list(string.punctuation)
    file_loc = 'wikiTest/'
    os.chdir('/Users/ranjithreddykommidi/NLP/Project/wikiTest')
    file_names = glob.glob('*.txt')
    # Read every Wikipedia article given in the input file list
    for file in file_names:
        readfile = open(file, 'r')
        text = readfile.read()
        corpus = {}
        sent_text = nltk.sent_tokenize(text)
        dep_parser = CoreNLPDependencyParser(url='http://localhost:9010')
        ner_tagger = CoreNLPParser(url='http://localhost:9010', tagtype='ner')
        count = 0
        for sentence in sent_text:
            tokenized_text = [i for i in nltk.word_tokenize(sentence.lower())
                              if i not in stop_words]
            lemma = [WordNetLemmatizer().lemmatize(word) for word in tokenized_text]
            stemmed = [PorterStemmer().stem(word) for word in tokenized_text]
            tagged = nltk.pos_tag(tokenized_text)
            parse, = dep_parser.raw_parse(sentence)
            dependency_parse = list(parse.triples())
            tokenized_text_ner = nltk.word_tokenize(sentence)
            try:
                ner_tag = ner_tagger.tag(tokenized_text_ner)
            except:
                ner_tag = ner_tagger.tag(tokenized_text)
            Synonym = []
            Hypernym = []
            Hyponym = []
            Meronym = []
            Holonym = []
            Heads = []
            for t in tokenized_text:
                Nyms = lesk(sentence, t)
                if Nyms is not None:
                    this_synonym = t
                    if Nyms.lemmas()[0].name() != t:
                        this_synonym = Nyms.lemmas()[0].name()
                    Synonym.append(this_synonym)
                    if Nyms.hypernyms() != []:
                        Hypernym.append(Nyms.hypernyms()[0].lemmas()[0].name())
                    if Nyms.hyponyms() != []:
                        Hyponym.append(Nyms.hyponyms()[0].lemmas()[0].name())
                    if Nyms.part_meronyms() != []:
                        Meronym.append(Nyms.part_meronyms()[0].lemmas()[0].name())
                    if Nyms.part_holonyms() != []:
                        Holonym.append(Nyms.part_holonyms()[0].lemmas()[0].name())
                else:
                    Synonym.append(t)
            striped_sentence = sentence.strip(" '\"")
            if striped_sentence != "":
                dependency_parser = dep_parser.raw_parse(striped_sentence)
                parsetree = list(dependency_parser)[0]
                head_word = ""
                head_word = [k["word"] for k in parsetree.nodes.values() if k["head"] == 0][0]
                if head_word != "":
                    Heads.append([head_word])
                else:
                    for i, pp in enumerate(tagged):
                        if pp[1].startswith("VB"):
                            Heads.append([tokenized_text[i]])
                            break
                    if head_word == "":
                        for i, pp in enumerate(tagged):
                            if pp[1].startswith("NN"):
                                Heads.append([tokenized_text[i]])
                                break
            else:
                Heads.append([""])
            count = count + 1
            corpus[count] = {}
            corpus[count]["sentence"] = sentence
            corpus[count]["tokenized_text"] = tokenized_text
            corpus[count]["lemma"] = lemma
            corpus[count]["stem"] = stemmed
            corpus[count]["tag"] = tagged
            corpus[count]["dependency_parse"] = dependency_parse
            corpus[count]["synonyms"] = Synonym
            corpus[count]["hypernyms"] = Hypernym
            corpus[count]["hyponyms"] = Hyponym
            corpus[count]["meronyms"] = Meronym
            corpus[count]["holonyms"] = Holonym
            corpus[count]["ner_tag"] = str(dict(ner_tag))
            corpus[count]["head_word"] = Heads[0]
            corpus[count]["file_name"] = file[len(file_loc):]
        outputName = file[len(file_loc):]
        json_object = json.dumps(corpus, indent=4)
        with open(outputName, "w") as f:
            f.write(json_object)
tokenized_text = [i for i in nltk.word_tokenize(sentence)
                  if i not in stop_words]  # Tokenizing sentences into words
# Lemmatizing the words to extract lemmas as features
lemma = [lemmatizer.lemmatize(word) for word in tokenized_text]
stemmed = [porter.stem(word) for word in tokenized_text]  # Stemming the words
# POS tagging the words to extract POS features
tagged = nltk.pos_tag(tokenized_text)
parse, = dep_parser.raw_parse(sentence)
# Dependency parsing to extract parse-tree-based patterns as features
dependency_parse = list(parse.triples())
# best_sense = [lesk(sentence, word) for word in tokenized_text]  # LESK to extract the best sense of a word
tokenized_text_ner = nltk.word_tokenize(sentence)  # Tokenizing sentences into words
try:
    ner_tag = ner_tagger.tag(tokenized_text_ner)
except:
    ner_tag = ner_tagger.tag(tokenized_text)
head_list = []
striped_sentence = sentence.strip(" '\"")
if striped_sentence != "":
    dependency_parser = dep_parser.raw_parse(striped_sentence)
    parsetree = list(dependency_parser)[0]
    head_word = ""
    head_word = [k["word"] for k in parsetree.nodes.values() if k["head"] == 0][0]
    if head_word != "":
        head_list.append([head_word])
    else:
        for i, pp in enumerate(tagged):
driver.execute_script("arguments[0].click();",
                      button2.find_elements_by_class_name('_ni9axhe')[1])
all_text = ""
# add description details
for paragraph in button.find_elements_by_class_name('_6z3til'):
    all_text = all_text + " " + paragraph.text
# add neighborhood info
for paragraph in button2.find_elements_by_class_name('_6z3til'):
    all_text = all_text + " " + paragraph.text
print("Tagging text...")
# get all locations/cities from text on the website
the_list = [x[0] for x in ner_tagger.tag(all_text.split())
            if x[1] == 'LOCATION' or x[1] == 'CITY']
# find the borough
borough = find_borough(the_list)
print("\n%s is in %s" % (url, borough.upper()))
# get crime data for the borough
links_to_crime[url] = get_crime_data(borough)

# sort the listings by crimes that have occurred
ordered_listings = sorted(links_to_crime.items(), key=lambda x: x[1])
print("\n\nFINAL ORDER IS (also found in ordered.txt): ", ordered_listings)
write_listings_ordered(ordered_listings)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 16 00:28:43 2020

@author: mingxi
"""

from nltk.parse import CoreNLPParser

pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')

# extract POS tags for the Tedlium corpus
ted_text = open(
    '/Users/mingxi/Desktop/TEMP/DISS/Grammar/TEDLIUM_release2/stm_processed_final.txt',
    'r').read()
ted_pos = []
for i in ted_text.split('.'):
    ted_pos.append(list(pos_tagger.tag((i.capitalize() + '.').split())))

ted_pos2 = []
for i in ted_pos:
    if i[0][0] != '.':
        for j in i:
            ted_pos2.append(j[1])

out = open('/Users/mingxi/Desktop/TEMP/DISS/Grammar/ted_pos.txt', 'w')
out.write('\n'.join(ted_pos2))
out.close()
def process(text):
    # `text` should be a pre-tokenized list of words: CoreNLPParser.tag joins
    # the items it is given with spaces before sending them to the server.
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    tags = pos_tagger.tag(text)
    return tags
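# Hedged usage sketch (not part of the original source): the sentence is split
# into tokens before calling process(), and a CoreNLP server is assumed to be
# running on localhost:9000.
print(process('The quick brown fox jumps over the lazy dog .'.split()))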
# query = "Is Kubrick a director?"
# query = "Was Birdman the best movie in 2015?"
query = "Who directed Schindler's List?"
# query = "Who won the oscar for best actor in 2005?"
# query = "Which movie won the oscar in 2000?"
# query = "Who directed the best movie in 2010?"
# query = "Did Allen direct Mighty Aphrodite?"
finalquery = query
# queryNers = ner_tagger.tag((query.split()))
# for i in queryNers:
#     if i[1] == "DATE":
#         query = query.replace(i[0], "Date")
# print(query)
personName, movieName = [], []
pos_tags = list(pos_tagger.tag(query.split()))
pos_tags.append(('last', '$$$'))
print(pos_tags)
nnpTags, year = [], 0
for i, k in enumerate(pos_tags):
    if pos_tags[i + 1][1] == '$$$':
        break
    elif pos_tags[i][1] == 'NNP' and pos_tags[i + 1][1] != 'NNP':
        nnpTags.append(pos_tags[i][0])
    elif pos_tags[i][1] == 'NNP' and pos_tags[i + 1][1] == 'NNP':
        nnpTags.append(pos_tags[i + 1][0])
    if pos_tags[i][1] == 'CD':
        year = pos_tags[i][0]
print(nnpTags)
for i in nnpTags:
print([[(governor, dep, dependent)
        for governor, dep, dependent in parse.triples()]
       for parse in parses])
print(
    "\nExpected: [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]\n"
)

# Tokenizer
parser = CoreNLPParser(url='http://localhost:9000')
print(list(parser.tokenize('What is the airspeed of an unladen swallow?')))
print(
    "\nExpected: ['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?']\n"
)

# POS Tagger
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
print(
    list(pos_tagger.tag('What is the airspeed of an unladen swallow ?'.split())))
print(
    "\nExpected: [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]\n"
)

# NER Tagger
ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
print(
    list(
        ner_tagger.tag(
            'Rami Eid is studying at Stony Brook University in NY'.split())))
print(
    "\nExpected: [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')]\n"
)
def read_from_textgrid(self, file_list):
    pos_tagger = CoreNLPParser('http://localhost:9002', tagtype='pos')
    lex_table = read_lex_table(lex_table_path)
    variant_match = dict()
    # dict with the variant as key; tokens with no match are tagged later
    for r in zip(lex_table['word_variant'], lex_table['word_standard'],
                 lex_table['word_vars'], lex_table['POS_tag']):
        v_pattern = compile_pattern(r[0])
        if v_pattern not in variant_match.keys():
            variant_match[v_pattern] = []
        else:
            print(v_pattern)  # add it? no
        variant_match[v_pattern].append(r)
    gehen_variants = set()
    locations = lex_table.loc[lex_table['word_lemma'] == 'gehen']
    for gehen_var in zip(locations['word_variant'], locations['word_vars']):
        if "SAF5" not in gehen_var[1]:
            g_pattern = compile_pattern(gehen_var[0])
            gehen_variants.add(g_pattern)
    # for gehen_row in lex_table.loc[lex_table['word_lemma'] == 'gehen']['word_variant']:
    #     # check the word_vars
    #     if not any("SAF5" in wv for wv in lex_table.loc[lex_table['word_variant'] == gehen_row]['word_vars']):
    #         g_pattern = compile_pattern(gehen_row)
    #         gehen_variants.add(g_pattern)
    for each_file_name in file_list:
        # now combine the files of the same speakers
        print(each_file_name)
        interval_num = 0
        file_path = self.tg_path + each_file_name
        try:
            file_textgrid_obj = textgrid.TextGrid.fromFile(file_path)
        except UnicodeDecodeError:
            print(each_file_name + ': the encoding is weird, not utf-8 or ansi')
        tier_list = file_textgrid_obj.tiers
        for each_tier in tier_list:
            if each_tier.name == 'SWG':
                # read from the SWG tier
                tier_swg = each_tier
                intervals_swg = tier_swg.intervals
                try:
                    clauses = []
                    clause_annotation = []
                    time_segment = dict()
                    skip = False
                    begin_tag = ''
                    for each_annotation in intervals_swg:
                        annotation_mark = each_annotation.mark
                        beg_hms = timestamp_convert(each_annotation.minTime)
                        if not annotation_mark.strip():
                            continue
                        punct = [',', '.', '!', '?']  # maybe just . ! ?
                        tokens = annotation_mark.split()
                        time_segment[beg_hms] = tokens
                        for token in tokens:
                            # turn segments into clauses
                            if any(p in token for p in punct):
                                if all(c in string.punctuation for c in token):
                                    # this is for tokens like ... --- and ???
                                    if not clause_annotation:
                                        time_stamp = beg_hms
                                    clause_annotation.append(token)
                                    if len(token) > 3 or token in punct:
                                        # why do I do this again, still don't know
                                        clause_annotation.append(time_stamp)
                                        clauses.append(clause_annotation)
                                        clause_annotation = []
                                    continue
                                word_punct_split = re.findall(
                                    r"[^\w\d\s,.!?]*\w+[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*|[^\w\s]",
                                    token, re.UNICODE)  # separate word from punctuation
                                for wp in word_punct_split:
                                    # split annotations into clauses
                                    if not clause_annotation:
                                        time_stamp = beg_hms
                                    clause_annotation.append(wp)
                                    if all(c in punct for c in wp):
                                        clause_annotation.append(time_stamp)
                                        clauses.append(clause_annotation)
                                        clause_annotation = []
                            else:
                                if not clause_annotation:
                                    time_stamp = beg_hms
                                clause_annotation.append(token)
                    for cl in clauses:
                        if '[ANT]' in cl or '[REL]' in cl:
                            # print("clause", cl)
                            beg_hms = cl[-1]
                            # print("time", beg_hms)
                            cl = cl[:-1]
                            # print("cl", cl)
                            if cl[0] not in time_segment[beg_hms]:
                                # closer; what remains is the punctuation problem
                                segment_annotation = []
                                for token in time_segment[beg_hms]:
                                    segment_annotation += re.findall(
                                        r"[^\w\d\s,.!?]*\w+[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*|[^\w\s]",
                                        token, re.UNICODE)
                                if cl[0] not in segment_annotation:
                                    print(segment_annotation)
                                    print(cl[0])
                            else:
                                segment_annotation = time_segment[beg_hms]
                            sym_seq = segment_annotation.index(cl[0]) + 1
                            words_std = []
                            ddm_tags = []
                            pos_sent = []
                            # get DDM tags
                            for i, word in enumerate(cl):
                                if word:  # empty word check
                                    # match the word with word_variant
                                    std_list = set()
                                    ddm_list = set()
                                    pos_list = set()
                                    no_match = True
                                    rel = False
                                    # check for var: REL
                                    if i + 1 < len(cl):  # make sure the next word exists
                                        w_next = cl[i + 1]
                                        if "[REL]" in w_next:
                                            rel = True
                                            if "wo" in word:
                                                rel_var = " RELd"
                                            elif "als" in word or word.startswith("d") or \
                                                    word.startswith("wel") or word.startswith("jed"):
                                                rel_var = " RELs"
                                            elif ("was" in word) or ("wie" in word) or ("wer" in word):
                                                rel_var = " RLOs"
                                            else:
                                                rel_var = " UNK"
                                    for p in variant_match.keys():
                                        if p.search(word) is not None:  # .lower()
                                            no_match = False
                                            for values in variant_match[p]:
                                                swg = values[0].replace("*", "")
                                                # rum[ge]draat
                                                if "ge" in swg and "ge" not in word:
                                                    swg = swg.replace("ge", "g")  # for gespielt / gspielt
                                                std = values[1].replace("*", "")
                                                std_list.add(std)
                                                if isinstance(values[2], float) and math.isnan(values[2]):
                                                    # empty var_code: do nothing
                                                    pass
                                                else:
                                                    ddm_list.add(values[2])  # should be a set
                                                if isinstance(values[3], float) and math.isnan(values[3]):
                                                    pos_list.add('*')
                                                else:
                                                    pos_list.add(values[3])
                                    if no_match:
                                        standard = word
                                        ddm = "*"
                                        pos = pos_tagger.tag([word])[0][1]
                                        if "$" in pos:
                                            pos = "*"
                                    else:
                                        standard = " ".join(std_list)
                                        ddm = " ".join(str(d) for d in ddm_list)
                                        if any("SAF5" in d for d in ddm_list):
                                            for g_pattern in gehen_variants:
                                                if g_pattern.search(word) is not None:
                                                    print(ddm)
                                                    print(word)
                                                    print("!")
                                                    # gegang* / [ge]gang* will be tagged as SAF5
                                                    # k as prefix
                                                    ddm = ddm.replace("SAF5d", "")
                                                    ddm = ddm.replace("SAF5s", "")
                                                    print(ddm)
                                        pos = " ".join(str(p) for p in pos_list)
                                    if rel:
                                        if ddm != "*":
                                            ddm = ddm + rel_var
                                        else:
                                            ddm = rel_var
                                    ddm = ddm.strip()
                                    words_std.append(standard)
                                    ddm_tags.append(ddm)
                                    pos_sent.append(pos)
                            # columns
                            self.output_as_csv(
                                each_file_name[each_file_name.rfind("_") + 1:-9],
                                beg_hms, sym_seq, " ".join(cl),
                                " ".join(ddm_tags), " ".join(pos_sent))
                except AttributeError as e:
                    print(each_file_name + ': tier words is empty or does not exist')
                    traceback.print_tb(e.__traceback__)