def tagging(file):
    qstn = []
    qstn_ner = []
    qstn_pos = []
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    f = open(file)
    for line in f:
        qstn.append(line)
        # Saving POS tagging to list
        qstn_pos.append(pos_tagger.tag(line.split()))
        # Saving NER to list
        qstn_ner.append(ner_tagger.tag(line.split()))
    return qstn, qstn_ner, qstn_pos
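# Hedged usage sketch (not part of the original source): assumes a Stanford CoreNLP
# server is already running locally, e.g. started from the CoreNLP directory with
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
# and that 'questions.txt' is a placeholder file with one question per line.
questions, question_ner, question_pos = tagging('questions.txt')
print(question_pos[0])   # e.g. [('Where', 'WRB'), ('is', 'VBZ'), ('Paris', 'NNP'), ('?', '.')]
print(question_ner[0])   # e.g. [('Where', 'O'), ('is', 'O'), ('Paris', 'CITY'), ('?', 'O')]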
def pos_tag_text(text):
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None

    #print("ORIGINAL TEXT ---------------", text)
    #tagged_text = tag(text)
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    tagged_text = pos_tagger.tag(text.split())
    #tagged_text = nltk.pos_tag(text)
    #print("TAGGED TEXT ---------", tagged_text)
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in tagged_text]
    return tagged_lower_text
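# Hedged usage sketch (assumption: the same local CoreNLP server is reachable and
# wn refers to nltk.corpus.wordnet). Penn Treebank tags are mapped to WordNet POS
# constants; tags with no WordNet equivalent (e.g. determiners) come back as None.
print(pos_tag_text("The cat quickly jumped"))
# expected, roughly: [('the', None), ('cat', 'n'), ('quickly', 'r'), ('jumped', 'v')]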
def parseSentenceStructure(data):
    # Tokenize sentence
    tokens = nltk.word_tokenize(data)
    # Tag sentence
    tagged = nltk.pos_tag(tokens)
    # Parser
    parser = CoreNLPParser(url='http://localhost:9000')
    # https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk/51981566#51981566
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    # Parse with Stanford
    tree = parser.raw_parse(data)
    #print(list(tree))
    #list(tree)[0].pretty_print()
    #print(list(tree))
    # Provide N-V-N relationships with all N combinations
    # Traverse for NP root
    tree_recurse_find(list(tree)[0])
def PVD(document):
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    sentence = sent_tokenize(document)
    word = 'se'
    lemma = 'definir'
    for i in range(len(sentence)):
        #print('--------------------')
        pattern = list()
        postaglist = list()
        tokens = nltk.word_tokenize(sentence[i])
        tag = pos_tagger.tag(tokens)
        for t in tag:
            if ('se' in tokens):
                pos = tokens.index('se')
                front = tokens[pos + 1:pos + 2]
                tag = pos_tagger.tag(front)
                doc = nlp(t[0])
                lemlist = [tok.lemma_ for tok in doc]
                #lem = ''.join(lemlist)
                #lemmas_list.append(lem)
                #print(lemma, '-', lemlist)
                if ('definir' in lemlist or 'entender' in lemlist
                        or 'denominar' in lemlist):
                    #print(sentence[i])
                    front = tokens[pos + 2:pos + 5]
            if (t[1] == 'PUNCT'):
                pos = tokens.index(t[0])
                print(t[0], pos, tag[pos + 1])
            # if (t[1] == 'AUX'):
def init_server(self):
    STANDFORD = os.path.join("stanford-corenlp-full-2018-10-05")
    self.server = CoreNLPServer(
        os.path.join(STANDFORD, "stanford-corenlp-3.9.2.jar"),
        os.path.join(STANDFORD, "stanford-corenlp-3.9.2-models.jar"))
    self.server.start()
    self.parser = CoreNLPParser()
def _parseSentences(sentences, parseCacheFile):
    def cacheKey(text):
        return text.strip().encode('utf-8')

    cache = shelve.open(parseCacheFile)
    toParse = []
    for sentence in sentences:
        if cacheKey(sentence) not in cache:
            toParse.append(sentence)
    if toParse:
        p = Pool(10)
        parser = CoreNLPParser(
            url=os.getenv("CORENLP_HOST", "http://localhost:9000"))
        parseIterator = p.imap(lambda s: parser.parse_one(s.split()), toParse)
        progress = ProgressBar(len(toParse))
        for i, parse in enumerate(parseIterator):
            cache[cacheKey(toParse[i])] = parse
            progress.done(i)
        progress.complete()
    # Materialize the parses before closing the shelve cache; a lazy map would
    # only read from the cache after it has been closed.
    parses = [cache[cacheKey(s)] for s in sentences]
    cache.close()
    return parses
def __init__(self, w2v_path, corpus_dict_path, port=9000):
    # corenlp client
    self.parser = CoreNLPParser(url='http://localhost:' + str(port))
    self.dep_parser = CoreNLPDependencyParser(url='http://localhost:' + str(port))

    # w2v (e.g. data/saved_models/GoogleNews-vectors-negative300.bin)
    self.word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        w2v_path, binary=True)
    print('w2v model loaded')

    # training corpus for one-hot features
    corpus_dict = pickle.load(open(corpus_dict_path, 'rb'))
    self.dep_tuple_vectorizer = DictVectorizer(sparse=False)
    self.dep_tuple_vectorizer = self.dep_tuple_vectorizer.fit(
        corpus_dict['dep_tuple'])
    self.unigram_vectorizer = DictVectorizer(sparse=False)
    self.unigram_vectorizer = self.unigram_vectorizer.fit(
        corpus_dict['unigram'])
    self.bigram_vectorizer = DictVectorizer(sparse=False)
    self.bigram_vectorizer = self.bigram_vectorizer.fit(
        corpus_dict['bigram'])
    self.trigram_vectorizer = DictVectorizer(sparse=False)
    self.trigram_vectorizer = self.trigram_vectorizer.fit(
        corpus_dict['trigram'])
    self.lexical_vectorizer = DictVectorizer(sparse=False)
    self.lexical_vectorizer = self.lexical_vectorizer.fit(
        corpus_dict['lexical'])
def dependency_parse(raw_data):
    from nltk.parse.corenlp import CoreNLPServer

    # The server needs to know the location of the following files:
    #   - stanford-corenlp-X.X.X.jar
    #   - stanford-corenlp-X.X.X-models.jar
    STANFORD = os.path.join("..", "stanford-corenlp-full-2020-04-20")

    # Create the server
    server = CoreNLPServer(
        os.path.join(STANFORD, "stanford-corenlp-4.0.0.jar"),
        os.path.join(STANFORD, "stanford-corenlp-4.0.0-models.jar"),
    )

    # Start the server in the background
    server.start()

    from nltk.parse import CoreNLPParser
    parser = CoreNLPParser()

    new_data = []
    for example in raw_data:
        sentence, features_seq = example[0], example[-1]
        parse = next(parser.raw_parse(sentence))
        # get a few "important" neighboring words

    # Stop the server only after all examples have been parsed
    server.stop()
def check_triples_by_pos(triples):
    pos_tagger = CoreNLPParser(url='http://39.98.186.125:9000', tagtype='pos')
    ret_triples = []
    for triple in triples:
        source = triple[0]
        relation = triple[1]
        target = triple[2]
        source_pos = ",".join(
            [e[1] for e in pos_tagger.tag(source.split(" "))])
        relation_pos = ",".join(
            [e[1] for e in pos_tagger.tag(relation.split(" "))])
        target_pos = ",".join(
            [e[1] for e in pos_tagger.tag(target.split(" "))])
        if "VB" in source_pos or "VB" in target_pos:
            continue
        if "NN" not in source_pos or "NN" not in target_pos:
            continue
        if "NN" in relation_pos:
            if " at" in relation.lower():
                relation = "at"
            elif "of" not in relation.split(" ") and len(
                    relation.split(" ")) > 1:
                continue
        ret_triples.append([source, relation, target])
    return ret_triples
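# Hedged usage sketch (assumption: the POS-tagging CoreNLP endpoint above is reachable).
# Triples are [source, relation, target] strings; arguments that contain verbs or lack
# a noun are discarded, so typically only the first triple below survives.
sample_triples = [
    ["the museum", "is located in", "the old town"],
    ["running", "is", "fun"],
]
print(check_triples_by_pos(sample_triples))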
def pos_tagger_lemma(document, listterms):
    print('Definition via POS tagger and lemma, searching 3-, 2- and 1-grams')
    text = str()
    definiendums = list()
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    for i in document:
        if (len(i) > 1):
            tag = pos_tagger.tag(i.split(' '))
            for t in tag:
                if (t[1] == 'VERB'):
                    doc = nlp(t[0])
                    for tok in doc:
                        l = tok.lemma_
                    if (l == 'ser'):
                        text = i
                        indverb = i.index(t[0])
                        front = i[indverb:]
                        back = i[:indverb + len(t[0]) + 1]
                        tagfront = pos_tagger.tag(front.split(' '))
                        tagback = pos_tagger.tag(back.split(' '))
                        definiendum_definition(t[0], text, listterms)
                elif (t[1] == 'NOUN' and t[0] != '=RRB='):
                    text = i
                    if (len(t[0]) > 1):
                        #definiendum_definition(t[0], text, listterms)
                        pass
    return (text)
def text_2_triple_list(text, strength):
    nlp = spacy.load("en")
    neuralcoref.add_to_pipe(nlp)
    api = CoreNLPParser(url='http://39.98.186.125:9000')
    api.parser_annotator = "tokenize,ssplit,coref,openie"
    parser = CoreNLPParser(url='http://39.98.186.125:9000')
    text = clean_text(text)
    text = remove_adjective_possessive_pronoun(text)
    doc = nlp(text)
    text = doc._.coref_resolved
    entities = []
    entities_labels = []
    for e in doc.ents:
        if e.label_ in supported_entity_types:
            entities.append(e.text)
            entities_labels.append(e.label_)
    json_text = api.api_call(text)
    openie_sentences = ssplit_article_into_sentences(json_text, step=-1)
    syntax_sentences = ssplit_article_into_sentences(json_text, step=1)
    triples = []
    for sentence in openie_sentences:
        json_sentence = api.api_call(sentence)
        triples += extract_triples_by_openie(json_sentence)
    syntax_triples = []
    for sentence in syntax_sentences:
        syntax_tree = list(parser.raw_parse(sentence))[0]
        cur_syntax_triples = []
        parse_tree_v2(syntax_tree, cur_syntax_triples)
        syntax_triples += cur_syntax_triples
    triples = filter_triples_by_entities(triples, entities, strength)
    triples = beautify_triples(triples)
    triples = remove_meaningless_triples(triples)
    triples = check_triples_by_pos(triples)
    triples = remove_duplicate_triples(triples)
    syntax_triples = beautify_triples(syntax_triples)
    syntax_triples = remove_meaningless_triples(syntax_triples)
    syntax_triples = check_triples_by_pos(syntax_triples)
    syntax_triples = remove_duplicate_triples(syntax_triples)
    triples = normalize_entities(triples, syntax_triples)
    triples = remove_duplicate_triples(triples)
    return generate_structured_triples(triples, entities, entities_labels)
def pruebas():
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    tag = pos_tagger.tag(
        'tengo que ir Por el contrato de compra y venta uno de los contratantes se obliga a entregar una cosa determinada y el otro a pagar por ella un precio cierto, en dinero o signo que lo represente'
        .split(' '))
    doc = nlp('considerará')
    lemlist = [tok.lemma_ for tok in doc]
    print(lemlist)
def __init__(self, host='services.loadbalancer.api.questo.ai', port=9000,
             separator='|'):
    # self.parser = CoreNLPParser(url=f'http://{host}:{port}')
    self.parser = CoreNLPParser(
        url='http://services.loadbalancer.api.questo.ai:9000')
    self.separator = separator
def scorer(title, speaker):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    pos_tag_list = list(pos_tagger.tag(title[0].split()))
    s = 0
    for i in pos_tag_list:
        if 'NN' in i[1] or 'NP' in i[1]:
            s += 1
    return -s + abs(title[1] - speaker[1]) * 0.3
def __init__(self, config):
    self.ontology_tagging = OntologyTagging()
    self.config = config
    self.word_dictionary = self.compute_all_embeddings()
    self.server_url = 'http://localhost:9000'
    self.parser = CoreNLPParser(url=self.server_url)
    self.core_nlp_dependency_parser = CoreNLPDependencyParser(
        url=self.server_url)
def clear_data(self):
    self.parser = CoreNLPParser(url='http://localhost:9000')
    self.first_NP = ''
    self.first_VP = ''
    self.parse_tree = None
    self.subject = RDF_Triple.RDF_SOP('subject')
    self.predicate = RDF_Triple.RDF_SOP('predicate', 'VB')
    self.Object = RDF_Triple.RDF_SOP('object')
def parse(text):
    parser = CoreNLPParser("http://localhost:9000")
    result = parser.raw_parse(text.lower())
    trees = [tree for tree in result]
    for tree in trees:
        tree.chomsky_normal_form()
        tree.collapse_unary(collapseRoot=True, collapsePOS=True)
    trees = [ParentedTree.convert(tree) for tree in trees]
    return trees
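# Minimal usage sketch (assumption: a CoreNLP server is running on localhost:9000).
# The returned trees are ParentedTree objects in Chomsky normal form, so they can be
# walked with parent()/left_sibling()-style accessors.
trees = parse("The quick brown fox jumps over the lazy dog.")
trees[0].pretty_print()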
def verb_stats(data):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    verb_count = 0
    for _, value in data.items():
        pos = list(pos_tagger.tag(value.split()))
        for _, second in pos:
            if second.startswith("V"):
                verb_count += 1
    print(verb_count)
def parse(text):
    parser = CoreNLPParser(CORENLP_SERVER)
    result = parser.raw_parse(text)
    trees = [tree for tree in result]
    for tree in trees:
        tree.chomsky_normal_form()
        tree.collapse_unary(collapseRoot=True, collapsePOS=True)
    trees = [ParentedTree.convert(tree) for tree in trees]
    return trees
def syntax_tree_parser(self):
    """
    get syntax tree
    :return: syntax tree
    """
    if self.syntax_tree is not None:
        return self.syntax_tree
    parser = CoreNLPParser(url='http://localhost:8999')
    self.syntax_tree = list(parser.parse(nltk.word_tokenize(self.text)))[0]
    return self.syntax_tree
def get_probable_title(titles):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    score = []
    for title in titles:
        pos_tag_list = list(pos_tagger.tag(title[0].split()))
        s = 0
        for i in pos_tag_list:
            if 'NN' in i[1] or 'NP' in i[1]:
                s += 1
        score.append((s, len(title[0]), title[0], title[1]))
    return max(score)
def add_pos_tag(self, words, tech_pair):
    tagged_words = CoreNLPParser(url='http://localhost:9000',
                                 tagtype='pos').tag(words)
    # print tagged_words
    tag_list = []
    for (word, tag) in tagged_words:
        if word in tech_pair.split("\t"):
            tag_list.append("TECH")
        else:
            tag_list.append(tag)
    return tag_list
def __init__(self):
    """
    Initialize the SVO Methods
    """
    self.noun_types = ["NN", "NNP", "NNPS", "NNS", "PRP"]
    self.verb_types = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
    self.adjective_types = ["JJ", "JJR", "JJS"]
    self.pred_verb_phrase_siblings = None
    jar = r'D:\data'
    model = r'your_path/stanford-postagger-full-2016-10-31/models/english-left3words-distsim.tagger'
    self.parser = CoreNLPParser(url='http://localhost:9000')
    self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
def get_speaker_salutaion(text, persons, speakers):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    text = text.split()
    pos_tag_list = list(pos_tagger.tag(text))
    for i in range(len(pos_tag_list) - 1):
        if pos_tag_list[i][1] == 'NNP' and pos_tag_list[i][0] not in persons:
            # Compare the word itself (not the (word, tag) tuple) against known persons
            if pos_tag_list[i + 1][0] in persons:
                for j, speak in enumerate(speakers):
                    if pos_tag_list[i + 1][0] in speak:
                        # Prepend the salutation token to the matching speaker entry
                        speakers[j] = pos_tag_list[i][0] + ' ' + speak
                        break
    return speakers
def run():
    rule = TimeExpressions()
    parser = CoreNLPParser(url='http://localhost:9000')

    input = "They will play against a team of North Koreans on Wednesday , which is believed to be Kim 's birthday ."
    exp_output = "They will play against a team of North Koreans ."
    one_test(rule, parser, input, exp_output)

    input = "The pitch was effective ."
    one_test(rule, parser, input)

    input = "Patti Davis , a former first daughter , wrote a public letter to Malia and Sasha on Sunday ."
    exp_output = "Patti Davis , a former first daughter , wrote a public letter to Malia and Sasha ."
    one_test(rule, parser, input, exp_output)
def get_stanford_pos_tags(line):
    """
    Get part of speech tags using the Stanford POS tagger
    """
    st_pos = CoreNLPParser(url="http://localhost:9000", tagtype="pos")
    tokenized_line = cnf.TOKENIZER.tokenize(line)
    line_tagged_initial = st_pos.tag(tokenized_line)
    line_tagged_output = []
    for item in line_tagged_initial:
        line_tagged_output.append((item[0], item[1]))
    return line_tagged_output
def createGrammar(self, userMessages, ctx):
    parser = CoreNLPParser(url='http://localhost:9000')
    parse_trees = []
    for message in userMessages:
        tokenized = nltk.sent_tokenize(message)
        for sentence in tokenized:
            parse_trees.append(list(parser.raw_parse(sentence))[0])
    grammar_rules = set()
    for tree in parse_trees:
        for production in tree.productions():
            grammar_rules.add(production)
    start = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(start, grammar_rules)
    return (' '.join((self.generate_sentence(grammar))))
def __init__(self, tool, file):
    self.t_name = tool
    self.t_synset = wn.synset(self.t_name)
    self.hypernyms = []
    self.level = 0
    self.file = file
    self.parser = CoreNLPParser(url='http://localhost:9000')
    self.dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    self.telic_roles = []
    self.first_words = []
def get_entities(tweets):
    entities = []
    for atweet in tweets:
        for sent in atweet:
            pos_tagger = CoreNLPParser(url='http://localhost:9000',
                                       tagtype='pos')
            sent = list(pos_tagger.tag(normalize(sent)))
            # sent = pos_tag(normalize(sent))
            trees = ne_chunk(sent)
            for tree in trees:
                if hasattr(tree, 'label'):
                    if tree.label() in labels:
                        entities.append(' '.join(
                            [child[0].lower() for child in tree]))
    return entities
def run():
    rule = PreposedAdjuncts()
    parser = CoreNLPParser(url='http://localhost:9000')

    input = "The pitch was effective ."
    one_test(rule, parser, input)

    input = "According to a now finalized blueprint described by U.S. officials and other sources, " \
            "the Bush administration plans to take complete, unilateral control of a post-Saddam Hussein Iraq."
    exp_output = "the Bush administration plans to take complete , unilateral control of a post-Saddam Hussein Iraq ."
    one_test(rule, parser, input, exp_output)

    input = "While never forgetting her Chinese roots , Yee has grown to love America over the years ."
    exp_output = "Yee has grown to love America over the years ."
    one_test(rule, parser, input, exp_output)