from opencc import OpenCC
from nltk import RegexpParser, pos_tag, word_tokenize
from stanfordcorenlp import StanfordCoreNLP


def getParse(sentence) -> str:
    # Preset
    nlp = StanfordCoreNLP('stanford-corenlp-4.2.0/', memory='8g')
    cc = OpenCC('t2s')
    # sentence = 'Those two splendid old electric trains.'
    print("##################################################################################")

    # POS
    print('POS:', nlp.pos_tag(sentence))
    print("##################################################################################")

    # Tokenize
    print('Tokenize:', nlp.word_tokenize(sentence))
    print("##################################################################################")

    # NER
    print('NER:', nlp.ner(sentence))
    print("##################################################################################")

    # Parser
    tree = nlp.parse(sentence)
    parse_string = ' '.join(str(tree).split())
    print(parse_string)

    # ParserTest
    print('Parser:')
    print(nlp.parse(sentence))
    print("##################################################################################")

    # Tree graph
    tagged = pos_tag(word_tokenize(sentence))

    # Extract all parts of speech from any text
    chunker = RegexpParser("""
        NP: {<DT>?<JJ>*<NN>}    # To extract Noun Phrases
        P: {<IN>}               # To extract Prepositions
        V: {<V.*>}              # To extract Verbs
        PP: {<P> <NP>}          # To extract Prepositional Phrases
        VP: {<V> <NP|PP>*}      # To extract Verb Phrases
    """)

    # Print all parts of speech in the above sentence
    output = chunker.parse(tagged)
    print("After Extracting\n", output)

    # To draw the parse tree
    output.draw()
    print("##################################################################################")

    # Close Stanford Parser
    nlp.close()
    return str(parse_string)
Chinese & English tokenization: StanfordTokenizer
Chinese & English POS tagging: StanfordPOSTagger
Chinese & English named entity recognition: StanfordNERTagger
Chinese & English constituency parsing: StanfordParser
Chinese & English dependency parsing: StanfordDependencyParser, StanfordNeuralDependencyParser
"""
from preprocessing import preprocess_string
from preprocessing import strip_numeric, remove_stopwords, strip_punctuation, tokenize
from timeit import default_timer
from stanfordcorenlp import StanfordCoreNLP

begin = default_timer()

str_test = u'''云南铜业股份有限公司(深交所:000878),简称云铜股份、云铜,前身为云南冶炼厂,成立于1958年,1998年改制为股份公司,更名为现称,1998年6月2日于深圳证券交易所上市。公司是中国第四大铜业企业,生产高纯阴极铜、电工用铜线坏、工业硫酸、金锭、银锭、电工用圆铜线、硫酸铜等主产品,并能综合回收金、银、铝、铋、铂、钯等多种有色金属。2007年10月,中国铝业收购云铜母公司云南铜业集团的49%股权,改名“中铝云南铜业集团”。'''

filter_setting = [tokenize, strip_punctuation]
text = preprocess_string(str_test, filter_setting)

nlp = StanfordCoreNLP('/home/weiwu/tools/stanford-corenlp-full-2017-06-09/', lang='zh')
tokenize = nlp.word_tokenize(str_test)  # note: this rebinds the imported `tokenize` helper
pos_tag = nlp.pos_tag(str_test)
ner = nlp.ner(str_test)
parse = nlp.parse(str_test)
depend = nlp.dependency_parse(str_test)
end = default_timer()

from stanfordcorenlp import StanfordCoreNLP
import logging
import json
from collections import defaultdict


class StanfordNLP:
    def __init__(self, host='http://localhost', port=9000):
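        # A minimal completion sketch, assuming a CoreNLP server is already running
        # on host:port; the attribute names are illustrative, and the properties
        # mirror the annotate() props used elsewhere in these examples.
        self.nlp = StanfordCoreNLP(host, port=port, timeout=30000)
        self.props = {
            'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }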
# stanfordcorenlp by Lynten Guo. A Python wrapper to Stanford CoreNLP server, version 3.9.1.
# PyPI page: pip install stanfordcorenlp

# Simple usage
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(r'D:\samli_202010\CoreNLP\CoreNLP\stanford-corenlp-4.1.0')
# nlp = StanfordCoreNLP('http://localhost', port=9000)

# Debug the wrapper
# nlp = StanfordCoreNLP(r'path_or_host', logging_level=logging.DEBUG)

# Check more info from the CoreNLP Server
# nlp = StanfordCoreNLP(r'path_or_host', quiet=False,
#                       logging_level=logging.DEBUG)

sentence = 'I go to aist in Tokyo everyday.Tokyo is the capital city of Japan.'

print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
# print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))

# Do not forget to close! The backend server will consume a lot of memory.
nlp.close()
from stanfordcorenlp import StanfordCoreNLP
import teasting
import re
import gensim.downloader as api
import time
import open_traning_data

start = time.time()
word_vectors = api.load("glove-wiki-gigaword-100")
teasts = teasting.teast()
data = open_traning_data.open_data()
nlp = StanfordCoreNLP(r'./stanford-corenlp-full-2018-10-05')

sentece_we_got_wong = []
for test_time in range(4927):
    data = teasts.full_teast()
    sentence1 = data[0]
    sentence2 = data[1]
    # full stop remover: drop a trailing period
    if sentence1[-1] == ".":
        sentence1 = sentence1[:-1]
    if sentence2[-1] == ".":
        sentence2 = sentence2[:-1]
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(r'F:\stanford-corenlp-full-2018-10-05')

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))

nlp.close()
    rf.write(json.dumps(relation_id))
    print("count_re: ", count_re, "\t count_na: ", count_na, "\t count_total: ", count_re + count_na)
    print("total_sentence_used: ", total_sentence_used)
    total_len = count_re + count_na if count_re + count_na < args.max_sentence else args.max_sentence
    train_list = RES_list[:int(0.8 * total_len)]
    test_list = RES_list[int(0.8 * total_len) + 1:]
    json.dump(train_list, trf)
    json.dump(test_list, tef)


def linecount(file_path):
    count = -1
    for count, line in enumerate(open(file_path, 'r', encoding='utf-8')):
        pass
    return count + 1


if __name__ == '__main__':
    # use StanfordCoreNLP to tag NER
    nlp = StanfordCoreNLP(args.stanford_path, lang='zh', logging_level=logging.WARNING)
    # use jieba to segment sentences
    # jieba.load_userdict(args.jieba_dict)
    clean_sql_output(args.raw_sql_input, args.raw_sql_output)
    build_entity_relation(args.raw_sql_output, args.train_file, args.test_file, args.disambi_attr_title)
class StanfordParser(OieParser):
    def __init__(self):
        self.nlp = StanfordCoreNLP('/home/xliucr/stanford-corenlp/stanford-corenlp-full-2018-02-27', memory='4g')
        # self.nlp = StanfordCoreNLP(r'/Users/Sean/Workspace/stanford-corenlp/stanford-corenlp-full-2018-02-27', memory='2g')
        self.wnl = WordNetLemmatizer()

    def _get_path(self, index, dependency_parse):
        path = []
        root_index = dependency_parse[0][2]  # begins from 1
        index += 1
        if index == root_index:
            return path
        while index != root_index:
            if index > root_index:
                index -= 1
            path.append(dependency_parse[index][0])
            index = dependency_parse[index][1]
        return path

    def parse(self, sentence_entity):
        try:
            _id, sentence, entities = sentence_entity
            sentence = sentence.strip()
            entities_replace = []
            sentence_replace = sentence
            for i in range(len(entities)):
                e = entities[i].replace('_', ' ').title()
                entities_replace.append((len(e), e))
            for i, e in enumerate(sorted(entities_replace, reverse=True)):
                sentence_replace = sentence_replace.replace(entities[i], e[1])
            if not isinstance(_id, str):
                _id = 'id_' + str(_id)
            # entity
            ner = self.nlp.ner(sentence_replace)
            ner_indice = [-1] * len(entities_replace)
            for j, x in enumerate(ner):
                for i, e in enumerate(entities_replace):
                    if ner_indice[i] == -1 and e[1].startswith(x[0]):
                        ner_indice[i] = j
            for index in ner_indice:
                if index == -1:
                    return []
            entitiy_types = '-'.join(map(lambda x: (ner[x][1]), ner_indice))
            # tag
            tag = ' '.join(map(lambda x: x[1], self.nlp.pos_tag(sentence_replace)))
            # lexicalized dependency path
            dependency_parse = self.nlp.dependency_parse(sentence_replace)
            for i in range(1, len(dependency_parse)):
                # if dependency_parse[i][0] == 'ROOT':
                if dependency_parse[i][1] == 0:
                    return []
            left_path = self._get_path(ner_indice[0], dependency_parse)
            right_path = self._get_path(ner_indice[1], dependency_parse)
            # trigger
            root_index = dependency_parse[0][2] - 1
            root = self.wnl.lemmatize(self.wnl.lemmatize(ner[root_index][0]), 'v')
            trigger = 'TRIGGER:%s' % (root)
            dependency_path = ''
            for x in left_path:
                dependency_path += '<-' + x
            dependency_path += '<-' + root + '->'
            for x in right_path[::-1]:
                dependency_path += x + '->'
            return dependency_path, entities[0], entities[1], entitiy_types, trigger, _id, sentence, tag
        except:
            return []

    def shutdown(self):
        self.nlp.close()
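# Context for _get_path() above: the wrapper's dependency_parse() returns
# (relation, head_index, dependent_index) triples, where token indices start at 1
# and head 0 marks the ROOT, so dependency_parse[0][2] is the root token's index.
# A quick sketch of that shape (exact relation labels depend on the CoreNLP version):
from stanfordcorenlp import StanfordCoreNLP

_nlp = StanfordCoreNLP('/home/xliucr/stanford-corenlp/stanford-corenlp-full-2018-02-27', memory='4g')
print(_nlp.dependency_parse('Barack Obama was born in Hawaii.'))
# roughly: [('ROOT', 0, 4), ('compound', 2, 1), ('nsubjpass', 4, 2), ...]
_nlp.close()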
            output_word += "'s"  # add the possessive morpheme
        output_word += token['after']
        print(output_word, end='')


text = "Tom and Jane are good friends. They are cool. He knows a lot of things and so does she. His car is red, but " \
       "hers is blue. It is older than hers. The big cat ate its dinner."
text0 = 'Barack Obama was born in Hawaii. He is the president. Obama was elected in 2008.'
text2 = "The music was so loud that it couldn\'t be enjoyed." \
        "Our neighbors dislike the music. If they are angry, the cops will show up soon." \
        "If they are angry about the music, the neighbors will call the cops." \
        "Despite heri difficulty, Wilmai came to understand the point."

nlp = StanfordCoreNLP('/home/polo/Downloads/stanford-corenlp-full-2018-10-05/', quiet=False)
props = {'annotators': 'dcoref', 'pipelineLanguage': 'en'}

output = json.loads(nlp.annotate(text, properties=props))
# output = nlp.annotate(text, properties={'annotators': 'dcoref', 'outputFormat': 'json', 'ner.useSUTime': 'false'})

resolve(output)
print('Original:', text)
print('_________________________________________')
print('Resolved: ', end='')
print_resolved(output)

nlp.close()
# draw()
import operator

print(sorted(queryType.items(), key=operator.itemgetter(1)))
print(sum)

top20howMany = {}
top20howMany = collections.Counter(howMany).most_common(20)
# print(top20howMany)

newdict = {}
for obj in top20howMany:
    newdict[obj[0]] = obj[1]
    # print(obj[1][0])

nlp = StanfordCoreNLP('http://corenlp.run', port=80)

tags = []
parse = []
dependency = []
for key in newdict.keys():
    tags.append((nlp.pos_tag(newdict[key][0])))
    parse.append(nlp.parse(newdict[key][0]))
    dependency.append((nlp.dependency_parse(newdict[key][0])))

print('tag')
# print(tags[1])
# print(parse[1])
# print(dependency[1])
# print(howMany['1-1037590-1'])
for i in range(len(tags)):
# encoding=utf8
import os, gc, re, sys
from stanfordcorenlp import StanfordCoreNLP

stanford_nlp = StanfordCoreNLP("/home/kuo/NLP/module" + os.sep + 'stanfordnlp', lang='zh')


def ner_stanford(raw_sentence, return_list=True):
    if len(raw_sentence.strip()) > 0:
        return stanford_nlp.ner(raw_sentence) if return_list else iter(stanford_nlp.ner(raw_sentence))


def cut_stanford(raw_sentence, return_list=True):
    if len(raw_sentence.strip()) > 0:
        return stanford_nlp.pos_tag(raw_sentence) if return_list else iter(stanford_nlp.pos_tag(raw_sentence))
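# A quick usage sketch for the helpers above; the sample sentence is illustrative.
# Both helpers return (token, tag) pairs from the zh pipeline.
if __name__ == '__main__':
    demo = '云南铜业股份有限公司成立于1958年'
    print(ner_stanford(demo))   # list of (token, NER-tag) pairs
    print(cut_stanford(demo))   # list of (token, POS-tag) pairs
    stanford_nlp.close()        # release the backend server when done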
# For Testing Purposes.
### DO NOT MAKE CHANGES TO TESTING VARIABLE HERE ###
### ONLY MAKE CHANGES FROM COMMAND-LINE OPTIONS ###
DEBUG = False  # True/False.

# Get the directory the executable files are located in, relative to this Python file.
execDir = os.path.dirname(os.path.realpath(__file__))

# Variable to hold the training data location. Can be changed via a command-line parameter.
trainingDataDir = os.path.join(execDir, "TrainingData")

# Set up a Solr instance. The timeout is optional.
solr = pysolr.Solr('http://localhost:8983/solr/part3core', timeout=10)

# Set up a StanfordCoreNLP instance to get the head word.
nlp = StanfordCoreNLP('http://localhost', port=9000)

# Flags for query variables and their scores.
# Higher scores put more weight on those search terms.
# Default all to 1 for now.
# Can use these flags to automate testing later.
SENTENCE_FLAG = True
SENTENCE_WEIGHT = 1
LEMMA_FLAG = True
LEMMA_WEIGHT = 1
STEM_FLAG = True
STEM_WEIGHT = 1
POSTAG_FLAG = True
POSTAG_WEIGHT = 1
HEADWORD_FLAG = True
HEADWORD_WEIGHT = 1
def get(url, country): import re import operator from py2neo import Graph, Node, Relationship from stanfordcorenlp import StanfordCoreNLP import urllib.request, urllib.error, urllib.parse import json import os from pprint import pprint try: from urllib import request except: from urllib2 import urlopen as request from urllib2 import Request from bs4 import BeautifulSoup #open a graph database graph = Graph("http://*****:*****@id']}") continue if class_details["@id"] not in id_set: id_set.append(class_details["@id"]) label_set.append(class_details["prefLabel"]) if result["hierarchy"]: print("\n\tHierarchy annotations") for annotation in result["hierarchy"]: try: class_details = get_json(annotation["annotatedClass"]["links"]["self"]) except urllib.error.HTTPError: print(f"Error retrieving {annotation['annotatedClass']['@id']}") continue pref_label = class_details["prefLabel"] or "no label" print("\t\tClass details") print("\t\t\tid: " + class_details["@id"]) print("\t\t\tprefLabel: " + class_details["prefLabel"]) print("\t\t\tontology: " + class_details["links"]["ontology"]) print("\t\t\tdistance from originally annotated class: " + str(annotation["distance"])) HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7' } m = request.urlopen(request.Request(url, headers=HEADERS)).read() s = BeautifulSoup(m, "html.parser") metadata = s.findAll("p", attrs={"class":"bodytext"}) n=len(metadata) #find official name official_name = "" for nn in range(1,n): if len(metadata[nn].get_text().split()) < 3: nx=nn break else: official_name += metadata[nn].get_text() print(country + "--" + official_name) print("processing...") ##create ontology official_name_onto = Node("dietary guidelines", name=official_name, area = country) graph.create(official_name_onto) ##nlp nation_set=[] res = stanford_model.ner(official_name) ##"n_t" = a temporary used number for n_t in range(0,len(res)): id_set=[] label_set=[] if res[n_t][1]=="NATIONALITY": if res[n_t][0] not in nation_set: nation_set.append(res[n_t][0]) text_to_annotate = res[n_t][0] annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(text_to_annotate)+"&ontologies=NCIT,MESH") print_annotations(annotations) Nation_onto=Node("nation", name=res[n_t][0]) relation = Relationship(official_name_onto, "nation/language", Nation_onto) graph.create(relation) for nn in range(0,len(id_set)): identifier=Node("identifier", url=id_set[nn]) relation = Relationship(Nation_onto, "identifier", identifier) graph.create(relation) #find all publication years publication_year = [] for nn in range(nx+1,n): if len(metadata[nn].get_text().split()) < 4: nx=nn break else: publication_year.append(metadata[nn].get_text()) publication_year_para_onto = Node("description", content = publication_year) relation = Relationship(official_name_onto, "publication year", publication_year_para_onto) graph.create(relation) res = stanford_model.ner(str(publication_year)) date_set=[] for n_t in range(0,len(res)): if res[n_t][1]=="DATE": if res[n_t][0] not in nation_set: date_set.append(res[n_t][0]) date_set = re.findall('\d+', str(date_set)) date_set = sorted(date_set, reverse=False) for date in date_set: publication_year_onto=Node("value", name = date, value = date, unit= "year") relation = Relationship(publication_year_para_onto, "has value", publication_year_onto) graph.create(relation) #calculation publication frequency if len(date_set) > 1: frequency=(int(date_set[len(date_set)-1]) - int(date_set[0]))/(len(date_set)-1) 
frequency_onto=Node("publication frequency", name="each " + str(int(frequency))+" years", value=int(frequency), unit="year") relation = Relationship(official_name_onto, "publication frequency", frequency_onto) graph.create(relation) #find stakeholders stakeholders = [] for nn in range(nx+1,n): if len(metadata[nn].get_text().split()) < 3: nx=nn break else: stakeholders.append(str(metadata[nn].get_text())) stakeholders_para_onto = Node("description", content = stakeholders) relation = Relationship(official_name_onto, "stakeholders", stakeholders_para_onto) graph.create(relation) res = stanford_model.ner(str(stakeholders)) organization = "" organization_set = [] for n_t in range(0,len(res)): if res[n_t][1]=="ORGANIZATION": organization += " " + res[n_t][0] else: if organization != "": organization_set.append(str(organization)) organization = "" organization_set_clean=[] for organization in organization_set: if organization not in organization_set_clean: organization_set_clean.append(organization) for organization in organization_set_clean: stakeholder_onto = Node("stakeholder", name = organization) relation = Relationship(stakeholders_para_onto, "has value", stakeholder_onto) graph.create(relation) #find audiance audience = [] for nn in range(nx+1,n): nx=nn if len(metadata[nn].get_text().split()) < 3: break else: audience.append(metadata[nn].get_text()) audience_onto = Node("audience", content = audience) relation = Relationship(official_name_onto, "audience", audience_onto) graph.create(relation) res = stanford_model.ner(str(audience)) age="" age_set=[] ##nlp for getting age for n_t in range(0,len(res)): if res[n_t][1]=="DURATION": age += " " + res[n_t][0] else: if age != "": age_set.append(age) age = "" age_set_clean=[] for age in age_set: if age not in age_set_clean: age_set_clean.append(age) for age in age_set_clean: age_onto = Node("age", name = "over"+age) relation = Relationship(audience_onto, "age", age_onto) graph.create(relation) #find food guide food_guide=[] for nn in range(nx+1,n): nx=nn if len(metadata[nn].get_text().split()) < 3: break else: food_guide.append(metadata[nn].get_text()) food_guide_onto = Node("food guide", name="food guide", content = food_guide) relation = Relationship(official_name_onto, "food guide", food_guide_onto) graph.create(relation) ##add identifiers id_set = [] annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(str(food_guide))+"&ontologies=FOODON") print_annotations(annotations) for nn in range(0,len(id_set)): identifier=Node("identifier", url=id_set[nn]) relation = Relationship(food_guide_onto, "identifier", identifier) graph.create(relation) #find food guidelines(messages) guidelines = s.findAll('ul') n_max = len(guidelines)-6 for n in range(2, n_max): for li in guidelines[n].findAll('li'): guideline = Node("message", content=li.get_text()) relation = Relationship(official_name_onto, "message", guideline) graph.create(relation) id_set = [] label_set = [] annotations = get_json(REST_URL + "/annotator?text=" + urllib.parse.quote(str(li.get_text()))+"&ontologies=FOODON") print_annotations(annotations) if len(id_set) != 0: for nn in range(0,len(id_set)): food_onto = Node("food", name=label_set[nn], url=id_set[nn]) relation = Relationship(guideline, "has value", food_onto) graph.create(relation) identifier=Node("identifier", url=id_set[nn]) relation = Relationship(food_onto, "identifier", identifier) graph.create(relation) print("done") print("")
# -*- coding: utf-8 -*-
"""
Created on Wed Nov  7 21:31:36 2018

@author: User
"""
import json
import io

io = io.StringIO('["streaming API"]')  # note: rebinds the io module to a StringIO object
json.loads(ann)  # note: `ann` is not defined in this excerpt

import json
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost', port=9000)
props = {'annotators': 'coref', 'pipelineLanguage': 'en'}
text = 'Barack Obama was born in Hawaii . He is the president . Obama was elected in 2008 .'
result = json.loads(nlp.annotate(text, properties=props))

num, mentions = list(result['corefs'].items())[0]
for mention in mentions:
    print(mention)

props = {'annotators': 'coref', 'pipelineLanguage': 'en'}
nlp = StanfordCoreNLP('http://localhost', port=9000)

i = 0
j = 0
mentionsList = []
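# A minimal follow-up sketch using the same `result` as above: walk every coref
# chain instead of only the first one, printing each chain's representative
# mention followed by the mentions that corefer with it.
for chain_id, chain in result['corefs'].items():
    representative = [m['text'] for m in chain if m.get('isRepresentative')]
    others = [m['text'] for m in chain if not m.get('isRepresentative')]
    print(chain_id, representative, '<-', others)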
# Simple usage
from stanfordcorenlp import StanfordCoreNLP

# nlp = StanfordCoreNLP(r'G:/JavaLibraries/stanford-corenlp-full-2016-10-31/')
# nlp = StanfordCoreNLP(r'/Users/luisvalencia/Projects/Hackaton/CoreNLP/stanford-corenlp-full-2017-06-09/')
nlp = StanfordCoreNLP('http://localhost', port=9001, lang="es")

sentence = 'El perro de San Roque no tiene rabo'  # 'Guangdong University of Foreign Studies is located in Guangzhou.'
'''
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))
'''
from stanfordcorenlp import StanfordCoreNLP
import os
import yaml
import re
import json

dir_path = os.getcwd()
host = "http://192.168.12.39"
port = 9000
neg_file = "/home/ankit/NLP/API_NLP/Sentiment/negative-words.txt"
posi_file = "/home/ankit/NLP/API_NLP/Sentiment/positive-words.txt"
nlp = StanfordCoreNLP(host, port=port, timeout=300000000000)


def extractNounsAndAdj(Text):
    tagged = nlp.pos_tag(Text)
    return tagged


def tenGram(Text):
    pos = []
    senti_pos = {}
    typesOfNouns = ['NN', 'NNS', 'NNP', 'NNPS']
    ls = extractNounsAndAdj(Text)
    for ele in range(0, len(ls)):
        if ele == 0 and typesOfNouns.__contains__(ls[ele][1]):
            for position in range(0, 5):
                if ls[position][1] in ('JJ', 'JJR', 'JJS'):  # adjective tags
                    pos.append((ls[ele][0], ls[position][0], position))
            senti_pos[ls[0][0]] = pos
from keras import backend as K
from GCN import *
import re, os
from stanfordcorenlp import StanfordCoreNLP
import numpy as np
import tensorflow as tf

os.environ['CUDA_VISIBLE_DEVICES'] = '3'
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'

nlp = StanfordCoreNLP(r'../stanford-corenlp-full-2017-06-09/')
sentence = 'which you step on to activate it'
de = nlp.dependency_parse(sentence)
print('Dependency Parsing:', de)

dep_sentences = []
for i in range(10):
    dep_sentences.append(de)

# Dependency relation labels
_DEP_LABELS = ['ROOT', 'DOBJ', 'ADV', 'ADV-GAP', 'AMOD', 'APPO', 'BNF', 'CONJ', 'COORD', 'DEP',
               'DEP-GAP', 'DIR', 'DIR-GAP', 'DIR-OPRD', 'DIR-PRD', 'DTV', 'EXT', 'EXT-GAP',
               'EXTR', 'GAP-LGS', 'GAP-LOC', 'GAP-LOC-PRD', 'GAP-MNR', 'GAP-NMOD', 'GAP-OBJ',
               'GAP-OPRD', 'GAP-PMOD', 'GAP-PRD', 'GAP-PRP', 'GAP-SBJ', 'GAP-TMP', 'GAP-VC',
               'HMOD', 'HYPH', 'IM', 'LGS', 'LOC', 'LOC-OPRD', 'LOC-PRD', 'LOC-TMP', 'MNR',
               'MNR-PRD', 'MNR-TMP', 'NAME', 'NMOD', 'NSUBJ', 'OBJ', 'OPRD', 'P', 'PMOD',
               'POSTHON', 'PRD', 'PRD-PRP', 'PRD-TMP', 'PRN', 'PRP', 'PRT', 'PUT', 'SBJ',
               'SUB', 'SUFFIX',
from scipy import spatial
from nltk.tree import *
import nltk.corpus
import nltk.tokenize.punkt
import nltk.stem.snowball
import string
from multiprocessing import Pool
from nltk.draw.tree import TreeView
from fuzzywuzzy import fuzz
from multiprocessing import Pool
from collections import Counter

public = '/home/users2/mehrotsh/scripts/packages/stanford-corenlp-full-2018-02-27/'
personal = '/home/samarth/stanford-corenlp-full-2018-02-27/'
nlp = StanfordCoreNLP(public)

#################################################### Functions ############################################################################


def tree():
    return defaultdict(tree)


def _leadingSpaces_(target):
    return len(target) - len(target.lstrip())


def _findParent_(curIndent, parid, treeRef):
    tmpid = parid
    while (curIndent <= treeRef[tmpid]['indent']):
class ParseTree: def __init__(self, text): self.text = text # 传入的文本 # 定义模型 self.nlp = StanfordCoreNLP(r'E:/py/stanford-corenlp-4.2.0', lang='zh', quiet=False, logging_level=logging.DEBUG) # 分句 self.sentences = self.preprocess() print(self.sentences) self.stopwords = [] def load_dicts(self): stop = PATH + 'stop1205.txt' self.stopwords = self.dict_load(stop) # def preprocess(self): # """ # 预处理: # 1. 去除换行符、多余的空格、百分号 # 2. 分句,存入列表 # :return:返回句子列表 # """ # sentences = [] # self.text = re.sub('%', '', re.sub(' ', '', re.sub('\xa0\xa0\xa0\r\n', '', self.text))) # start = 0 # for i in range(len(self.text)): # if self.text[i] in ['。', '!', ';', '?', '……']: # sentences.append(self.text[start:i + 1]) # start = i + 1 # return sentences def preprocess(self): """ 把文本处理成摘要句子列表 """ return get_sum(self.text) def tree(self, sentence): sentence = sentence.replace(' ','') print(sentence) res = self.nlp.parse(sentence) # nlp.close() return res def sum_of_heights(self): """ 计算整篇文本的每句话构成的语法分析树的高度之和 :return: 高度之和 """ sumHeights = [] for sentence in self.sentences: sentence.replace('%','') res = self.tree(sentence) # 语法树,是个字符串 sumHeights.append(len(res.split("\r\n"))) return np.sum(sumHeights) def avg_height(self): """ 这篇文章的每句话的语法分析树的平均高度 :return: """ return self.sum_of_heights() / len(self.sentences) def no_less_than_16(self): """ 计算整篇文本的高度不大于16的语法分析树的个数 :return: """ num = 0 for sentence in self.sentences: res = self.tree(sentence) if len(res) >= 16: num += 1 return num def no_less_than_16_percent(self): """ 高度不低于16的语法分析树的比例 :return: """ return self.no_less_than_16() / len(self.sentences) def nodes_sum(self): """ 总节点数 :return: """ node_sums = [] for sentence in self.sentences: res = self.nlp.parse(sentence) result = -1 # 去除root for i in res: if i == '(': result += 1 node_sums.append(result) return np.sum(node_sums) def avg_nodes_sentence(self): """ 每句话的平均节点 :return: """ return self.nodes_sum() / len(self.sentences) def seg_sentence(self, sentence): """ 输入字符串,返回分词后的列表 :param sentence: :return: """ jieba.load_userdict('../词典/userdict.txt') sentence_seged = jieba.cut(sentence.strip()) outstr = '' for word in sentence_seged: if word not in self.stopwords: if word != '\t': outstr += word outstr += " " return outstr.split(' ') def avg_nodes_word(self): """ 每个词的平均节点 :return: """ # 计算有几个词 num = 0 for sentence in self.sentences: sentence = self.seg_sentence(sentence) num += len(sentence) return self.nodes_sum() / num def np_sum(self): """ 计算整篇文章里的名词短语个数 :return: """ num = 0 for sentence in self.sentences: res = self.tree(sentence).split("\r\n") for i in res: if 'NP' in i: num += 1 return num def avg_np(self): """ 语法分析树的平均名词短语个数 :return: """ return self.np_sum() / len(self.sentences) def vp_sum(self): """ 计算整篇文章里的动词短语个数 :return: """ num = 0 for sentence in self.sentences: print(sentence) res = self.tree(sentence).split("\r\n") for i in res: if 'VP' in i: num += 1 return num def avg_vp(self): """ 语法分析树的平均动词短语个数 :return: """ return self.vp_sum() / len(self.sentences) def adjp_sum(self): """ 计算整篇文章里的形容词短语个数 :return: """ num = 0 for sentence in self.sentences: res = self.tree(sentence).split("\r\n") for i in res: if 'ADJP' in i: num += 1 return num def avg_adjp(self): """ 语法分析树的平均形容词短语个数 :return: """ return self.adjp_sum() / len(self.sentences) def get_res(self): res = {} res['sum_height'] = self.sum_of_heights() res['height_16'] = self.no_less_than_16() res['sum_node'] = self.nodes_sum() res['sum_n'] = self.np_sum() res['sum_v'] = self.vp_sum() res['sum_adj'] = self.adjp_sum() res['avg_height'] = self.avg_height() 
        res['16_ratio'] = self.no_less_than_16_percent()
        res['avg_node'] = self.avg_nodes_sentence()
        res['word_avg_node'] = self.avg_nodes_word()
        res['avg_n'] = self.avg_np()
        res['avg_v'] = self.avg_vp()
        res['avg_adj'] = self.avg_adjp()
        return res
class ChatBot: """ Intelligent dialogue model based on- 1. Template-based- AIML 2. Knowledge Based- MySQL \\\ 3. Web Search 4. Deep Learning: RNN """ # initialize colorama.init() ws.load() #nltk.download() def __init__(self, config_file='config.cfg', host='http://localhost', port=9000): config = configparser.ConfigParser() config.read(config_file) self.filter_file = config.get('resource', 'filter_file') self.load_file = config.get('resource', 'load_file') self.save_file = config.get('resource', 'save_file') self.shelve_file = config.get('resource', 'shelve_file') corp_dir = os.path.join(PROJECT_ROOT, 'Data', 'Corpus') knbs_dir = os.path.join(PROJECT_ROOT, 'Data', 'KnowledgeBase') res_dir = os.path.join(PROJECT_ROOT, 'Data', 'Result') # Initialize the KERNEL self.mybot = aiml.Kernel() sess = tf.Session() self.predictor = BotPredictor(sess, corpus_dir=corp_dir, knbase_dir=knbs_dir, result_dir=res_dir, result_file='basic') self.session_id = self.predictor.session_data.add_session() # Create AI Engine if os.path.isfile("model\AIChatEngine.brn"): self.mybot.bootstrap(brainFile="model\AIChatEngine.brn") else: self.mybot.bootstrap(learnFiles=self.load_file, commands='load aiml b') self.mybot.saveBrain("model\AIChatEngine.brn") #Initialization learning library self.template = '<aiml version="1.0" encoding="UTF-8">\n{rule}\n</aiml>' self.category_template = '<category><pattern>{pattern}</pattern><template>{answer}</template></category>' # Initialize Filter sensitive words #self.gfw = filter.DFAFilter() #self.gfw.parse(self.filter_file) # Use an existing server: StanfordCoreNLP self.nlp = StanfordCoreNLP(host, port=port, timeout=30000) self.props = { 'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation', 'pipelineLanguage': 'en', 'outputFormat': 'json' } # Initialize the Language Tool for GEC self.tool = language_check.LanguageTool('en-US') # ########################################################### def response(self, user_message): print('# User -->: ' + user_message) # Limit word count if len(user_message) > 200: return self.mybot.respond('MAX') elif len(user_message) < 2: return self.mybot.respond('MIN') # ************************************************** # Filter sensitive words # ************************************************** #message = self.gfw.filter(message, "*") #if message.find("*") != -1: #return self.mybot.respond('FILTER') # ************************************************** # Grammar Error Check and Prompt to User # ************************************************** gec_message = user_message # ************************************************** # Start Conversation # ************************************************** responseAnswer = '' botresponse = self.mybot.respond(gec_message) print('# Bot1 --> ' + botresponse) if botresponse[0] == '@': botresponse = botresponse.replace('@', '') print('# After Confirmation--> ' + botresponse) if gec_message == 'Yes': botresponse = self.mybot.respond(botresponse) else: return self.mybot.respond('ASK NEW QUERY') # Initialize Lemmatization wordnet_lemmatizer = WordNetLemmatizer() # User Sentence Tokenization word_tokens = self.nlp.word_tokenize(botresponse) # Removing stopwords stop_words = set(stopwords.words('english')) #stopwords.extend(string.punctuation) filtered_sentence = [w for w in word_tokens if not w in stop_words] filtered_stop_words = [] for w in word_tokens: if w not in stop_words: filtered_stop_words.append(w) print( colorama.Fore.RED + '\n------------------ User Input Words --> Lemma 
-------------------------- ' + colorama.Fore.RESET) final_sentence = [] for word in filtered_stop_words: final_sentence.append(wordnet_lemmatizer.lemmatize(word, pos="v")) print("{0:10}{1:5}{2:20}".format( word, '--> ', wordnet_lemmatizer.lemmatize(word, pos="v"))) #print(colorama.Fore.GREEN+'\n********************* Dependency Parser ********************* '+colorama.Fore.RESET) #dependency_parser = self.nlp.dependency_parse(' '.join(final_sentence)) #print(dependency_parser) # POS Tagger postagger = self.nlp.pos_tag(' '.join(final_sentence)) print( colorama.Fore.YELLOW + '\n------------------ Identify POS Tagger -------------------------- ' + colorama.Fore.RESET) print('pos tagger: ', postagger) print( "-----------------------------------------------------------------------" ) grammar = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}""" cp = nltk.RegexpParser(grammar) #tree = cp.parse(postagger) #print ("CP: ", cp) tree = cp.parse(postagger) print(tree) for word, pos in postagger: if pos == 'NNP': print(word) print( "-----------------------------------------------------------------------" ) #https://github.com/ayat-rashad/ayat-rashad.github.io/blob/master/triples.ipynb # Add all NOUNs into list nounEntityList = [] for pos in postagger: if pos[1] in ('NN', 'NNS', 'NNP', 'NNPS'): nounEntityList.append(pos[0]) print( colorama.Fore.GREEN + '\n------------------ Added NOUN into Entity List ------------------------- ' + colorama.Fore.RESET) print(nounEntityList, '\n') # 1: Template-based Strategy if botresponse[0] != '#': print('Template-based Strategy') responseAnswer = botresponse # 2: KB Searching Strategy elif botresponse.find('#NONE#') != -1: nounEntityList.remove('#NONE') ans = '' #ans = kb.kdd_search(nounEntityList, ' '.join(final_sentence), gec_message) if ans != '': print('KB Searching Strategy') responseAnswer = ans.encode('utf-8') # 3: Internet Retrieval Strategy else: #ans = crawler.web_search(gec_message) if ans != '': print('Internet Retrieval Strategy') responseAnswer = ans.encode('utf-8') # 4: Generative Strategy- RNN else: if gec_message == 'Yes': confirm_mgs = botresponse.replace('#NONE#:', '') ans = deep.neural_network(self, confirm_mgs) print('Generative Strategy with - YES') print(confirm_mgs) else: ans = deep.neural_network(self, gec_message) print('Generative Strategy') responseAnswer = ans.encode('utf-8') # Learning Mode elif result.find('#LEARN#') != -1: question = result[8:] answer = message self.save(question, answer) return self.mybot.respond('Already studied') else: responseAnswer = self.mybot.respond('I don\'t know.') return responseAnswer # Grammar Error Check on Raw User Input def checkGrammarError(self, user_message): print( colorama.Fore.GREEN + '\n------------------ Grammar Error Correction -------------------------- ' + colorama.Fore.RESET) matches = self.tool.check(user_message) gec_user_message = language_check.correct(user_message, matches) if (len(matches) > 0): i = 0 for x in matches: print('Grammatical Error --> ', matches[i]) print('Apply Rules--> ', matches[i].replacements) i = i + 1 else: print('No Error Found.') return gec_user_message # SAVE Model def save(self, question, answer): db = shelve.open(self.shelve_file, 'c', writeback=True) db[question] = answer db.sync() rules = [] for r in db: rules.append(self.category_template.format(pattern=r, answer=db[r])) with open(self.save_file, 'w') as fp: fp.write(self.template.format(rule='\n'.join(rules))) def forget(self): os.remove(self.save_file) if os.path.exists(self.save_file) else None 
        os.remove(self.shelve_file) if os.path.exists(self.shelve_file) else None
        self.mybot.bootstrap(learnFiles=self.load_file, commands='load aiml b')
import json
from collections import Counter

from stanfordcorenlp import StanfordCoreNLP
from termcolor import colored
from tqdm import tqdm
from nltk.tokenize.treebank import TreebankWordDetokenizer

nlp = StanfordCoreNLP('../span_bert/SpanBERT/stanford-corenlp-full-2018-10-05')
# nlp.close()

ALL_RELATIONS_TYPES = {
    'per:title': ['PERSON', 'TITLE'],
    'org:top_members/employees': ['ORGANIZATION', 'PERSON'],
    'org:country_of_headquarters': ['ORGANIZATION', 'COUNTRY'],
    'per:parents': ['PERSON', 'PERSON'],
    'per:age': ['PERSON', 'NUMBER'],
    'per:countries_of_residence': ['PERSON', 'COUNTRY'],
    'per:children': ['PERSON', 'PERSON'],
    'org:alternate_names': ['ORGANIZATION', 'ORGANIZATION'],
    'per:charges': ['PERSON', 'CRIMINAL_CHARGE'],
    'per:cities_of_residence': ['PERSON', 'CITY'],
    'per:origin': ['PERSON', 'NATIONALITY'],
    'org:founded_by': ['ORGANIZATION', 'PERSON'],
    'per:employee_of': ['PERSON', 'ORGANIZATION'],
    'per:siblings': ['PERSON', 'PERSON'],
    'per:alternate_names': ['PERSON', 'PERSON'],
    'org:website': ['ORGANIZATION', 'URL'],
    'per:religion': ['PERSON', 'RELIGION'],
    'per:stateorprovince_of_death': ['PERSON', 'LOCATION'],
    'org:parents': ['ORGANIZATION', 'ORGANIZATION'],
    'org:subsidiaries': ['ORGANIZATION', 'ORGANIZATION'],
    'per:other_family': ['PERSON', 'PERSON'],
    'per:stateorprovinces_of_residence': ['PERSON', 'STATE_OR_PROVINCE'],
    'org:members': ['ORGANIZATION', 'ORGANIZATION'],
    'per:cause_of_death': ['PERSON', 'CAUSE_OF_DEATH'],
    'org:member_of': ['ORGANIZATION', 'LOCATION'],
    'org:number_of_employees/members': ['ORGANIZATION', 'NUMBER'],
    'per:country_of_birth': ['PERSON', 'COUNTRY'],
    'org:shareholders': ['ORGANIZATION', 'ORGANIZATION'],
    'org:stateorprovince_of_headquarters': ['ORGANIZATION', 'STATE_OR_PROVINCE'],
    'per:city_of_death': ['PERSON', 'CITY'],
    'per:date_of_birth': ['PERSON', 'DATE'],
f = open('C:/Users/text/PycharmProjects/fin_network/data/result.txt')
text = f.read()
# Keyword extraction based on the TF-IDF algorithm
jieba.load_userdict('C:/Users/text/PycharmProjects/fin_network/data/newdict.txt')
keywords = tfidf(text, topK=50)
print "keywords by tfidf:"
# Print the extracted keywords
for keyword in keywords:
    print keyword + "/",
'''

# from nltk.parse import stanford
from stanfordcorenlp import StanfordCoreNLP
import uniout
import jieba.posseg as pseg

nlp = StanfordCoreNLP(
    r'C:\ProgramData\Anaconda2\stanfordNLP\stanford-corenlp-full-2018-10-05',
    lang='zh')  # change this to the directory where your stanford-corenlp lives

sentence = '对公司未来业绩造成不利影响'
print [(word, flag) for (word, flag) in pseg.cut(sentence)]
# print 'Tokenize:', nlp.word_tokenize(sentence)
# print 'Part of Speech:', nlp.pos_tag(sentence)
# print 'Named Entities:', nlp.ner(sentence)
print 'Constituency Parsing:', nlp.parse(sentence)
# print 'Dependency Parsing:', nlp.dependency_parse(sentence)

nlp.close()  # Do not forget to close! The backend server will consume a lot of memory.
def create_nlp_pool(num_threads):
    return [StanfordCoreNLP('http://localhost', port=9000) for _ in range(num_threads)]
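# A minimal usage sketch for the pool above (the worker logic is illustrative):
# hand one client to each worker, then close every client once the work is done.
clients = create_nlp_pool(4)
try:
    print(clients[0].pos_tag('Stanford CoreNLP is served over HTTP.'))
finally:
    for client in clients:
        client.close()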
# /usr/bin/python
# coding:utf-8
import pickle
import re
import sys
import json
import os
import argparse

from stanfordcorenlp import StanfordCoreNLP
from amr_utils import read_json, remove_wiki, read_anonymized, get_concepts
from generate_parent_index import gen_par_index_seq

if __name__ == '__main__':
    nlp = StanfordCoreNLP(r'/home/wangante/stanford-corenlp-full-2018-10-05', lang='en')
    in_file, in_dir, o_file1, o_file2, o_file3, o_file4, o_file5, o_file6, o_file7 = (
        sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
        sys.argv[6], sys.argv[7], sys.argv[8], sys.argv[9])

    origin_amr = {}
    for file_name in os.listdir(in_dir):
        with open(os.path.join(in_dir, file_name)) as file:
            for example in file.read().strip().split('\n\n')[1:]:
                example = example.split('\n')
                origin_amr[example[0].split()[2]] = [
                    example[2][len('# ::alignments '):],
                    example[1][len('# ::tok '):].lower(),
                    ' '.join(example[3:])
                ]

    input_list = read_json(in_file)
    num_items = len(input_list[0])
    regu = re.compile(r'[^:][^\s]*?~e\.\d+')
    for id, amr in origin_amr.items():
        amr = amr[2].split()
class CoreNlpTokenizer():
    def __init__(self, **kwargs):
        """
        :arg language: language of the text
        :arg classpath: directory containing the CoreNLP jars
        :arg annotators: a set that may contain 'pos', 'lemma' and 'ner'
        :arg heap: Java heap size
        """
        self.language = kwargs.get('language', DEFAULTS['tokenizer_language'])
        self.annotators = copy.deepcopy(
            kwargs.get('annotators', DEFAULTS['tokenizer_annotators']))
        self.classpath = os.path.join(
            DATA_DIR, kwargs.get('classpath', DEFAULTS['tokenizer_classpath']))
        self.heap = kwargs.get('heap', DEFAULTS['tokenizer_heap'])
        self.timeout = kwargs.get('timeout', DEFAULTS['tokenizer_timeout'])

        # annotators: tokenize, ssplit (sentence split), pos (POS tagging),
        # lemma (lemmatization), ner (named entity recognition)
        annotators = ['tokenize', 'ssplit']
        if 'ner' in self.annotators:
            annotators.extend(['pos', 'lemma', 'ner'])
        elif 'lemma' in self.annotators:
            annotators.extend(['pos', 'lemma'])
        elif 'pos' in self.annotators:
            annotators.extend(['pos'])
        annotators = ','.join(annotators)
        options = ','.join(['untokenizable=noneDelete', 'invertible=true'])

        self.nlp = StanfordCoreNLP(self.classpath,
                                   port=random.randint(9000, 65535),
                                   memory=self.heap,
                                   lang=self.language,
                                   timeout=self.timeout)
        self.props = {
            'timeout': str(self.timeout),
            'annotators': annotators,
            'pipelineLanguage': 'zh',
            'outputFormat': 'json',
            'prettyPrint': 'False',
            'tokenize.options': options,
            # 'nthreads': 4
        }

    def tokenize(self, text):
        """
        Feed text to the self.corenlp handle.
        :return: Tokens, whose data holds one (TEXT, TEXT_WS, SPAN, POS, LEMMA, NER) tuple per token
        """
        # logger.info(text[0:10] + "..." if len(text) > 10 else text)
        text = text.replace('\n', '\t')
        output = self.nlp.annotate(text, properties=self.props)
        """ Expected output:
        {
          "sentences": [
            {
              "index": 0,
              "entitymentions": [],
              "tokens": [
                {
                  "index": 1,
                  "word": "hello",
                  "originalText": "hello",
                  "lemma": "hello",
                  "characterOffsetBegin": 0,
                  "characterOffsetEnd": 5,
                  "pos": "UH",
                  "ner": "O",
                  "before": "",
                  "after": " "
                },
              ]
            }
          ]
        }"""
        try:
            output = json.loads(output)
        except:
            logger.info(
                "ERROR in Tokenizer: %s\noutput: %s" %
                ((text[0:100] + "..." if len(text) > 100 else text), output))
            if not self.nlp:
                self.close()
                self.nlp = None
            self.nlp = StanfordCoreNLP(self.classpath,
                                       memory=self.heap,
                                       lang=self.language,
                                       timeout=self.timeout)
            return None

        data = []
        tokens = [t for s in output['sentences'] for t in s['tokens']]
        for i in range(len(tokens)):
            # get the token and any whitespace that follows it
            start_whitespace = tokens[i]['characterOffsetBegin']
            if i + 1 < len(tokens):
                end_whitespace = tokens[i + 1]['characterOffsetBegin']
            else:
                end_whitespace = tokens[i]['characterOffsetEnd']
            data.append(
                (special_char(tokens[i]['word']),
                 text[start_whitespace:end_whitespace],
                 (tokens[i]['characterOffsetBegin'], tokens[i]['characterOffsetEnd']),
                 tokens[i].get('pos', None),
                 tokens[i].get('lemma', None),
                 tokens[i].get('ner', None)))
        return Tokens(data, self.annotators)

    def close(self):
        self.nlp.close()
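# A minimal usage sketch, assuming DEFAULTS/DATA_DIR resolve to a local CoreNLP
# install as in the class above and that the Tokens container exposes the
# (TEXT, TEXT_WS, SPAN, POS, LEMMA, NER) tuples as `.data`; the sample text is illustrative.
tokenizer = CoreNlpTokenizer(annotators={'pos', 'lemma', 'ner'})
tokens = tokenizer.tokenize('hello world')
if tokens is not None:  # tokenize() returns None when the server reply is not valid JSON
    for text_, text_ws, span, pos, lemma, ner in tokens.data:
        print(text_, span, pos, ner)
tokenizer.close()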
import jieba
import logging
from stanfordcorenlp import StanfordCoreNLP

# Start a CoreNLP remote server with this terminal command:
'''
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators tokenize,ssplit,pos,lemma,ner,parse,depparse,coref,quote -port 9000 -timeout 30000
'''
nlp = StanfordCoreNLP('http://localhost', port=9000, lang='zh', logging_level=logging.DEBUG)

text_path = 'test_chinese_news.txt'
sentence = ''
with open(text_path, 'r') as f:
    for line in f.readlines():
        if line.strip():
            sentence += line


def cut(string):
    return ' '.join(jieba.cut(string))


cut_sentence = cut(sentence)
# print(nlp.word_tokenize(sentence))
def main():
    is_it_test = True
    train = False
    path_to_word_2_vec = ""
    path_to_data = ""
    path_to_nlp = ""
    path_to_our_model = ""
    if os.name == "nt":
        if is_it_test:
            path_to_word_2_vec = r"E:\FinalProject3\GoogleNews-vectors-negative300.bin"
        else:
            path_to_word_2_vec = r"E:\FinalProject3\wiki.en.vec"
        path_to_data = r"E:\FinalProject3\data"
        path_to_nlp = r'E:\FinalProject3\stanford-corenlp-full-2018-02-27'
        path_to_our_model = r'E:\FinalProject3\auto_de_only_wiki'
    else:
        if is_it_test:
            path_to_word_2_vec = "/home/ubuntu/Projet/FinalProject3/GoogleNews-vectors-negative300.bin"
        else:
            path_to_word_2_vec = "/home/ubuntu/Projet/FinalProject3/wiki.en.vec"
        path_to_data = "/home/ubuntu/Projet/FinalProject3/data/"
        path_to_nlp = "/home/ubuntu/Projet/FinalProject3/stanford-corenlp-full-2018-02-27"
        path_to_our_model = "/home/ubuntu/Projet/FinalProject3/auto_de_only_wiki"

    # for debug --> , quiet=False, logging_level=logging.DEBUG)
    nlp = StanfordCoreNLP(path_to_nlp)

    if train:
        dataset = DataClass(path_to_data)
        dataset.laod_data()
    else:
        dataset = DataClass(path_to_data)

    modelwords = MyWord2vec(path_to_word_2_vec)
    # ("/home/ubuntu/Project/FinalProject/", "wiki.en.vec")
    try:
        word2vec
    except NameError:
        var_exists = False
    else:
        var_exists = True
    if not var_exists:
        modelwords.load_embeddings()
    try:
        modelwords.model["check"]
        word2vec = True
    except:
        word2vec = False
        print('word2vec not configure')

    preprocessData = PreprocessClass(dataset, modelwords, nlp, "ml", train)
    preprocessData.getMaxLength()
    preprocessData.preprocessing_data()

    if not train:
        nnmodel = DLClass()
        nnmodel.model = load_model(path_to_our_model)
        graph = tf.get_default_graph()
        return {
            "preproc": preprocessData,
            'nnmodel': nnmodel.model,
            'graph': graph
        }
        predict(
            preprocessData,
            "A wiki is a Web site that allows users to add and update content"
            " on the site using their own Web browser.",
            path_to_our_model)

    if train:
        preprocessData.X, preprocessData.classified_output = shuffle(
            preprocessData.X, preprocessData.classified_output, random_state=0)

    # 1 to save the model, 10 for statistic results
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = defaultdict(int)
    nlp.close()

    if train:
        for train, test in kfold.split(preprocessData.X, preprocessData.classified_output):
            nnmodel = DLClass()
            nnmodel.build_model(preprocessData.X[train], preprocessData.classified_output[train], "cblstm")
            print('Predicting...')
            preds = np.array([
                i[0] for i in nnmodel.model.predict_classes(preprocessData.X[test])
            ])
            p = precision(preds, preprocessData.classified_output[test])
            r = recall(preds, preprocessData.classified_output[test])
            f1 = f1_score(preds, preprocessData.classified_output[test])
            print('(Fold) Precision: ', p, ' | Recall: ', r, ' | F: ', f1)
            scores['Precision'] += p
            scores['Recall'] += r
            scores['F1'] += f1
            nnmodel.model.save("/home/ubuntu/auto_de_only_wiki")

        print('Overall scores:')
        for n, sc in scores.items():
            print(n, '-> ', sc / 10 * 1.0)
# _*_coding:utf-8_*_
from __future__ import print_function
from stanfordcorenlp import StanfordCoreNLP

local_corenlp_path = r'G:/JavaLibraries/stanford-corenlp-full-2016-10-31/'
# local_corenlp_path = r'/home/gld/JavaLibs/stanford-corenlp-full-2016-10-31'

# Simple usage
nlp = StanfordCoreNLP(local_corenlp_path)

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
print('Dependency Parsing:', nlp.dependency_parse(sentence))
position, cluster_no, text = nlp.dcorf(sentence)
nlp.__del__()

# Support for other human languages, e.g. Chinese
nlp = StanfordCoreNLP(local_corenlp_path, lang='zh', quiet=False)

sentence = '清华大学位于北京。'
print(nlp.word_tokenize(sentence))
print(nlp.pos_tag(sentence))
print(nlp.ner(sentence))
print(nlp.parse(sentence))
print(nlp.dependency_parse(sentence))
#!/usr/bin/python3
# coding: utf-8
## run with sudo
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(r'/Users/coder352/datasets/Lib/stanford-corenlp-full-2018-02-27')

sentence = ":) ... 'll 're http://jmlr.org/papers/v15/srivastava14 u.s. http://baidu.com `` '' 's 1,2:3 2018.03.07 2018/03/07 2018-03-07 for 23-years old. pi is 3.1415, .8 0.8%% is good, +0.2 well-known -lrb- mr. mra mrs. no.1 ##-mill-dollar ###.##-### <unk>"

print('Tokenize:', nlp.word_tokenize(sentence)); print()
print('Part of Speech:', nlp.pos_tag(sentence)); print()
# print('Named Entities:', nlp.ner(sentence))  # this call raises an error
print('Constituency Parsing:', nlp.parse(sentence)); print()
print('Dependency Parsing:', nlp.dependency_parse(sentence)); print()

nlp.close()  # Do not forget to close! The backend server will consume a lot of memory.
# coding=utf-8
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(r'D:\NLP_sourceCode\stanfordcorenlp')
# If everything is configured correctly and it still errors out, check:
# D:\Anaconda3\Lib\site-packages\stanfordcorenlp\corenlp.py
# memory defaults to 4g; with only 8g of RAM there may be less than 4g free,
# so lower the value (it will run more slowly) or install more RAM.

# Step 1: start the server
# Run a server using Chinese properties:
# java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -serverProperties StanfordCoreNLP-chinese.properties -port 9000 -timeout 15000
# nlp = StanfordCoreNLP('http://localhost', port=9000)

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou'
# print(nlp.word_tokenize(sentence))
# print(nlp.pos_tag(sentence))
# print(nlp.ner(sentence))
print(nlp.parse(sentence))  # constituency parse tree
# print(nlp.dependency_parse(sentence))  # dependency parse

nlp.close()  # release the server, otherwise the backend will keep consuming a lot of memory
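# A minimal sketch of the workaround described above, assuming the same local
# CoreNLP directory: pass a smaller Java heap to the constructor instead of
# editing corenlp.py.
from stanfordcorenlp import StanfordCoreNLP

nlp_small = StanfordCoreNLP(r'D:\NLP_sourceCode\stanfordcorenlp', memory='2g')  # smaller heap than the 4g default
print(nlp_small.parse('Guangdong University of Foreign Studies is located in Guangzhou'))
nlp_small.close()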