def get_test_references():
    de, en = [], []
    proc = CoreNLP('ssplit')

    # Insert test references in training data
    entries = Entry.objects(set='test')
    for entry in entries:
        for triple in entry.triples:
            agent = triple.agent.name
            patient = triple.patient.name

            de.append(agent)
            name = ' '.join(agent.replace('\'', '').replace('\"', '').split('_'))
            out = proc.parse_doc(name)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace('-RRB-', ')')
                text += ' '
            en.append(text.strip())

            de.append(patient)
            name = ' '.join(patient.replace('\'', '').replace('\"', '').split('_'))
            out = proc.parse_doc(name)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace('-RRB-', ')')
                text += ' '
            en.append(text.strip())
    return de, en
def __parse_text(self):
    if exists_in_s3('{}/{}'.format(s3_output_prefix, self.outfilename)):
        self.__load_parse_result()
        return
    ss = CoreNLP('parse',
                 corenlp_jars=['~/software/stanford-corenlp-full-2015-12-09/*'])
    self.parsed = ss.parse_doc(self.sentences)
    ss.cleanup()
def phrases():
    # stopwords is the list of tokens we'd like to discard from the output
    stopwords = [".", "?", "!", ","]
    proc = CoreNLP("nerparse", corenlp_jars=[java])
    p = []
    i = 1
    print "#### Processing and formatting the extracted questions ####"
    with open(quest, 'r') as inp:
        for line in inp:
            print "processing line " + str(i)
            p.append(proc.parse_doc(line))
            i += 1
    with open('./output/phrases.txt', 'w') as outp:
        with open('./output/ressources1.txt', 'w') as outr:
            for elmt in p:
                for tok in elmt["sentences"][0]["lemmas"]:
                    if tok not in stopwords:
                        print tok
                        outr.write(tok.encode('utf-8') + '\n')
                outr.write('\n')
                for tok in elmt["sentences"][0]["tokens"]:
                    if tok not in stopwords:
                        outp.write(tok.encode('utf-8') + '\n')
                outp.write('\n')
class SimpleREG(object):
    def run(self, fin, fout):
        self.proc = CoreNLP('ssplit')

        entity_maps = p.load(open(os.path.join(fin, 'eval1.cPickle')))

        f = open(os.path.join(fin, 'eval1.bpe.de.output.postprocessed.dev'))
        texts = f.read().lower().split('\n')
        f.close()

        print len(texts), len(entity_maps)

        for i, text in enumerate(texts[:-1]):
            entity_map = entity_maps[i]
            for tag in entity_map:
                name = ' '.join(entity_map[tag].name.lower().replace('\'', '').replace('\"', '').split('_'))
                texts[i] = texts[i].replace(tag.lower(), str(name))

        f = open(fout, 'w')
        for text in texts:
            out = self.proc.parse_doc(text)['sentences']
            text = []
            for i, snt in enumerate(out):
                text.extend(snt['tokens'])
            text = ' '.join(text).replace('-LRB- ', '(').replace(' -RRB-', ')').strip()
            f.write(text.encode('utf-8'))
            f.write('\n')
        f.close()
def main(arg):
    dir = os.path.dirname(__file__)
    filename = os.path.join(dir, 'stanford-corenlp-python/stanford-corenlp-full-2014-08-27/*')
    configFileLoc = os.path.join(dir, 'config.ini')
    proc = CoreNLP(configfile=configFileLoc, corenlp_jars=[filename])

    with open(arg, "r") as file:
        data = removeHeadings(file)
    parsed = proc.parse_doc(data)

    data = []
    for s in parsed[u'sentences']:
        sent = str(' '.join(s[u'tokens']))
        data.append(sent.translate(string.maketrans("", ""), string.punctuation))

    data1 = ".".join(data)
    data1 = data1.replace("..", ".")
    data1 = data1.replace("  ", " ")
    data1 = data1.replace(" .", ". ")

    data2 = " ".join(data)
    data2 = data2.replace("  ", " ")

    file_train1 = open("data/a1_train1.txt", "w")
    file_train1.write(data1)
    file_train1.close()

    file_train2 = open("data/a1_train2.txt", "w")
    file_train2.write(data2)
    file_train2.close()

    file_test1 = open("data/a1_test1.txt", "w")
    file_test1.write(clean1(data1))
    file_test1.close()

    file_test2 = open("data/a1_test2.txt", "w")
    file_test2.write(clean(data2))
    file_test2.close()
class StanfordNLP:
    def __init__(self):
        # self.server = ServerProxy(JsonRpc20(),
        #                           TransportTcpIp(addr=("127.0.0.1", 8080)))
        corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2017-06-09/*"
        self.server = CoreNLP(
            configdict={
                'annotators': 'tokenize,ssplit,pos,depparse,lemma,ner',
                'depparse.model': 'edu/stanford/nlp/models/parser/nndep/english_SD.gz'
            },
            corenlp_jars=[corenlp_dir])

    def parse(self, text):
        return self.server.parse_doc(text)
class StanfordPreprocessor(object):
    def __init__(self, homedir='./'):
        from stanford_corenlp_pywrapper import CoreNLP
        self.corenlp = CoreNLP(
            configdict={
                'annotators': 'tokenize, ssplit, pos, lemma, parse, ner'
            },
            output_types=['pos', 'lemma', 'parse', 'ner'],
            corenlp_jars=[homedir + "lib/stanford-corenlp-full-2015-04-20/*"])

    def parse(self, document):
        return self.corenlp.parse_doc(document)
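# A minimal usage sketch for StanfordPreprocessor above; the homedir and the
# sample sentence are assumptions, not part of the original snippet.
if __name__ == '__main__':
    pre = StanfordPreprocessor(homedir='./')
    doc = pre.parse('Stanford CoreNLP returns one dict per sentence.')
    for sent in doc['sentences']:
        # tokens, pos, lemmas and ner are parallel lists over the sentence
        print zip(sent['tokens'], sent['pos'], sent['ner'])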
class SentenceDelimiter():
    def __init__(self, corenlp_path):
        self.proc = CoreNLP("ssplit", corenlp_jars=[os.path.join(corenlp_path, '*')])

    def get_sentences(self, text):
        res = self.proc.parse_doc(text)
        for sentence in res['sentences']:
            sentence_text = ' '.join(sentence['tokens']).encode('utf8')
            sentence_text = ' '.join(sentence_text.split())
            sentence_text = sentence_text.replace('-LRB-', '(').replace('-RRB-', ')')
            sentence_text = sentence_text.replace('-LSB-', '[').replace('-RSB-', ']')
            sentence_text = sentence_text.replace('-LCB-', '{').replace('-RCB-', '}')
            yield escape(sentence_text)
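# Hedged usage sketch for SentenceDelimiter above; the CoreNLP path and the
# input string are assumptions, and escape() is expected to come from an
# elided import (e.g. xml.sax.saxutils).
if __name__ == '__main__':
    delimiter = SentenceDelimiter('/opt/stanford-corenlp-full-2015-12-09')
    for sentence in delimiter.get_sentences('Dr. Smith arrived. He sat down.'):
        print sentence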
def lemmatize(l):
    result = []
    from stanford_corenlp_pywrapper import CoreNLP
    proc = CoreNLP("pos", corenlp_jars=["stanford-corenlp-full-2015-04-20/*"])
    for doc_words in l:
        parsed = proc.parse_doc(doc_words)
        row = []
        for sentence in parsed['sentences']:
            for word in sentence['lemmas']:
                row.append(word)
        result.append(row)
    return result
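# Hedged example of driving lemmatize above; the sample documents are
# assumptions, and the exact lemmas depend on the CoreNLP models in use.
if __name__ == '__main__':
    docs = ["The cats were running.", "Better models are trained daily."]
    for row in lemmatize(docs):
        print row  # e.g. ['the', 'cat', 'be', 'run', '.']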
def write_hyps(hyps, fname):
    proc = CoreNLP('ssplit')
    f = open(fname, 'w')
    for hyp in hyps:
        out = proc.parse_doc(hyp)
        text = ''
        for snt in out['sentences']:
            text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace('-RRB-', ')')
            text += ' '
        f.write(text.encode('utf-8'))
        f.write('\n')
    f.close()
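# The '-LRB-'/'-RRB-' substitution above recurs in several snippets in this
# collection; a small shared helper (hypothetical name) could centralize the
# full set of PTB bracket escapes that CoreNLP emits:
def detok_brackets(tokens):
    """Join PTB tokens and restore the bracket characters CoreNLP escapes."""
    text = ' '.join(tokens)
    for ptb, ch in (('-LRB-', '('), ('-RRB-', ')'),
                    ('-LSB-', '['), ('-RSB-', ']'),
                    ('-LCB-', '{'), ('-RCB-', '}')):
        text = text.replace(ptb, ch)
    return text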
class BioNLPEnrichment(BaseEnrichment):
    """ """

    def __init__(self):
        """ Load and initialize any external models or data here """
        self.corenlp = CoreNLP("pos", corenlp_jars=["./enrichments/stanford-corenlp-full-2015-12-09/*"])

    def enrichment_value(self, tweet):
        """ Calculate enrichment value """
        rep = self.corenlp.parse_doc(tweet["actor"]["summary"])
        return rep

    def __repr__(self):
        """ Add a description of the class's function here """
        return ("Stanford core NLP applied to user bio")
class kawata_corenlp_handler:
    def __init__(self):
        # self.proc = CoreNLP(configdict={'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'}, corenlp_jars=["/usr/local/lib/stanford-corenlp-full-2015-12-09/*"])
        self.proc = CoreNLP(
            configdict={
                'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'
            },
            corenlp_jars=[
                "/CORENLPDIRECTORY/stanford-corenlp-full-2015-12-09/*",
                "/Users/akira/stanford-corenlp-full-2015-12-09/sutime"
            ])

    def __join_text_date(self, text, date):
        '''Join text and date.'''
        date_s = dt.date2str(date)
        return '[<date>{0}</date>]\n{1}'.format(date_s, text)

    def get_words(self, text, date):
        n_text = unidecode.unidecode(text)
        joint_text = self.__join_text_date(n_text, date)
        joint_text = n_text  # note: this overrides the date-tagged text, so the plain text is parsed
        p = self.proc.parse_doc(joint_text)["sentences"][0]
        # print p
        words = zip(p["ner"], p["tokens"], p["ner"])
        stop = stopwords.words("english")
        words = filter(lambda x: x[1] not in stop, words)
        words = map(lambda x: (x[0], x[1].lower(), x[2]), words)
        # I cannot understand what is most suitable in above line.

        # merge consecutive tokens that share a non-'O' NER tag into one mention
        ws = list()
        w = ("", "", "")
        for v in words:
            if v[0] != 'O' and v[0] == w[0]:
                w = (w[0], w[1] + " " + v[1], w[2])
            else:
                ws.append(w)
                w = v
        if w[0] != "":
            ws.append(w)
        words = ws
        return words[1:]
class BodyNLPEnrichment(BaseEnrichment):
    """ """

    def __init__(self):
        """ Load and initialize any external models or data here """
        self.corenlp = CoreNLP("pos", corenlp_jars=["/home/jkolb/stanford-corenlp-full-2015-04-20/*"])

    def enrichment_value(self, tweet):
        """ Calculate enrichment value """
        rep = self.corenlp.parse_doc(tweet["body"])
        return rep

    def __repr__(self):
        """ Add a description of the class's function here """
        return "Stanford core NLP applied to tweet body"
def split_and_tokenize(doc):
    '''
    Reads a text document, splits sentences and tokenizes them with the python
    wrapper of the Stanford CoreNLP.
    More info: https://github.com/brendano/stanford_corenlp_pywrapper
    :param doc: name of the document under database.mpqa.2.0/docs (only the
        first line of the string is used)
    :return:
    '''
    parse_mode = "ssplit"  # tokenization and sentence splitting
    coreNlpPath = "/Users/ana/workspace/stanford_corenlp_pywrapper/stanford-corenlp-full-2017-06-09/*"
    parser = CoreNLP(parse_mode, corenlp_jars=[coreNlpPath])
    json_name = "database.mpqa.2.0/docs/" + doc.split("\n")[0] + ".json"
    if not os.path.exists(json_name):
        doc_path = "database.mpqa.2.0/docs/" + doc.split("\n")[0]
        document = codecs.open(doc_path, "r", encoding="utf-8").read()
        data_source_parse = parser.parse_doc(document)
        with open(json_name, 'w') as fp:
            json.dump(data_source_parse, fp, sort_keys=True, indent=2)
def main():
    if not os.path.exists(IN_FILE + '_rf'):
        print('First reformatting file...')
        out_format = open(IN_FILE + '_rf', 'w')
        with open(IN_FILE) as handle:
            for line in tqdm(handle):
                tline = line.strip()
                if tline == '':
                    out_format.write('\n')
                else:
                    out_format.write(tline + ' ')
        out_format.close()

    print('Sentence tokenizer!')
    print('Loading Stanford CoreNLP...')
    proc = CoreNLP(configdict={
        'annotators': 'tokenize,ssplit',
        'tokenize.options': 'ptb3Escaping=False'
    },
                   output_types=['tokenize,ssplit'],
                   corenlp_jars=[CORENLP_PATH])

    out_file = open(IN_FILE + '_sts', 'w')
    sentence_count = 0
    print('Opening file ' + IN_FILE + '_rf' + '...')
    with open(IN_FILE + '_rf') as handle:
        lines = handle.readlines()
        for line in tqdm(lines):
            the_text = line.strip()
            # Use Stanford instead
            parsed = proc.parse_doc(the_text)
            sentence_count += len(parsed['sentences'])
            for sent in parsed['sentences']:
                the_tokens = [i.replace(' ', '') for i in sent['tokens']]
                the_sent = ' '.join(the_tokens)
                assert len(the_sent.split(' ')) == len(sent['tokens'])
                out_file.write(the_sent.encode('utf-8') + '\n')
    print('Number of sentences so far: ' + '{:,}'.format(sentence_count))
    out_file.close()
class StanfordParser(object):
    """ Stanford parser """

    def __init__(self, corenlp_jars):
        self.proc = CoreNLP("parse", corenlp_jars=corenlp_jars)

    def parse(self, text):
        # {u'sentences':
        #   [
        #     {u'parse': u'(ROOT (S (VP (NP (INTJ (UH hello)) (NP (NN world)))) (. !)))',
        #      u'tokens': [u'hello', u'world', u'.'],
        #      u'lemmas': [u'hello', u'world', u'.'],
        #      u'pos': [u'UH', u'NN', u'.'],
        #      u'char_offsets': [[0, 5], [6, 11], [11, 12]]
        #     },
        #     ...
        #   ]
        # }
        json_rst = self.proc.parse_doc(text)
        if json_rst:
            for sent in json_rst['sentences']:
                parse_tree = sent['parse']
                yield parse_tree
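# Minimal usage sketch (the jar path and input are assumptions); parse() is a
# generator, so the constituency trees stream out one sentence at a time.
if __name__ == '__main__':
    parser = StanfordParser(['/opt/stanford-corenlp-full-2015-04-20/*'])
    for tree in parser.parse('Hello world! It works.'):
        print tree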
from stanford_corenlp_pywrapper import CoreNLP
import os

proc = CoreNLP("ner", corenlp_jars=["/Users/Jerry/Downloads/stanford-corenlp-full-2015-12-09/*"])

input_path = '/Users/Jerry/Documents/CMPS290H/Project/data/dataset'
output_path = '/Users/Jerry/Documents/CMPS290H/Project/data/dictionary/name.tsv'

# parse files
output = open(output_path, 'w')
for filename in os.listdir(input_path):
    try:
        input_file = open(os.path.join(input_path, filename), 'r')
        x = input_file.read()
        input_file.close()
        out = proc.parse_doc(x)
        ner_tags = out['sentences'][0]['ner']
        num_tokens = len(ner_tags)
        lemmas = out['sentences'][0]['lemmas']
        first_indexes = (i for i in xrange(num_tokens)
                         if ner_tags[i] == "PERSON" and (i == 0 or ner_tags[i - 1] != "PERSON"))
        for begin_index in first_indexes:
            # find the end of the PERSON phrase (consecutive tokens tagged as PERSON)
            end_index = begin_index + 1
            while end_index < num_tokens and ner_tags[end_index] == "PERSON":
                end_index += 1
            end_index -= 1
            mention_text = " ".join(map(lambda i: lemmas[i], xrange(begin_index, end_index + 1)))
            print("%s %s" % (filename, mention_text))
            output.write("%s\n" % mention_text)
    except IndexError:
        pass
output.close()
def dependency_structure(words, dependency):
    word = dependency[0]
    start = dependency[1]
    end = dependency[2]
    structure = word
    # Indexing starts at 1, so we add 1
    if word == "root":
        structure = "%s(ROOT-%s, %s-%s)" % (structure, start + 1, words[end], end + 1)
    else:
        structure = "%s(%s-%s, %s-%s)" % (structure, words[start], start + 1, words[end], end + 1)
    return structure.encode("utf-8")


for p in range(0, len(paragraphs)):
    paratext = paragraphs[p].replace("<p>", "").replace("</p>", "").replace("\t", " ").replace('"', "''").replace(",", "")
    sentence_id = "%s@%s" % (article_id, p)
    try:
        nlp = proc.parse_doc(paratext)
        wordslist = nlp["sentences"][0]["tokens"]
        text = '"%s"' % (",".join(wordslist))
        # All commas must be replaced with "" from here on
        wordslist = [x.replace(',', '""') for x in wordslist]
        words = "{%s}" % (",".join(wordslist))
        lemmas = [x.replace(',', '""') for x in nlp["sentences"][0]["lemmas"]]
        lemmas = "{%s}" % (",".join(lemmas))
        pos = [x.replace(',', '""') for x in nlp["sentences"][0]["pos"]]
        pos = "{%s}" % (",".join(pos))
        ner = "{%s}" % (",".join(nlp["sentences"][0]["ner"]))
        # This is a lookup for the terms, using the words
        dependencies = "{%s}" % (",".join(['""%s""' % (dependency_structure(wordslist, x)) for x in nlp["sentences"][0]["deps_cc"]]))
        # document_id | sentence | words | lemma | pos_tags | dependencies | ner_tags | sentence_offset | sentence_id
        for_database = '%s,%s,"%s","%s","%s","%s","%s",%s,%s\n' % (article_id, text, words, lemmas, pos, dependencies, ner, p, sentence_id)
        filey.writelines(for_database)
from stanford_corenlp_pywrapper import CoreNLP
from nltk import *
import os

proc = CoreNLP("parse", corenlp_jars=["stanford/stanford-corenlp-full-2015-04-20/*"])

# correct subdirectory by coded type goes here
# comment all this to do a single text file instead of a directory
path = 'data/engelhard/A/'
for filename in os.listdir(path):
    print(filename)
    with open(path + filename, 'rU') as f:
        engelhard = f.read()
    engelhard2 = engelhard.decode('utf8', 'ignore')
    trees = proc.parse_doc(engelhard2)
    print(engelhard2)

# this is set as parse (parsing with named entity recognition) but you can also change it to different options, like:
# ssplit for tokenization and sentence splitting
# pos for pos and lemmas
# ner for pos and ner and lemmas
# parse for pos, lemmas, trees, dependencies
# nerparse for parsing with ner, pos, lemmas, dependencies
# coref for coreference including constituent parsing

# comment this to do coref
trees = proc.parse_doc(engelhard2)
# print(trees)  # uncomment this to do coref
# match: Determine if the RE matches at the beginning of the string.
# ^ = beginning of string, $ = end of string so https://www.coursera.org is ignored
pattern = re.compile(r'^[A-Za-z]+[-]?[A-Za-z]+$')

# stopwords list: add "I" since Stanford NLP does not lowercase I (it lowercases
# e.g. She), but the stopwords list from nltk only includes "i"
stop = set(stopwords.words('english'))
stop.add("I")

for doc_name in doc_names:
    # Now it's ready to parse documents. You give it a string and it returns JSON-safe data structures
    # dictionary key = 'sentences', value = list of sentences
    # each sentence is a dictionary with key = 'lemmas', 'tokens', etc.
    # key = 'lemmas', value = list of lemmas
    parsed = proc.parse_doc(doc_dic[doc_name])["sentences"]
    sentences = [sentence["lemmas"] for sentence in parsed]
    # flatten nested list so each element is a token
    doc_dic_normalized[doc_name] = [lemma for sentence in sentences for lemma in sentence
                                    if pattern.match(lemma) and lemma not in stop]

# count number of tokens: 5256
len([v for ls in doc_dic_normalized.values() for v in ls])

# save documents
for name in doc_dic:
    f = open(os.path.join(out_file_bios_folder_normalized, name + ".txt"), "w")
    f.write(" ".join(doc_dic_normalized[name]))
    f.close()
""" Input is multiple text files. Each text file represents one document. Output is stdout, a stream of 2-column TSV DocID \t JsonAnnotations where the DocID is based on the filename. USAGE proc_text_files.py MODE [files...] e.g. python proc_text_files_to_stdout.py pos *.txt > allpos.anno """ import sys, re, os mode = sys.argv[1] from stanford_corenlp_pywrapper import CoreNLP ss = CoreNLP(mode) # need to override corenlp_jars for filename in sys.argv[2:]: docid = os.path.basename(filename) docid = re.sub(r'\.txt$', "", docid) text = open(filename).read().decode('utf8', 'replace') jdoc = ss.parse_doc(text, raw=True) print("%s\t%s" % (docid, jdoc))
class Ordering(object):
    def __init__(self):
        self.proc = CoreNLP('ssplit')

    def check_tagfrequency(self, entitymap, template):
        tag_freq = {}
        for tag, entity in entitymap.items():
            tag_freq[tag] = len(re.findall(tag, template))
        if 0 not in tag_freq.values():
            return True
        return False

    # Fixing the tags for the correct order
    def generate_template(self, triples, template, entitymap):
        '''
        :param triples:
        :param template:
        :param entitymap:
        :return:
        '''
        new_entitymap, predicates = utils.map_entities(triples)
        new_entitymap = dict(map(lambda x: (x[1].name, x[0]), new_entitymap.items()))

        new_template = []
        for token in template:
            if token in entitymap:
                new_template.append(new_entitymap[entitymap[token].name])
            else:
                new_template.append(token)
        return ' '.join(new_template).replace('-LRB-', '(').replace('-RRB-', ')').strip()

    def process(self, entry):
        '''
        :param entry:
        :return:
        '''
        self.entry = entry
        entitymap, predicates = utils.map_entities(self.entry.triples)

        training_set = []
        for lex in self.entry.texts:
            template = lex.template
            delex_type = lex.delex_type
            if self.check_tagfrequency(entitymap, template):
                sort_triples, triples = [], copy.deepcopy(entry.triples)
                out = self.proc.parse_doc(template)
                prev_tags = []
                for i, snt in enumerate(out['sentences']):
                    # get tags in order
                    tags = []
                    for token in snt['tokens']:
                        if token in entitymap:
                            tags.append(token)
                    # Ordering the triples in the sentence i
                    sort_snt_triples, triples = self.order(triples, entitymap, prev_tags, tags)
                    sort_triples.extend(sort_snt_triples)

                # Extract template for the sentence
                if len(triples) == 0:
                    template = []
                    for snt in out['sentences']:
                        template.extend(snt['tokens'])
                    template = self.generate_template(sort_triples, template, entitymap)

                    training_set.append({
                        'sorted_triples': sort_triples,
                        'triples': entry.triples,
                        'template': template,
                        'lexEntry': lex,
                        'semcategory': entry.category,
                        'delex_type': delex_type
                    })
        return training_set

    def order(self, triples, entitymap, prev_tags, tags):
        triples_sorted = []
        for i in range(1, len(tags)):
            tag = tags[i]
            prev_tags.insert(0, tags[i - 1])
            for prev_tag in prev_tags:
                if 'AGENT' in tag and 'PATIENT' in prev_tag:
                    f = filter(
                        lambda triple: triple.agent.name == entitymap[tag].name
                        and triple.patient.name == entitymap[prev_tag].name,
                        triples)
                elif 'PATIENT' in tag and 'AGENT' in prev_tag:
                    f = filter(
                        lambda triple: triple.patient.name == entitymap[tag].name
                        and triple.agent.name == entitymap[prev_tag].name,
                        triples)
                else:
                    f = filter(
                        lambda triple:
                        (triple.agent.name == entitymap[tag].name and
                         triple.patient.name == entitymap[prev_tag].name) or
                        (triple.patient.name == entitymap[tag].name and
                         triple.agent.name == entitymap[prev_tag].name),
                        triples)

                if len(f) > 0:
                    triples_sorted.append(f[0])
                    triples = filter(lambda triple: triple != f[0], triples)
                    break
        return triples_sorted, triples

    def update_db(self, trainingset):
        '''
        :param trainingset: set with triples, ordered triples, lexical entry and updated template
        :return:
        '''
        for row in trainingset:
            # Update database with template with right entity order id and ordered triples
            dbop.save_template(category=row['semcategory'],
                               triples=row['sorted_triples'],
                               template=row['template'],
                               delex_type=row['delex_type'])

    def write(self, trainingset, fname):
        result = []
        for row in trainingset:
            lex, triples, sorted_triples, template = row['lexEntry'], row['triples'], row['sorted_triples'], row['template']
            row['triples'] = map(
                lambda triple: triple.agent.name + ' | ' + triple.predicate.name + ' | ' + triple.patient.name,
                row['triples'])
            row['sorted_triples'] = map(
                lambda triple: triple.agent.name + ' | ' + triple.predicate.name + ' | ' + triple.patient.name,
                row['sorted_triples'])
            result.append({
                'triples': row['triples'],
                'sorted': row['sorted_triples'],
                'semcategory': row['semcategory']
            })

            print row['triples']
            print row['sorted_triples']
            print template
            print 10 * '-'
        json.dump(result, open(fname, 'w'), indent=4, separators=(',', ': '))
class ner(object):
    def __init__(self, lang='en', en_ner=False):
        # feature parameters
        self.lang = lang

        # [NLTK wrapper for Stanford NER] (too slow, results so-so)
        if en_ner == 'nltk':
            self.entity_cols = ['PERSON', 'ORG', 'LOCATION', 'FACILITY', 'GPE']
            self.sner_root = '/home/marsan/workspace/stanford_nlp/stanford-ner-2015-04-20'
            self.sner_classifier = self.sner_root + '/classifiers/english.all.3class.distsim.crf.ser.gz'
            self.sner_main = self.sner_root + '/stanford-ner.jar'
            self.st = NERTagger(self.sner_classifier, self.sner_main, encoding='utf-8')

        # [Stanford CoreNLP pywrapper] (still slow, results too noisy)
        if en_ner == 'corenlp':
            self.entity_cols = ['LOCATION', 'TIME', 'PERSON', 'ORGANIZATION', 'MONEY', 'PERCENT', 'DATE']
            self.snlp = CoreNLP("ner", corenlp_jars=["%s/stanford-corenlp-full-2015-04-20/*" % snlp_path])

    # ===========================================
    # Stanford CoreNLP pywrapper
    # ===========================================
    def get_ner_stanford_corenlp(self, txt):
        tree = self.snlp.parse_doc(txt.upper())
        results = [list(zip(r['ner'], r['tokens'])) for r in tree['sentences']]
        results = [(k[0], k[1].lower()) for v in results for k in v if k[0] in self.entity_cols]
        ners = {k: [] for k in self.entity_cols}
        for k, v in results:
            ners[k].append(v)
        ners = {k: list(set(v)) for k, v in ners.items()}
        return ners

    # ===========================================
    # Stanford NER tagger (slow but better)
    # ===========================================
    def get_ner_tags(self, text):
        ners = {}
        terms = [(k, v) for k, v in self.st.tag(text.split()) if v != 'O']
        for t in self.entity_cols:
            ners[t] = list(set([re.sub('[^0-9a-zA-Z]+', ' ', k.lower()) for k, v in terms if v == t]))
        return ners

    # ===========================================
    # NLTK NER (very bad accuracy, a lot of garbage)
    # ===========================================
    def get_ner_nltk(self, text):
        sents = nltk.sent_tokenize(text)  # sentences
        tokenized_sents = [nltk.word_tokenize(s) for s in sents]
        tagged_sents = [nltk.pos_tag(s) for s in tokenized_sents]
        chunked_sents = [x for x in nltk.ne_chunk_sents(tagged_sents)]
        raw = self.traverseTree(chunked_sents)
        ners = {}
        for n in self.entity_cols:
            ners[n] = []
        for k, v in raw:
            ners[k].append(v.lower())
        for n in self.entity_cols:
            ners[n] = list(set(ners[n]))
        return ners

    def traverseTree(self, tree):
        result = []
        for subtree in tree:
            if type(subtree) == nltk.tree.Tree:
                if subtree.label() in self.entity_cols:
                    result += [(subtree.label(), subtree[0][0])]
                else:
                    result += (self.traverseTree(subtree))
        return result
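# Hedged usage sketch for the corenlp path of the ner class above; it assumes
# the snlp_path global points at a local CoreNLP install, and the sample
# sentence is an assumption.
if __name__ == '__main__':
    tagger = ner(lang='en', en_ner='corenlp')
    print tagger.get_ner_stanford_corenlp("Barack Obama visited Paris in 2009.")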
        if left == right:
            outputList.append(char)
    output = ''.join(outputList)
    return output


PRPList = ["He", "he", "She", "she", "His", "his", "Her", "him", "her",
           "him,", "him.", "her,", "her."]
monthElement = "january|february|march|april|may|june|july|august|september|october|november|december"
dateElement = "1|2|3|4|5|6|7|8|9|0"
monthPattern = re.compile(monthElement, re.IGNORECASE)
datePattern = re.compile(dateElement, re.IGNORECASE)

procCOR = CoreNLP("coref", corenlp_jars=[jar_path])
readFile = (open(file_path)).read()
filteredFile = bracketProcess(readFile)
dictCOR = procCOR.parse_doc(filteredFile)
entitiesCOR = dictCOR['entities']
sentencesCOR = dictCOR['sentences']

# collect, for every entity with more than one mention, the sentence index and
# token span of each mention
replaceList = []
for i in entitiesCOR:
    mentionList = i['mentions']
    if not len(mentionList) == 1:
        catchList = []
        for j in mentionList:
            item = [j['sentence']]
            item.append(j['tokspan_in_sentence'])
            catchList.append(item)
        replaceList.append(catchList)
def main():
    lines = getALines()

    # format folds inputs
    fsplits = open("splits")
    slines = fsplits.readlines()
    splits = list()
    for i in range(0, len(slines)):
        parts = slines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()
    print("Number of folds: " + str(NUM_SPLITS))

    fdict = open("sentiment_dictionary", "r")
    cv = pickle.loads(fdict.read())
    fdict.close()

    foutput = open("nlu_scores", "w")
    for fold in range(0, NUM_SPLITS):
        # for evaluation
        scores = {j: {i: 0 for i in ['correct', 'guessed', 'actual']} for j in ENT_TYPES}

        # get utterances
        in_utter = getUtterances(lines)
        take_utter = list()
        for i in range(0, len(in_utter)):
            if i in splits[fold][1]:
                take_utter.append(in_utter[i])
        in_utter = take_utter

        fclf = open("classifiers/sentiment_classifier" + str(fold), "r")
        clf = pickle.loads(fclf.read())
        fclf.close()

        proc = CoreNLP("pos", corenlp_jars=[PATH_TO_STANFORD_CORENLP])
        tagger = pycrfsuite.Tagger()
        tagger.open("taggers/advising_crf_tagger" + str(fold))

        # classify utterances
        for k in range(0, len(in_utter)):
            print("Current Utterance: " + in_utter[k][0])

            # get slots from utterance
            slots = getSlots(in_utter[k])
            print("Slots: " + str(slots))

            # constituency parse
            parsed = proc.parse_doc(in_utter[k][0])
            #print(parsed)
            #print(str(list(parsed['sentences'][0]['tokens'])))
            print("\n\n\n")
            print("Number of parsed sentences: " + str(len(parsed['sentences'])))

            spos_tlist = list()
            for i in range(0, len(parsed['sentences'])):
                spos_tuples = zip(parsed['sentences'][i]['tokens'],
                                  parsed['sentences'][i]['pos'])
                spos_tlist.append(spos_tuples)

            X_test = [crf_tagger.sent2featuresWithSent(s, in_utter[k][0]) for s in spos_tlist]
            y_pred = [tagger.tag(xseq) for xseq in X_test]
            print(parsed['sentences'][0]['tokens'])
            print(y_pred[0])

            ent_list = {i: [] for i in ENT_TYPES}
            for i in range(0, len(parsed['sentences'])):
                etemp = getEntities(parsed['sentences'][i]['tokens'], y_pred[i])
                for etype in ENT_TYPES:
                    ent_list[etype].extend(etemp[etype])
            for i in ENT_TYPES:
                print(i + ': ' + str(ent_list[i]))

            ent_outs = {i: [] for i in ENT_TYPES}
            for etype in ENT_TYPES:
                for i in range(len(ent_list[etype])):
                    ent_outs[etype].append(
                        getClassLabel(in_utter[k][0], ent_list[etype][i],
                                      y_pred[0], parsed['sentences'][0]['tokens'],
                                      cv, clf))

            for etype in ENT_TYPES:
                # generate tuples for comparison of classes
                tlist = list()
                seval = list()
                for q in range(len(slots[etype])):
                    scores[etype]['actual'] += 1
                    ent_t = {k: v for k, v in slots[etype][q].items() if k in ENT_TYPES[etype]}
                    tlist.append(ent_t)
                    seval.append(slots[etype][q]['sentiment'])
                for i in range(len(ent_outs[etype])):
                    scores[etype]['guessed'] += 1
                    if ent_list[etype][i] in tlist:
                        if seval[tlist.index(ent_list[etype][i])] == ent_outs[etype][i]:
                            scores[etype]['correct'] += 1
            print('current scores: ' + str(scores))

            # print output
            print("\n\nInput: " + in_utter[k][0])
            print("Output: ")
            for etype in ENT_TYPES:
                for i in range(len(ent_list[etype])):
                    print(etype + ': ' + str(ent_list[etype][i]) + " - " + str(ent_outs[etype][i]))

        precision = sum([scores[i]['correct'] for i in ENT_TYPES]) * 1.0 / sum(
            [scores[i]['guessed'] for i in ENT_TYPES])
        recall = sum([scores[i]['correct'] for i in ENT_TYPES]) * 1.0 / sum(
            [scores[i]['actual'] for i in ENT_TYPES])
        ith_row = [precision, recall]
        for i in ENT_TYPES:
            tprecision = scores[i]['correct'] * 1.0 / scores[i]['guessed']
            trecall = scores[i]['correct'] * 1.0 / scores[i]['actual']
            ith_row.append(tprecision)
            ith_row.append(trecall)
        foutput.write(str(ith_row) + '\n')
    foutput.close()
class NER:
    def __init__(self, lang):
        self.lang = lang
        self.config = ner_config

    def start_server(self):
        self.corenlp = CoreNLP(
            corenlp_jars=[
                os.path.join(self.config["CORENLP_HOME"], self.config[self.lang]["corenlp_jar"]),
                os.path.join(self.config["CORENLP_HOME"], self.config[self.lang]["corenlp_models_jar"]),
            ],
            server_port=self.config[self.lang]["port"],
            configdict=self.config[self.lang]["properties"],
        )
        print "Serving on http://%s:%s" % ("localhost", self.config[self.lang]["port"])

    # text = [paragraphs] (one per line)
    def query(self, text):
        if self.lang == "CMN":
            return self.stanford_ner(text)
        if self.lang == "SPA":
            return self.freeling_ner(text)
        if self.lang == "ENG":
            return self.stanford_ner(text)

    def stanford_ner(self, text):
        mentions = []
        for paragraph in text:
            paragraph_mentions = []
            response = self.corenlp.parse_doc(paragraph)
            sentences = response["sentences"]
            # print '\n\n', paragraph
            for sentence in sentences:
                paragraph_mentions.extend(self.process_stanford_sentence(sentence))
            mentions.append(paragraph_mentions)
        return mentions

    def process_stanford_sentence(self, sentence):
        mentions = []
        for index, word in enumerate(sentence["tokens"]):
            ner_type = sentence["ner"][index]
            if ner_type in stanford_good_entity_types:
                if index > 0 and sentence["ner"][index - 1] == ner_type:
                    # concat this token with the previous
                    # TODO: this is buggy, think of a better way (perhaps using
                    # the offsets and sentence.substring(start, end))
                    mentions[-1].word += " " + word
                    mentions[-1].end = sentence["char_offsets"][index][1]
                else:
                    mentions.append(
                        Mention(
                            word,
                            sentence["char_offsets"][index][0],
                            sentence["char_offsets"][index][1],
                            ner_type,
                            "name",
                            "link",
                        ))
        return mentions

    def freeling_ner(self, text):
        print "\n\nINPUT TEXT:", text
        entities = get_entities(text)
        mentions = []
        # build Mentions
        for (form, count, classification) in entities:
            print "FREELING FOUND: %s: %d | %s" % (form, count, classification)
            # word, begin, end, ner, name, link
            mentions.append(Mention(form, 0, 1, classification, "name", "link"))
        return mentions
class Postprocessing(object):
    def __init__(self, fdev, ftest):
        self.proc = CoreNLP('ssplit')
        self.get_results(fdev, ftest)

        # DEV
        dev_order, dev_gold = [], []
        DEV_DIR = u'../data/dev'
        for dir in os.listdir(DEV_DIR):
            if dir != u'.DS_Store':
                f = os.path.join(DEV_DIR, dir)
                for fname in os.listdir(f):
                    if fname != u'.DS_Store':
                        print os.path.join(f, fname)
                        _order, _gold = self.order(os.path.join(f, fname), u'dev')
                        dev_order.extend(_order)
                        dev_gold.extend(_gold)
        self.write_hyps(dev_order, fdev + '.ordered')
        utils.write_references('results/gold/dev.en', dev_gold)

        # TEST
        test_order, test_gold = [], []
        TEST_FILE = u'../data/test/triples/test.xml'
        _order, _gold = self.order(TEST_FILE, u'test')
        test_order.extend(_order)
        self.write_hyps(test_order, ftest + '.ordered')

        # save previous orders
        self.save_prev_order()

    def get_results(self, fdev, ftest):
        def read_file(fname):
            f = open(fname)
            doc = f.read()
            f.close()
            return doc.split('\n')

        # development set
        _set = u'dev'
        entries = Entry.objects(set=_set)
        devresults = read_file(fdev)

        self.dev_order, self.dev_key_order = {}, []
        self.dev_gold = {}
        for i, entry in enumerate(entries):
            semcategory = entry.category
            size = entry.size
            docid = entry.docid
            self.dev_order[(docid, size, semcategory, _set)] = devresults[i]
            self.dev_key_order.append([docid, size, semcategory, _set])

            texts = map(lambda x: x.text, entry.texts)
            self.dev_gold[(docid, size, semcategory, _set)] = texts

        # test set
        _set = u'test'
        entries = Entry.objects(set=_set)
        testresults = read_file(ftest)

        self.test_order, self.test_key_order = {}, []
        for i, entry in enumerate(entries):
            docid = entry.docid
            self.test_order[docid] = testresults[i]
            self.test_key_order.append(docid)

    def order(self, fname, _set):
        tree = ET.parse(fname)
        root = tree.getroot()

        entries = root.find('entries')
        order = []
        gold = []
        for _entry in entries:
            docid = _entry.attrib['eid']
            size = int(_entry.attrib['size'])
            semcategory = _entry.attrib['category']

            if _set == u'dev':
                # print (docid, size, semcategory, _set)
                # print self.dev_order[(docid, size, semcategory, _set)]
                # print 10 * '*'
                order.append(self.dev_order[(docid, size, semcategory, _set)])
                gold.append(self.dev_gold[(docid, size, semcategory, _set)])
            else:
                order.append(self.test_order[docid])
        return order, gold

    def save_prev_order(self):
        f = open('results/dev_prev_order.txt', 'w')
        for prev in self.dev_key_order:
            f.write('\t'.join(map(lambda x: str(x), prev)))
            f.write('\n')
        f.close()

        f = open('results/test_prev_order.txt', 'w')
        for prev in self.test_key_order:
            f.write(prev)
            f.write('\n')
        f.close()

    def write_hyps(self, order, fname):
        f = open(fname, 'w')
        for text in order:
            out = self.proc.parse_doc(text)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace('-RRB-', ')')
                text += ' '
            f.write(text.encode('utf-8'))
            f.write('\n')
        f.close()
def write_references(fname, refs):
    proc = CoreNLP('ssplit')

    # one output file per reference slot (fname1 .. fname7); reference i of
    # each entry, when present, is tokenized and written to file i+1
    fs = [open(fname + str(i), 'w') for i in range(1, 8)]
    for references in refs:
        for i, f in enumerate(fs):
            if len(references) >= i + 1:
                out = proc.parse_doc(references[i].lower())
                text = ''
                for snt in out['sentences']:
                    text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace('-RRB-', ')')
                    text += ' '
                f.write(text.encode('utf-8'))
                f.write('\n')
    for f in fs:
        f.close()
# import re
import numpy as np
from stanford_corenlp_pywrapper import CoreNLP  # loading the Stanford CoreNLP lib

data = "./extracted-quest/quest-en.txt"
loc = "/people/panou/Stage/projet/stanford-corenlp-full-2015-04-20/*"

# stopwords is the list of tokens we'd like to discard from the output
stopwords = [".", "?", "!", ","]

proc = CoreNLP("nerparse", corenlp_jars=[loc])
p = []
i = 1
with open(data, 'r') as inp:
    for line in inp:
        print "processing line " + str(i)
        p.append(proc.parse_doc(line))
        i += 1

with open('./phrases.txt', 'w') as out:
    for elmt in p:
        # print elmt["sentences"][0]["tokens"]
        for tok in elmt["sentences"][0]["lemmas"]:
            if tok not in stopwords:
                out.write(tok + '\n')
        out.write('\n')
class ManualDelexicalizer(object):
    def __init__(self, fname, _set='train'):
        self.proc = CoreNLP('parse')
        self._set = _set

        f = open(fname)
        doc = f.read()
        f.close()

        doc = doc.split((50 * '*') + '\n')
        print 'Doc size: ', len(doc)

        for entry in doc:
            entry = entry.split('\n\n')

            _, entryId, size, semcategory = entry[0].replace('\n', '').split()

            entity_map = dict(map(lambda entity: entity.split(' | '),
                                  entry[2].replace('\nENTITY MAP\n', '').split('\n')))

            lexEntries = entry[3].replace('\nLEX\n', '').split('\n-')[:-1]
            for lex in lexEntries:
                if lex[0] == '\n':
                    lex = lex[1:]
                lex = lex.split('\n')

                lexId = lex[0]
                text = lex[1].replace('TEXT: ', '').strip()
                template = lex[2].replace('TEMPLATE: ', '')
                correct = lex[3].replace('CORRECT: ', '').strip()
                comment = lex[4].replace('COMMENT: ', '').strip()

                if comment in ['g', 'good']:
                    print template
                    print 10 * '-'
                    self.update_template(entryId, size, semcategory, _set, lexId, template)

                    references = self.process_references(text, template, entity_map)
                    self.save_references(references)
                elif correct != '' and comment != 'wrong':
                    print correct
                    print 10 * '-'
                    self.update_template(entryId, size, semcategory, _set, lexId, correct)

                    references = self.process_references(text, correct, entity_map)
                    self.save_references(references)

    def _get_references_info(self, out, entities):
        '''
        Get syntactic position, text and sentence status of the references
        based on the dependency parser
        :param out: stanford corenlp result
        :param entities: tag - wikipedia id mapping
        :return:
        '''
        references = []
        for tag_entity in entities.iteritems():
            tag, entity = tag_entity
            refs, entity_removals = ref_delex.get_references(out, tag, entity)
            references.extend(refs)

        references = sorted(references, key=lambda x: (x['entity'], x['sentence'], x['pos']))

        sentence_statuses = {}
        for i, reference in enumerate(references):
            if i == 0 or (reference['entity'] != references[i - 1]['entity']):
                reference['text_status'] = 'new'
            else:
                reference['text_status'] = 'given'

            if reference['sentence'] not in sentence_statuses:
                sentence_statuses[reference['sentence']] = []

            if reference['entity'] not in sentence_statuses[reference['sentence']]:
                reference['sentence_status'] = 'new'
            else:
                reference['sentence_status'] = 'given'

            sentence_statuses[reference['sentence']].append(reference['entity'])

        references = sorted(references, key=lambda x: x['general_pos'])
        return references

    def _get_refexes(self, text, template, references):
        '''
        Extract referring expressions for each reference overlapping text and template
        :param text: original text
        :param template: template (delexicalized text)
        :param references: references
        :return:
        '''
        text = 'BEGIN BEGIN BEGIN ' + text
        template = 'BEGIN BEGIN BEGIN ' + template

        isOver = False
        while not isOver:
            stemplate = template.split()

            tag = ''
            pre_tag, pos_tag, i = [], [], 0
            for token in stemplate:
                i += 1
                if token.split('-')[0] in ['AGENT', 'PATIENT', 'BRIDGE']:
                    tag = token
                    for pos_token in stemplate[i:]:
                        if pos_token.split('-')[0] in ['AGENT', 'PATIENT', 'BRIDGE']:
                            break
                        else:
                            pos_tag.append(pos_token)
                    break
                else:
                    pre_tag.append(token)

            if tag == '':
                isOver = True
            else:
                regex = re.escape(' '.join(pre_tag[-3:]).strip()) + ' (.+?) ' + re.escape(' '.join(pos_tag[:3]).strip())
                f = re.findall(regex, text)

                if len(f) > 0:
                    refex = f[0]
                    template = template.replace(tag, refex, 1)

                    ref_type = 'name'
                    if refex.lower().strip() in ['he', 'his', 'him', 'she', 'hers', 'her',
                                                 'it', 'its', 'they', 'theirs', 'them']:
                        ref_type = 'pronoun'
                    elif refex.lower().strip().split()[0] in ['the', 'a', 'an']:
                        ref_type = 'description'
                    elif refex.lower().strip().split()[0] in ['this', 'these', 'that', 'those']:
                        ref_type = 'demonstrative'

                    for ref in references:
                        if ref['tag'] == tag and 'refex' not in ref:
                            ref['refex'] = refex
                            ref['reftype'] = ref_type
                            break
                else:
                    template = template.replace(tag, ' ', 1)
        return references

    def update_template(self, entryId, size, semcategory, _set, lexId, template):
        entry = Entry.objects(docid=entryId, size=size, category=semcategory, set=_set).first()
        for lexEntry in entry.texts:
            if lexEntry.docid == lexId:
                dbop.insert_template(lexEntry, template, 'manual')
                break

    def save_references(self, references):
        '''
        Save references and referring expressions extracted from the manual annotation
        :param references:
        :return:
        '''
        for reference in references:
            if 'refex' in reference:
                ref = dbop.save_reference(
                    entity=reference['entity'],
                    syntax=reference['syntax'],
                    text_status=reference['text_status'],
                    sentence_status=reference['sentence_status'])
                dbop.add_refex(ref, reference['reftype'], reference['refex'], 'manual')

    def process_references(self, text, template, entities):
        '''
        Obtain information of references and their referring expressions
        :param text:
        :param template:
        :param entities:
        :return:
        '''
        out = self.proc.parse_doc(text)
        text = []
        for i, snt in enumerate(out['sentences']):
            text.extend(snt['tokens'])
        text = ' '.join(text).replace('-LRB- ', '(').replace(' -RRB-', ')').strip()

        out = self.proc.parse_doc(template)['sentences']
        references = self._get_references_info(out, entities)
        references = self._get_refexes(text, template, references)
        return references
class DBInit(object):
    def __init__(self):
        self.proc = CoreNLP('parse')
        self.ner = json.load(open('../data/delexicalization/ner_dict.json'))
        self.semcategory = json.load(open('../data/delexicalization/delex_dict.json'))
        self.descriptions = json.load(open('../data/delexicalization/descriptions.json'))

    def run(self, dir, typeset):
        self.typeset = typeset
        for fname in os.listdir(dir):
            if fname != '.DS_Store':
                self.proc_file(os.path.join(dir, fname))

    def extract_entity_type(self, entity):
        aux = entity.split('^^')
        if len(aux) > 1:
            return aux[-1]

        aux = entity.split('@')
        if len(aux) > 1:
            return aux[-1]

        return 'wiki'

    def get_entity_info(self, entity):
        fner = filter(lambda key: entity in self.ner[key], self.ner)
        fsemcategory = filter(lambda key: entity in self.semcategory[key], self.semcategory)
        fdescription = filter(lambda key: entity in self.descriptions[key], self.descriptions)

        if len(fner) > 0:
            fner = fner[0]
        else:
            fner = ''

        if len(fsemcategory) > 0:
            fsemcategory = fsemcategory[0]
        else:
            fsemcategory = ''

        if len(fdescription) > 0:
            fdescription = fdescription[0]
        else:
            fdescription = ''

        return fner, fsemcategory, fdescription

    def extract_parse_tree(self, text):
        out = self.proc.parse_doc(text)

        parse_trees = []
        for snt in out['sentences']:
            parse_trees.append(snt['parse'])

        if len(parse_trees) > 1:
            parse = '(MULTI-SENTENCE '
            for tree in parse_trees:
                parse += tree + ' '
            parse = parse.strip() + ')'
        else:
            parse = parse_trees[0]
        return parse

    def proc_file(self, fname):
        tree = ET.parse(fname)
        root = tree.getroot()

        entries = root.find('entries')
        for _entry in entries:
            entry = dbop.save_entry(docid=_entry.attrib['eid'],
                                    size=int(_entry.attrib['size']),
                                    category=_entry.attrib['category'],
                                    set=self.typeset)

            entities_type = []
            # Reading original triples to extract the entities type
            otripleset = _entry.find('originaltripleset')
            for otriple in otripleset:
                e1, pred, e2 = otriple.text.split(' | ')

                entity1_type = self.extract_entity_type(e1.strip())
                entity2_type = self.extract_entity_type(e2.strip())

                types = {'e1_type': entity1_type, 'e2_type': entity2_type}
                entities_type.append(types)

            # Reading modified triples to extract entities and predicate
            mtripleset = _entry.find('modifiedtripleset')
            for i, mtriple in enumerate(mtripleset):
                e1, pred, e2 = mtriple.text.split(' | ')

                ner, semcategory, description = self.get_entity_info(e1)
                entity1 = dbop.save_entity(name=e1.replace('\'', '').strip(),
                                           type=entities_type[i]['e1_type'],
                                           ner=ner,
                                           category=semcategory,
                                           description=description)

                predicate = dbop.save_predicate(pred)

                ner, semcategory, description = self.get_entity_info(e2)
                entity2 = dbop.save_entity(e2.replace('\'', '').strip(),
                                           entities_type[i]['e2_type'],
                                           ner=ner,
                                           category=semcategory,
                                           description=description)

                triple = dbop.save_triple(entity1, predicate, entity2)
                dbop.add_triple(entry, triple)

            # process lexical entries
            lexEntries = _entry.findall('lex')
            for lexEntry in lexEntries:
                text = lexEntry.text.strip()
                parse_tree = self.extract_parse_tree(text)
                lexEntry = dbop.save_lexEntry(docid=lexEntry.attrib['lid'],
                                              comment=lexEntry.attrib['comment'],
                                              text=text,
                                              parse_tree=parse_tree)
                dbop.add_lexEntry(entry, lexEntry)
os.chdir(os.path.join(os.getcwd(), "bio_output"))
for i in range(len(doc)):
    files[file_names[i]] = doc[i].replace("\n", " ")
    file = open(file_names[i] + ".txt", "w")
    file.write(doc[i])
    file.close()

#%% check set of non-word characters and stopwords
os.chdir("/Users/apple/Documents/MSiA/Fall 2015/Text analytics/HW/hw3")
proc = CoreNLP("pos", corenlp_jars=["/Users/apple/corenlp/stanford-corenlp-full-2015-04-20/*"])
for i in files.keys():
    text = files[i]
    parsed = proc.parse_doc(text)
    to_flat = [x["lemmas"] for x in parsed["sentences"]]
    words = [item for sublist in to_flat for item in sublist]
    files[i] = words

#%%
# import stopwords from the nltk stopwords list; add 'I' since it should also
# be counted as a stopword
stopWord = set(stopwords.words('english'))
stopWord.add("I")

all_words = []
nonWords = re.compile(r"^\b[a-zA-Z]+-?[a-zA-Z]+$")
for i in files.keys():
    text = files[i]
    words = []
    for w in text:
        if nonWords.match(w):
        if prev_tok_char_end < tok_char_start:
            output.append(u" ")
        assert isinstance(tok, unicode)
        output.append(tok)
    return u"".join(output)


with open("corpora/" + args.corpus + "/raw/all.extract") as raw:
    count = 0
    for line in csv.reader(raw, delimiter="\t"):
        count += 1
        out = {}
        pubdate = line[1]
        headline = line[4]
        print headline
        text = proc.parse_doc(un_html_ify(line[5]))
        try:
            url = line[6]
        except IndexError:
            url = "unknown"
        for ln in text["sentences"]:
            ln["as_string"] = sent_to_string(ln)
            ln["phrases"] = get_phrases(ln["pos"], ln["tokens"])
        out["pubdate"] = pubdate
        out["headline"] = headline
        out["text"] = text
        out["url"] = url
        with open('corpora/' + args.corpus + '/processed/all.anno_plus', 'a') as outfile:
            json.dump(out, outfile)
            outfile.write("\n")
def get_core_nlp_parse(question):
    proc = CoreNLP("nerparse", corenlp_jars=[stanford_corenlp_path + "/" + "*"])
    core_nlp_parse = proc.parse_doc(question)
    return core_nlp_parse
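# get_core_nlp_parse boots a fresh JVM pipeline on every call, which is slow;
# a hedged variant (hypothetical names) that lazily caches one CoreNLP
# instance and reuses it across calls:
_NERPARSE_PROC = None

def get_core_nlp_parse_cached(question):
    global _NERPARSE_PROC
    if _NERPARSE_PROC is None:
        _NERPARSE_PROC = CoreNLP("nerparse",
                                 corenlp_jars=[stanford_corenlp_path + "/" + "*"])
    return _NERPARSE_PROC.parse_doc(question)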
proc = CoreNLP(
    configdict={'annotators': 'tokenize, ssplit, pos, parse, lemma, ner, entitymentions, dcoref'},
    #output_types=['pos','parse'],
    corenlp_jars=["/Users/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])

data_lemmas = copy.deepcopy(data_names)  # deep copy, otherwise data_clean changes since it is a list of objects

# lemmatize quotes and description
for row in data_lemmas:
    # Now it's ready to parse documents. You give it a string and it returns JSON-safe data structures
    # dictionary key = 'sentences', value = list of sentences
    # each sentence is a dictionary with key = 'lemmas', 'tokens', etc.
    # key = 'lemmas', value = list of lemmas
    for field in ("quote", "description"):
        parsed = proc.parse_doc(row[field])["sentences"]
        sentences = [sentence["lemmas"] for sentence in parsed]
        # flatten nested list so each element is a token
        row_tokenized = [token.strip() for sentence in sentences for token in sentence
                         if token.strip() not in stop and pattern.match(token.strip())]
        row_string = " ".join(row_tokenized)
        row[field] = row_string
        #row[field] = row_tokenized

print(data_names[0])
print(data_lemmas[0])

df_data_lemmas = pandas.DataFrame(data_lemmas,
                                  columns=["time", "character", "quote", "location", "description"])
df_data_lemmas.to_csv("data_lemmas.csv", sep=",", header=True, index=False, encoding='utf-8')
import copy
import os
import random
import re
import cPickle as p
import xml.etree.ElementTree as ET

from stanford_corenlp_pywrapper import CoreNLP


class Preprocessing(object):
    def __init__(self, in_train, in_dev, out_vocab, out_train, out_dev, out_test):
        self.proc = CoreNLP('ssplit')
        self.parser = CoreNLP('parse')

        self.in_train = in_train
        self.in_dev = in_dev

        self.out_vocab = out_vocab
        self.out_train = out_train
        self.out_dev = out_dev
        self.out_test = out_test

        self.text_id = 0

        self.trainset()
        self.testset()

    def trainset(self):
        input_vocab, output_vocab, character_vocab = set(), set(), set()

        train, dev = [], []
        train_info, dev_info = [], []

        dirs = filter(lambda x: x != '.DS_Store', os.listdir(self.in_train))
        for path in dirs:
            dirs2 = filter(lambda x: x != '.DS_Store',
                           os.listdir(os.path.join(self.in_train, path)))
            for fname in dirs2:
                f = open(os.path.join(self.in_train, path, fname))
                data, in_vocab, out_vocab, c_vocab = self.annotation_parse(f)

                input_vocab = input_vocab.union(in_vocab)
                output_vocab = output_vocab.union(out_vocab)
                character_vocab = character_vocab.union(c_vocab)

                # split the instances of this file 90/10 into train and dev by text id
                text_ids = list(set(map(lambda x: x['text_id'], data)))
                train_size = int(0.9 * len(text_ids))
                random.shuffle(text_ids)

                prev_train, prev_dev = len(train), len(dev)
                train.extend(filter(lambda x: x['text_id'] in text_ids[:train_size], data))
                dev.extend(filter(lambda x: x['text_id'] in text_ids[train_size:], data))

                # record provenance only for the instances added from this file
                train_info.extend((len(train) - prev_train) * [path + ' ' + fname])
                dev_info.extend((len(dev) - prev_dev) * [path + ' ' + fname])

        self.write(self.out_train, train, train_info)
        self.write(self.out_dev, dev, dev_info)

        with open(os.path.join(self.out_vocab, 'input_vocab.txt'), 'w') as f:
            f.write(('\n'.join(list(input_vocab))).encode('utf-8'))
        with open(os.path.join(self.out_vocab, 'output_vocab.txt'), 'w') as f:
            f.write(('\n'.join(list(output_vocab))).encode('utf-8'))
        with open(os.path.join(self.out_vocab, 'character_vocab.txt'), 'w') as f:
            f.write(('\n'.join(list(character_vocab))).encode('utf-8'))

    def testset(self):
        test = []
        test_info = []

        dirs = filter(lambda x: x != '.DS_Store', os.listdir(self.in_dev))
        for path in dirs:
            dirs2 = filter(lambda x: x != '.DS_Store',
                           os.listdir(os.path.join(self.in_dev, path)))
            for fname in dirs2:
                f = open(os.path.join(self.in_dev, path, fname))
                data, in_vocab, out_vocab, c_vocab = self.annotation_parse(f)

                test.extend(data)
                test_info.extend(len(data) * [path + ' ' + fname])

        self.write(self.out_test, test, test_info)

    def extract_entity_type(self, entity):
        aux = entity.split('^^')
        if len(aux) > 1:
            return aux[-1]

        aux = entity.split('@')
        if len(aux) > 1:
            return aux[-1]

        return 'wiki'

    def annotation_parse(self, doc):
        '''
        Parse an annotation document and extract references from the texts
        :param doc: annotation XML file
        :return: references and the input, output and character vocabularies
        '''
        tree = ET.parse(doc)
        root = tree.getroot()

        data = []
        input_vocab, output_vocab, character_vocab = set(), set(), set()

        entries = root.find('entries')
        for entry in entries:
            entryId = entry.attrib['eid']
            size = entry.attrib['size']
            semcategory = entry.attrib['category']

            # get entity map
            entitymap_xml = entry.find('entitymap')
            entity_map = {}
            for inst in entitymap_xml:
                tag, entity = inst.text.split(' | ')
                entity_map[tag] = entity

            # read the original triples to extract the entity types
            types = []
            otripleset = entry.find('originaltripleset')
            for otriple in otripleset:
                e1, pred, e2 = otriple.text.split(' | ')

                entity1_type = self.extract_entity_type(e1.strip())
                entity2_type = self.extract_entity_type(e2.strip())

                types.append({'e1_type': entity1_type, 'e2_type': entity2_type})

            # read the modified triples to extract entities and classify them by type
            mtripleset = entry.find('modifiedtripleset')
            entity_type = {}
            for i, mtriple in enumerate(mtripleset):
                e1, pred, e2 = mtriple.text.split(' | ')

                entity_type[e1.replace('\'', '')] = types[i]['e1_type']
                entity_type[e2.replace('\'', '')] = types[i]['e2_type']

            lexEntries = entry.findall('lex')
            for lex in lexEntries:
                try:
                    text = lex.find('text').text
                    template = lex.find('template').text

                    if template:
                        print('{}\r'.format(template))
                        text, template = self.stanford_parse(text, template)

                        references, in_vocab, out_vocab, c_vocab = self.get_refexes(
                            text, template, entity_map, entity_type)
                        data.extend(references)

                        input_vocab = input_vocab.union(in_vocab)
                        output_vocab = output_vocab.union(out_vocab)
                        character_vocab = character_vocab.union(c_vocab)
                except Exception as e:
                    print('ERROR')
                    print(str(e))

        return data, input_vocab, output_vocab, character_vocab

    def stanford_parse(self, text, template):
        '''
        Tokenize text and template
        :param text: original text
        :param template: original template
        :return: tokenized text and template
        '''
        out = self.proc.parse_doc(text)
        text = []
        for i, snt in enumerate(out['sentences']):
            text.extend(snt['tokens'])
        text = ' '.join(text).replace('-LRB-', '(').replace('-RRB-', ')').strip()

        out = self.proc.parse_doc(template)
        temp = []
        for i, snt in enumerate(out['sentences']):
            temp.extend(snt['tokens'])
        template = ' '.join(temp).replace('-LRB-', '(').replace('-RRB-', ')').strip()

        return text, template

    def write(self, fname, instances, info):
        if not os.path.exists(fname):
            os.mkdir(fname)

        pre_context = '\n'.join(map(lambda x: x['pre_context'], instances)).encode('utf-8')
        with open(os.path.join(fname, 'pre_context.txt'), 'w') as f:
            f.write(pre_context)

        pos_context = '\n'.join(map(lambda x: x['pos_context'], instances)).encode('utf-8')
        with open(os.path.join(fname, 'pos_context.txt'), 'w') as f:
            f.write(pos_context)

        entity = '\n'.join(map(lambda x: x['entity'], instances)).encode('utf-8')
        with open(os.path.join(fname, 'entity.txt'), 'w') as f:
            f.write(entity)

        refex = '\n'.join(map(lambda x: x['refex'], instances)).encode('utf-8')
        with open(os.path.join(fname, 'refex.txt'), 'w') as f:
            f.write(refex)

        size = '\n'.join(map(lambda x: str(x['size']), instances))
        with open(os.path.join(fname, 'size.txt'), 'w') as f:
            f.write(size)

        info = '\n'.join(info).encode('utf-8')
        with open(os.path.join(fname, 'info.txt'), 'w') as f:
            f.write(info)

        p.dump(instances, open(os.path.join(fname, 'data.cPickle'), 'w'))

    def get_reference_info(self, template, tag):
        '''
        Get information about a reference, such as its syntactic position
        :param template: template with the reference replaced by the given tag
        :param tag: tag marking the reference (e.g. ENTITY)
        :return: dictionary with syntax, sentence and position of the reference
        '''
        out = self.parser.parse_doc(template)['sentences']

        reference = {'syntax': '', 'sentence': -1, 'pos': -1, 'general_pos': -1, 'tag': tag}
        general_pos = 0
        for i, snt in enumerate(out):
            deps = snt['deps_cc']
            for dep in deps:
                # classify the syntactic position from the dependency label
                if snt['tokens'][dep[2]] == tag:
                    reference = {'syntax': '', 'sentence': i, 'pos': dep[2],
                                 'general_pos': general_pos + dep[2], 'tag': tag}
                    if 'nsubj' in dep[0] or 'nsubjpass' in dep[0]:
                        reference['syntax'] = 'np-subj'
                    elif 'nmod:poss' in dep[0] or 'compound' in dep[0]:
                        reference['syntax'] = 'subj-det'
                    else:
                        reference['syntax'] = 'np-obj'
                    break
            general_pos += len(snt['tokens'])

        return reference

    def process_template(self, template):
        '''
        Return the previous and subsequent tokens of the first tag in a template
        :param template: template (delexicalized text)
        :return: tokens before the tag, the tag itself and the tokens after it
        '''
        stemplate = template.split()

        tag = ''
        pre_tag, pos_tag, i = [], [], 0
        for token in stemplate:
            i += 1
            if token.split('-')[0] in ['AGENT', 'PATIENT', 'BRIDGE']:
                tag = token
                for pos_token in stemplate[i:]:
                    if pos_token.split('-')[0] in ['AGENT', 'PATIENT', 'BRIDGE']:
                        break
                    else:
                        pos_tag.append(pos_token)
                break
            else:
                pre_tag.append(token)

        return pre_tag, tag, pos_tag

    def process_context(self, context, entity_map):
        '''
        Return the pre- and pos-context of the first tag, wikified
        :param context: context with tags
        :param entity_map: map from tags to entities
        :return: lowercased pre- and pos-context
        '''
        scontext = context.split()

        pre_context, pos_context, i = [], [], 0
        for token in scontext:
            i += 1
            if token.split('-')[0] in ['AGENT', 'PATIENT', 'BRIDGE']:
                pos_context = scontext[i:]
                break
            else:
                pre_context.append(token)

        pre_context = ' '.join(['EOS'] + pre_context)
        pos_context = ' '.join(pos_context + ['EOS'])

        # replace the remaining tags by their normalized entity names
        for tag, entity in entity_map.iteritems():
            normalized = '_'.join(entity.replace('\"', '').replace('\'', '').lower().split())
            pre_context = pre_context.replace(tag, normalized)
            pos_context = pos_context.replace(tag, normalized)

        return pre_context.lower(), pos_context.lower()

    def classify(self, references):
        '''
        Classify referring expressions by their status and form
        :param references: references extracted from a text
        :return: references annotated with text status, sentence status and referential form
        '''
        references = sorted(references, key=lambda x: (x['entity'], x['sentence'], x['pos']))

        sentence_statuses = {}
        for i, reference in enumerate(references):
            # text status
            if i == 0 or (reference['entity'] != references[i - 1]['entity']):
                reference['text_status'] = 'new'
            else:
                reference['text_status'] = 'given'

            if reference['sentence'] not in sentence_statuses:
                sentence_statuses[reference['sentence']] = []

            # sentence status
            if reference['entity'] not in sentence_statuses[reference['sentence']]:
                reference['sentence_status'] = 'new'
            else:
                reference['sentence_status'] = 'given'

            sentence_statuses[reference['sentence']].append(reference['entity'])

            # referential form
            reg = reference['refex'].replace('eos', '').strip()
            reference['reftype'] = 'name'
            if reg.lower().strip() in ['he', 'his', 'him', 'she', 'hers', 'her',
                                       'it', 'its', 'we', 'our', 'ours',
                                       'they', 'theirs', 'them']:
                reference['reftype'] = 'pronoun'
            elif reg.lower().strip().split()[0] in ['the', 'a', 'an']:
                reference['reftype'] = 'description'
            elif reg.lower().strip().split()[0] in ['this', 'these', 'that', 'those']:
                reference['reftype'] = 'demonstrative'

        return references

    def get_refexes(self, text, template, entity_map, entity_type):
        '''
        Extract referring expressions for each reference by overlapping text and template
        :param text: original text
        :param template: template (delexicalized text)
        :param entity_map: map from tags to entities
        :param entity_type: map from entities to their types
        :return: references and the input, output and character vocabularies
        '''
        context = copy.copy(template)

        data, input_vocab, output_vocab, character_vocab = [], set(), set(), set()

        isOver = False
        while not isOver:
            pre_tag, tag, pos_tag = self.process_template(template)
            pre_context, pos_context = self.process_context(context, entity_map)

            if tag == '':
                isOver = True
            else:
                # look for the reference with context windows from 5-gram down to 2-gram
                i, f = 5, []
                while i > 1:
                    begin = ' '.join(i * ['BEGIN'])
                    text = begin + ' ' + text
                    template = begin + ' ' + template
                    pre_tag, tag, pos_tag = self.process_template(template)

                    regex = re.escape(' '.join(pre_tag[-i:]).strip()) + \
                        ' (.+?) ' + re.escape(' '.join(pos_tag[:i]).strip())
                    f = re.findall(regex, text)

                    template = template.replace('BEGIN', '').strip()
                    text = text.replace('BEGIN', '').strip()
                    i -= 1

                    if len(f) == 1:
                        break

                if len(f) > 0:
                    # DO NOT LOWERCASE HERE!
                    template = template.replace(tag, f[0], 1)
                    refex = f[0]

                    # do not include literals
                    entity = entity_map[tag]
                    if entity_type[entity] == 'wiki':
                        normalized = '_'.join(
                            entity.replace('\"', '').replace('\'', '').lower().split())

                        aux = context.replace(tag, 'ENTITY', 1)
                        reference = self.get_reference_info(aux, 'ENTITY')

                        character = ['eos'] + list(refex) + ['eos']
                        refex = ['eos'] + refex.split() + ['eos']
                        row = {
                            'pre_context': pre_context.replace('@', ''),
                            'pos_context': pos_context.replace('@', ''),
                            'entity': normalized,
                            'refex': ' '.join(refex),
                            'size': len(entity_map.keys()),
                            'syntax': reference['syntax'],
                            'text_id': self.text_id,
                            'general_pos': reference['general_pos'],
                            'sentence': reference['sentence'],
                            'pos': reference['pos'],
                            'text': text
                        }
                        data.append(row)

                        output_vocab = output_vocab.union(set(refex))
                        character_vocab = character_vocab.union(set(character))
                        input_vocab = input_vocab.union(set(pre_context.split()))
                        input_vocab = input_vocab.union(set(pos_context.split()))
                        input_vocab = input_vocab.union(set([normalized]))

                        context = context.replace(tag, normalized, 1)
                    else:
                        context = context.replace(tag, '_'.join(
                            entity_map[tag].replace('\"', '').replace('\'', '').lower().split()), 1)
                else:
                    template = template.replace(tag, ' ', 1)
                    context = context.replace(tag, '_'.join(
                        entity_map[tag].replace('\"', '').replace('\'', '').lower().split()), 1)

        self.text_id += 1
        data = self.classify(data)
        return data, input_vocab, output_vocab, character_vocab
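# A minimal usage sketch for the Preprocessing class above. Every path below
# is a hypothetical placeholder, not a path from the original project; the
# class only expects the input directories to contain the annotation XML
# files and the output directories to exist.
if __name__ == '__main__':
    Preprocessing(in_train='annotation/train',
                  in_dev='annotation/dev',
                  out_vocab='data',
                  out_train='data/train',
                  out_dev='data/dev',
                  out_test='data/test')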
def get_parallel(set, delex=True, size=10, evaluation=False):
    # assumes Entry (MongoDB model), utils and CoreNLP are imported at module level
    entries = Entry.objects(size__lte=size, set=set)
    proc = CoreNLP('ssplit')

    de, en, entity_maps = [], [], []
    for entry in entries:
        entity_map, predicates = utils.map_entities(entry.triples)
        entity2tag = utils.entity2tag(entity_map)

        # build the source side: a linearized sequence of triples
        source = ''
        for triple in entry.triples:
            agent = triple.agent.name
            tag_agent = entity2tag[agent]

            predicate = triple.predicate.name

            patient = triple.patient.name
            tag_patient = entity2tag[patient]

            if delex:
                source += tag_agent
            else:
                source += agent
            source += ' '
            source += predicate
            source += ' '
            if delex:
                source += tag_patient
            else:
                source += patient
            source += ' '

            # in the lexicalized setting, also add entity names as extra parallel pairs
            if not delex and set in ['train', 'dev'] and not evaluation:
                de.append(agent)
                name = ' '.join(agent.replace('\'', '').replace('\"', '').split('_'))
                out = proc.parse_doc(name)
                text = ''
                for snt in out['sentences']:
                    text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace('-RRB-', ')')
                    text += ' '
                en.append(text.strip())

                de.append(patient)
                name = ' '.join(patient.replace('\'', '').replace('\"', '').split('_'))
                out = proc.parse_doc(name)
                text = ''
                for snt in out['sentences']:
                    text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace('-RRB-', ')')
                    text += ' '
                en.append(text.strip())

        # build the target side: tokenized templates (delexicalized) or texts
        target_list = []
        for lexEntry in entry.texts:
            if delex and not evaluation:
                target = lexEntry.template
            else:
                target = lexEntry.text

            out = proc.parse_doc(target)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace('-RRB-', ')')
                text += ' '
            target = text.strip()
            target_list.append(target)

            print source
            print target
            print 10 * '-'

            if not evaluation:
                entity_maps.append(entity_map)
                de.append(source.strip())
                en.append(target)

        if evaluation:
            entity_maps.append(entity_map)
            de.append(source.strip())
            en.append(target_list)
        elif set == 'test':
            entity_maps.append(entity_map)
            de.append(source.strip())

    return de, en, entity_maps
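# A hedged sketch of driving get_parallel to dump parallel files for an NMT
# toolkit. The output file names are illustrative assumptions, not the
# original project's layout; when evaluation=False, de and en hold plain
# strings, so they can be joined directly.
if __name__ == '__main__':
    de, en, entity_maps = get_parallel('train', delex=True, size=10)
    with open('train.de', 'w') as f:
        f.write('\n'.join(de).encode('utf-8'))
    with open('train.en', 'w') as f:
        f.write('\n'.join(en).encode('utf-8'))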
    f.write(line)  # no need to add "\n" because it is already in the string
f.close()

#%% Tokenize ################################################################
# sentence segmentation of each line (each element is a list of sentences)
# Separating by sentence is ok because bigrams like (".", "I") will be ignored
tokenized_lines = []      # each element is a list with the tokens of a line
tokenized_sentences = []  # each element is a list with the tokens of a sentence
for line in text:
    # parse_doc takes a string and returns JSON-safe data structures:
    # a dictionary with key 'sentences' whose value is a list of sentences;
    # each sentence is a dictionary with keys 'lemmas', 'tokens', etc.
    parsed = proc.parse_doc(line)["sentences"]
    sentences = [sentence["lemmas"] for sentence in parsed]

    # flatten the nested list so each element is a token
    line_tokenized = [token for sublist in sentences for token in sublist]

    # add to the list where each element is a tokenized line
    if line_tokenized != []:
        tokenized_lines.append(line_tokenized)

    # add to the list where each element is a tokenized sentence
    for sentence in sentences:
        tokenized_sentences.append(sentence)

# save to file
len(tokenized_lines)  # 182 lines
f = open(out_file_name_normalized_line, "w")
import glob
import json

from stanford_corenlp_pywrapper import CoreNLP

proc = CoreNLP(
    "pos",
    corenlp_jars=["/Users/ahandler/stanford-corenlp-full-2015-04-20/*"])

# annotate every .txt file in each corpus directory, writing the CoreNLP
# output alongside it as a one-object JSON .anno file
for pattern in ["demos_congress/*txt", "demos/*txt", "demos_wilkerson/*txt"]:
    for fn in glob.glob(pattern):
        with open(fn, "r") as inf:
            procd = proc.parse_doc(inf.read())
        with open(fn.replace(".txt", ".anno"), "w") as outf:
            outf.write(json.dumps(procd))
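# Sketch of reading back one of the .anno files produced above: each file
# holds a single JSON object with a 'sentences' list. The file name here is
# a hypothetical example.
import json

with open("demos/example.anno") as inf:
    doc = json.load(inf)
for sent in doc["sentences"]:
    # 'tokens' and 'pos' are parallel lists, one entry per token
    print " ".join(sent["tokens"])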
from stanford_corenlp_pywrapper import CoreNLP
from pprint import pprint
import glob

proc = CoreNLP("ssplit", corenlp_jars=["stanford/stanford-corenlp-full-2015-04-20/*"])

path = 'data/engelhard/0/'
for filename in glob.glob(path + '*.txt'):
    print(filename)
    with open(filename, 'rU') as f:
        engelhard = f.read()
    # drop undecodable bytes before handing the text to CoreNLP
    engelhard2 = engelhard.decode('utf8', 'ignore')
    print(engelhard2)
    a = proc.parse_doc(engelhard2)
    pprint(a['sentences'][0]['tokens'])
""" Input is multiple text files. Each text file represents one document. Output is just as many text files, with the ".anno" extension instead. Each output file consists of one JSON object. USAGE proc_text_files.py MODE [files...] e.g. python proc_text_files.py pos *.txt """ import sys, re mode = sys.argv[1] from stanford_corenlp_pywrapper import CoreNLP ss = CoreNLP(mode, corenlp_jars=["/Users/Doctor_Einstein/Documents/stockMartket/analysis/nlp/stanford/*"]) for filename in sys.argv[2:]: outfile = re.sub(r'\.txt$',"", filename) + ".anno" print>>sys.stderr, "%s -> %s" % (filename, outfile) text = open(filename).read().decode('utf8', 'replace') jdoc = ss.parse_doc(text, raw=True) with open(outfile, 'w') as fp: print>>fp, jdoc
def main():
    proc = CoreNLP("pos",
                   corenlp_jars=["/home/is/daiki-ku/opt/stanford-corenlp-full-2016-10-31/*"])
    proc.parse_doc("hello world. how are you?")
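# For reference, a sketch of the shape parse_doc returns with the "pos"
# pipeline: a dict whose 'sentences' list carries parallel 'tokens' and
# 'pos' lists per sentence (the exact keys depend on the annotators
# requested). show_output is a hypothetical helper for illustration.
def show_output(proc):
    out = proc.parse_doc("hello world. how are you?")
    for snt in out['sentences']:
        for token, pos in zip(snt['tokens'], snt['pos']):
            print token, pos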
        # fragment: inside a loop over input lines; this branch runs when the
        # annotation block for the current utterance has just ended
        current_nonos += sum([1 for tok in cur_mapped if tok != 'O'])

        assert len(cur_mapped) == len(cur_pos) and len(cur_pos) == len(cur_parsed)
        # write one row per token: surface token, POS tag, annotation label
        for i in range(0, len(cur_mapped)):
            out_file.write(cur_parsed[i] + '\t' + cur_pos[i] + '\t' + cur_mapped[i] + '\n')
        out_file.write('\n')
        in_annotations = False
        continue

    if not in_annotations:
        # pad separators and course mentions with spaces so they tokenize cleanly
        cur_line = line.replace('/', ' / ').replace('EECS', ' EECS ').replace(
            'eecs', ' eecs ').replace('  ', ' ')
        cur_parsed = proc.parse_doc(cur_line)
        cur_pttok, cur_postk = [], []
        for sent in cur_parsed['sentences']:
            cur_pttok.extend(sent['tokens'])
            cur_postk.extend(sent['pos'])
        cur_parsed = cur_pttok
        cur_pos = cur_postk

        # every token starts with the 'O' (outside) label
        cur_mapped = ['O'] * len(cur_pttok)
        current_nonos = 0
        in_annotations = True
    else:
        # parse annotations: strip the surrounding '<' and '>' markers
        anno = line[1:] if line.startswith('<') else line
        anno = anno[:-1] if line.endswith('>') else anno