Example #1
def get_test_references():
    de, en = [], []
    proc = CoreNLP('ssplit')

    # Collect agent and patient references for every triple in the test set
    entries = Entry.objects(set='test')
    for entry in entries:
        for triple in entry.triples:
            agent = triple.agent.name
            patient = triple.patient.name

            de.append(agent)
            name = ' '.join(
                agent.replace('\'', '').replace('\"', '').split('_'))
            out = proc.parse_doc(name)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace(
                    '-RRB-', ')')
                text += ' '
            en.append(text.strip())

            de.append(patient)
            name = ' '.join(
                patient.replace('\'', '').replace('\"', '').split('_'))
            out = proc.parse_doc(name)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace(
                    '-RRB-', ')')
                text += ' '
            en.append(text.strip())
    return de, en
Example #2
 def __parse_text(self):
     if exists_in_s3('{}/{}'.format(s3_output_prefix, self.outfilename)):
         self.__load_parse_result()
         return
     ss = CoreNLP('parse', corenlp_jars = ['~/software/stanford-corenlp-full-2015-12-09/*'])
     self.parsed = ss.parse_doc(self.sentences)
     ss.cleanup()
Example #3
def phrases():
    # stopwords is the list of tokens we'd like to discard from the output
    stopwords = [".", "?", "!", ","]
    proc = CoreNLP("nerparse", corenlp_jars=[java])
    p = []
    i = 1
    print "####  Processing and formatting the extracted questions  ####"
    with open(quest, 'r') as inp:
        for line in inp:
            print "processing line " + str(i)
            p.append(proc.parse_doc(line))
            i += 1
    with open('./output/phrases.txt', 'w') as outp:
        with open('./output/ressources1.txt', 'w') as outr:
            for elmt in p:
                for tok in elmt["sentences"][0]["lemmas"]:
                    if tok not in stopwords:
                        print tok
                        outr.write(tok.encode('utf-8') + '\n')
                        outr.write('\n')
                for tok in elmt["sentences"][0]["tokens"]:
                    if tok not in stopwords:
                        outp.write(tok.encode('utf-8') + '\n')
                        outp.write('\n')
Example #4
class SimpleREG(object):
    def run(self, fin, fout):
        self.proc = CoreNLP('ssplit')

        entity_maps = p.load(open(os.path.join(fin, 'eval1.cPickle')))

        f = open(os.path.join(fin, 'eval1.bpe.de.output.postprocessed.dev'))
        texts = f.read().lower().split('\n')
        f.close()

        print len(texts), len(entity_maps)

        for i, text in enumerate(texts[:-1]):
            entity_map = entity_maps[i]
            for tag in entity_map:
                name = ' '.join(entity_map[tag].name.lower().replace('\'', '').replace('\"', '').split('_'))
                texts[i] = texts[i].replace(tag.lower(), str(name))

        f = open(fout, 'w')
        for text in texts:
            out = self.proc.parse_doc(text)['sentences']

            text = []
            for i, snt in enumerate(out):
                text.extend(snt['tokens'])
            text = ' '.join(text).replace('-LRB- ', '(').replace(' -RRB-', ')').strip()

            f.write(text.encode('utf-8'))
            f.write('\n')
        f.close()
Example #5
File: clean.py  Project: nitin7/WordBreak
def main(arg):
    dir = os.path.dirname(__file__)
    filename = os.path.join(dir, 'stanford-corenlp-python/stanford-corenlp-full-2014-08-27/*')
    configFileLoc = os.path.join(dir, 'config.ini')
    proc = CoreNLP(configfile=configFileLoc, corenlp_jars=[filename])
    with open(arg, "r") as file:
        data = removeHeadings(file)
        parsed = proc.parse_doc(data)
        data = []
        for s in parsed[u'sentences']:
            sent = str(' '.join(s[u'tokens']))
            data.append(sent.translate(string.maketrans("",""), string.punctuation))

        data1 = ".".join(data)
        data1 = data1.replace("..",".")
        data1 = data1.replace("  "," ")
        data1 = data1.replace(" .",". ")
        data2 = " ".join(data)
        data2 = data2.replace("  "," ")
        file_train1 = open("data/a1_train1.txt", "w")
        file_train1.write(data1)
        file_train1.close()
        
        file_train2 = open("data/a1_train2.txt", "w")
        file_train2.write(data2)
        file_train2.close()
        
        file_test1 = open("data/a1_test1.txt", "w")
        file_test1.write(clean1(data1))
        file_test1.close()

        file_test2 = open("data/a1_test2.txt", "w")
        file_test2.write(clean(data2))
        file_test2.close()
Example #6
 def __parse_text(self):
     if exists_in_s3('{}/{}'.format(s3_output_prefix, self.outfilename)):
         self.__load_parse_result()
         return
     ss = CoreNLP(
         'parse',
         corenlp_jars=['~/software/stanford-corenlp-full-2015-12-09/*'])
     self.parsed = ss.parse_doc(self.sentences)
     ss.cleanup()
Example #7
class StanfordNLP:
    def __init__(self):
        #self.server = ServerProxy(JsonRpc20(),
        #                         TransportTcpIp(addr=("127.0.0.1", 8080)))
        corenlp_dir="/usr/local/lib/stanford-corenlp-full-2017-06-09/*"

        self.server = CoreNLP(configdict={'annotators': 'tokenize,ssplit,pos,depparse,lemma,ner','depparse.model':'edu/stanford/nlp/models/parser/nndep/english_SD.gz'}, corenlp_jars=[corenlp_dir])

    def parse(self, text):
        return self.server.parse_doc(text)
Example #8
class StanfordPreprocessor(object):
    def __init__(self, homedir='./'):
        from stanford_corenlp_pywrapper import CoreNLP
        self.corenlp = CoreNLP(
            configdict={
                'annotators': 'tokenize, ssplit, pos, lemma, parse, ner'
            },
            output_types=['pos', 'lemma', 'parse', 'ner'],
            corenlp_jars=[homedir + "lib/stanford-corenlp-full-2015-04-20/*"])

    def parse(self, document):
        return self.corenlp.parse_doc(document)
Example #9
class SentenceDelimiter():
    def __init__(self, corenlp_path):
        self.proc = CoreNLP("ssplit", corenlp_jars=[os.path.join(corenlp_path, '*')])

    def get_sentences(self, text):
        res = self.proc.parse_doc(text)
        for sentence in res['sentences']:
            sentence_text = ' '.join(sentence['tokens']).encode('utf8')
            sentence_text = ' '.join(sentence_text.split())
            sentence_text = sentence_text.replace('-LRB-', '(').replace('-RRB-', ')')
            sentence_text = sentence_text.replace('-LSB-', '[').replace('-RSB-', ']')
            sentence_text = sentence_text.replace('-LCB-', '{').replace('-RCB-', '}')
            yield escape(sentence_text)
Example #10
def lemmatize(l):
    result = []

    from stanford_corenlp_pywrapper import CoreNLP
    proc = CoreNLP("pos", corenlp_jars=["stanford-corenlp-full-2015-04-20/*"], UnicodeDecodeError='skip')

    for doc_words in l:
        single_dict = proc.parse_doc(doc_words)
        row = []
        for each_dict in single_dict['sentences']:
            for word in each_dict['lemmas']:
                row.append(word)
        result.append(row)

    return result
Example #11
def write_hyps(hyps, fname):
    proc = CoreNLP('ssplit')

    f = open(fname, 'w')
    for hyp in hyps:
        out = proc.parse_doc(hyp)
        text = ''
        for snt in out['sentences']:
            text += ' '.join(snt['tokens']).replace('-LRB-',
                                                    '(').replace('-RRB-', ')')
            text += ' '

        f.write(text.encode('utf-8'))
        f.write('\n')
    f.close()
Example #12
class BioNLPEnrichment(BaseEnrichment):
    """
    """
    def __init__(self):
        """
        Load and initialize any external models or data here
        """
        self.corenlp = CoreNLP("pos", corenlp_jars=["./enrichments/stanford-corenlp-full-2015-12-09/*"])
    def enrichment_value(self,tweet):
        """ Calculate enrichment value """
        rep = self.corenlp.parse_doc(tweet["actor"]["summary"])
        return rep

    def __repr__(self):
        """ Add a description of the class's function here """
        return("Stanford core NLP applied to user bio")
Example #13
class StanfordNLP:
    def __init__(self):
        #self.server = ServerProxy(JsonRpc20(),
        #                         TransportTcpIp(addr=("127.0.0.1", 8080)))
        corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2017-06-09/*"

        self.server = CoreNLP(configdict={
            'annotators':
            'tokenize,ssplit,pos,depparse,lemma,ner',
            'depparse.model':
            'edu/stanford/nlp/models/parser/nndep/english_SD.gz'
        },
                              corenlp_jars=[corenlp_dir])

    def parse(self, text):
        return self.server.parse_doc(text)
Example #14
class kawata_corenlp_handler:
    def __init__(self):
        # self.proc = CoreNLP(configdict={'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'}, corenlp_jars=["/usr/local/lib/stanford-corenlp-full-2015-12-09/*"])
        self.proc = CoreNLP(
            configdict={
                'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'
            },
            corenlp_jars=[
                "/CORENLPDIRECTORY/stanford-corenlp-full-2015-12-09/*",
                "/Users/akira/stanford-corenlp-full-2015-12-09/sutime"
            ])

    def __join_text_date(self, text, date):
        '''
        Join text and date.
        '''
        date_s = dt.date2str(date)
        return '[<date>{0}</date>]\n{1}'.format(date_s, text)

    def get_words(self, text, date):
        n_text = unidecode.unidecode(text)
        joint_text = self.__join_text_date(n_text, date)
        joint_text = n_text

        p = self.proc.parse_doc(joint_text)["sentences"][0]
        # print p
        words = list()
        words = zip(p["ner"], p["tokens"], p["ner"])
        stop = stopwords.words("english")
        words = filter(lambda x: x[1] not in stop, words)
        words = map(lambda x: (x[0], x[1].lower(), x[2]), words)
        # I am not sure what is most suitable in the line above.
        ws = list()
        w = ("", "", "")
        for v in words:
            if v[0] != 'O' and v[0] == w[0]:
                w = (w[0], w[1] + " " + v[1], w[2])
            else:
                ws.append(w)
                w = v
        if w[0] != "":
            ws.append(w)
        words = ws

        return words[1:]
Example #15
class BodyNLPEnrichment(BaseEnrichment):
    """
    """

    def __init__(self):
        """
        Load and initialize any external models or data here
        """
        self.corenlp = CoreNLP("pos", corenlp_jars=["/home/jkolb/stanford-corenlp-full-2015-04-20/*"])

    def enrichment_value(self, tweet):
        """ Calculate enrichment value """
        rep = self.corenlp.parse_doc(tweet["body"])
        return rep

    def __repr__(self):
        """ Add a description of the class's function here """
        return "Stanford core NLP applied to tweet body"
Example #16
def split_and_tokenize(doc):
    '''
    Reads a text document, splits it into sentences and tokenizes them with the Python wrapper for Stanford CoreNLP.
    More info: https://github.com/brendano/stanford_corenlp_pywrapper
    :param doc: document name; its first line is used to locate the file under database.mpqa.2.0/docs/
    :return:
    '''
    parse_mode = "ssplit"  # tokenization and sentence splitting
    coreNlpPath = "/Users/ana/workspace/stanford_corenlp_pywrapper/stanford-corenlp-full-2017-06-09/*"

    parser = CoreNLP(parse_mode, corenlp_jars=[coreNlpPath])

    json_name = "database.mpqa.2.0/docs/" + doc.split("\n")[0] + ".json"
    if not os.path.exists(json_name):
        doc_path = "database.mpqa.2.0/docs/" + doc.split("\n")[0]
        document = codecs.open(doc_path, "r", encoding="utf-8").read()
        data_source_parse = parser.parse_doc(document)

        with open(json_name, 'w') as fp:
            json.dump(data_source_parse, fp, sort_keys=True, indent=2)
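A minimal usage sketch for the function above, assuming the database.mpqa.2.0 layout it expects and a hypothetical document name doc0001:

import json

# Hypothetical document name; split_and_tokenize derives both the input path and
# the JSON output name from the first line of the string it receives.
split_and_tokenize("doc0001\n")
with open("database.mpqa.2.0/docs/doc0001.json") as fp:
    parsed = json.load(fp)
for sentence in parsed["sentences"]:
    print " ".join(sentence["tokens"])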
Example #17
def main():
    if not os.path.exists(IN_FILE + '_rf'):
        print('First reformatting file...')
        out_format = open(IN_FILE + '_rf', 'w')
        with open(IN_FILE) as handle:
            for line in tqdm(handle):
                tline = line.strip()
                if tline == '':
                    out_format.write('\n')
                else:
                    out_format.write(tline + ' ')
        out_format.close()  # flush before the reformatted file is re-read below

    print('Sentence tokenizer!')
    print('Loading Stanford CoreNLP...')
    proc = CoreNLP(configdict={
        'annotators': 'tokenize,ssplit',
        'tokenize.options': 'ptb3Escaping=False'
    },
                   output_types=['tokenize,ssplit'],
                   corenlp_jars=[CORENLP_PATH])

    out_file = open(IN_FILE + '_sts', 'w')
    sentence_count = 0

    print('Opening file ' + IN_FILE + '_rf' + '...')
    with open(IN_FILE + '_rf') as handle:
        lines = handle.readlines()
        for line in tqdm(lines):
            the_text = line.strip()
            # Use Stanford instead
            parsed = proc.parse_doc(the_text)

            sentence_count += len(parsed['sentences'])
            for sent in parsed['sentences']:
                the_tokens = [i.replace(' ', '') for i in sent['tokens']]
                the_sent = ' '.join(the_tokens)
                assert len(the_sent.split(' ')) == len(sent['tokens'])
                out_file.write(the_sent.encode('utf-8') + '\n')
    print('Number of sentences so far: ' + '{:,}'.format(sentence_count))

    out_file.close()
Example #18
class StanfordParser(object):
    """
    Stanford parser
    """
    def __init__(self, corenlp_jars):
        self.proc = CoreNLP("parse", corenlp_jars=corenlp_jars)

    def parse(self, text):
        # {u'sentences':
        #     [
        #         {u'parse': u'(ROOT (S (VP (NP (INTJ (UH hello)) (NP (NN world)))) (. !)))'
        #          u'tokens': [u'hello', u'world', u'.'],
        #          u'lemmas': [u'hello', u'world', u'.'],
        #          u'pos': [u'UH', u'NN', u'.'],
        #          u'char_offsets': [[0, 5], [6, 11], [11, 12]]
        #          },
        #         ...
        #     ]
        # }
        json_rst = self.proc.parse_doc(text)
        if json_rst:
            for sent in json_rst['sentences']:
                parse_tree = sent['parse']
                yield parse_tree
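Given the per-sentence layout sketched in the comment above, a minimal usage sketch (the jar path is an assumption; point it at your own CoreNLP distribution):

# Sketch: print each constituency tree, then token/POS pairs from the raw annotations.
parser = StanfordParser(corenlp_jars=["/path/to/stanford-corenlp-full-2015-04-20/*"])
for tree in parser.parse("Hello world! CoreNLP returns one parse per sentence."):
    print tree

doc = parser.proc.parse_doc("Hello world!")
for sent in doc['sentences']:
    print zip(sent['tokens'], sent['pos'])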
Example #19
from stanford_corenlp_pywrapper import CoreNLP
import os
proc = CoreNLP("ner", corenlp_jars=["/Users/Jerry/Downloads/stanford-corenlp-full-2015-12-09/*"])
input_path = '/Users/Jerry/Documents/CMPS290H/Project/data/dataset'
output_path = '/Users/Jerry/Documents/CMPS290H/Project/data/dictionary/name.tsv'
#parse files
output = open(output_path,'w')
for filename in os.listdir(input_path):
	try:
		input_file = open(os.path.join(input_path, filename), 'r')
		x = input_file.read()
		out = proc.parse_doc(x)
		ner_tags = out['sentences'][0]['ner']
		num_tokens = len(ner_tags)
		lemmas = out['sentences'][0]['lemmas']
		first_indexes = (i for i in xrange(num_tokens) if ner_tags[i] == "PERSON" and (i == 0 or ner_tags[i-1] != "PERSON"))
		for begin_index in first_indexes:
		    # find the end of the PERSON phrase (consecutive tokens tagged as PERSON)
		    end_index = begin_index + 1
		    while end_index < num_tokens and ner_tags[end_index] == "PERSON":
		    	end_index += 1
		    end_index -= 1
		    mention_text = " ".join(map(lambda i: lemmas[i], xrange(begin_index, end_index + 1)))
		    print("%s %s" % (filename, mention_text))
		    output.write("%s\n" % mention_text)
	except IndexError:
		pass
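The begin/end index scan above can be wrapped in a small helper that collapses any run of identically tagged tokens into a mention; a sketch under that assumption:

def collect_mentions(sentence, label="PERSON"):
    # Collapse consecutive tokens that share the given NER label into mention strings.
    mentions, current = [], []
    for token, tag in zip(sentence['tokens'], sentence['ner']):
        if tag == label:
            current.append(token)
        elif current:
            mentions.append(' '.join(current))
            current = []
    if current:
        mentions.append(' '.join(current))
    return mentions

# e.g. collect_mentions(out['sentences'][0]) for the parse computed above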
Example #20
def dependency_structure(words, dependency):
    word = dependency[0]
    start = dependency[1]
    end = dependency[2]
    structure = word.encode("utf-8") 
    # Indexing starts at 1, so we add 1
    if word == "root":
        structure = "%s(ROOT-%s, %s-%s)" %(structure,start+1,words[end],end+1)
    else:
        structure = "%s(%s-%s, %s-%s)" %(structure,words[start],start+1,words[end],end+1)
    return structure.encode("utf-8")

for p in range(0,len(paragraphs)):
    paratext = paragraphs[p].replace("<p>","").replace("</p>","").replace("\t"," ").replace('"',"''").replace(",","")
    sentence_id = "%s@%s" %(article_id,p)
    try:
        nlp = proc.parse_doc(paratext)
        wordslist = nlp["sentences"][0]["tokens"]
        text = '"%s"' %(",".join(wordslist))
        # All commas must be replaced with "" from here on
        wordslist = [x.replace(',','""') for x in wordslist]
        words = "{%s}" %(",".join(wordslist))
        lemmas = [x.replace(',','""') for x in nlp["sentences"][0]["lemmas"]]
        lemmas = "{%s}" %(",".join(lemmas))
        pos = [x.replace(',','""') for x in nlp["sentences"][0]["pos"]]
        pos = "{%s}" %(",".join(pos))
        ner = "{%s}" %(",".join(nlp["sentences"][0]["ner"]))
        # This is a lookup for the terms, using the words
        dependencies = "{%s}" %(",".join(['""%s""' %(dependency_structure(wordslist,x)) for x in nlp["sentences"][0]["deps_cc"]]))
        # document_id | sentence | words | lemma | pos_tags | dependencies | ner_tags | sentence_offset | sentence_id 
        for_database = '%s,%s,"%s","%s","%s","%s","%s",%s,%s\n' %(article_id,text,words,lemmas,pos,dependencies,ner,p,sentence_id)
        filey.writelines(for_database)
Example #21
from stanford_corenlp_pywrapper import CoreNLP
from nltk import *
import os

proc = CoreNLP("parse", corenlp_jars=["stanford/stanford-corenlp-full-2015-04-20/*"])

#correct subdirectory by coded type goes here
#comment all this to do a single text file instead of a directory
path = 'data/engelhard/A/'
for filename in os.listdir(path):
  print(filename)
  with open(path+filename, 'rU') as f:
    engelhard = f.read()
    engelhard2 = engelhard.decode('utf8', 'ignore')
    trees = proc.parse_doc(engelhard2)
    print(engelhard2)

  #this is set as parse (constituency parsing: pos, lemmas, trees, dependencies) but you can also change it to different options, like:
  #ssplit for tokenization and sentence splitting
  #pos for pos and lemmas
  #ner for pos and ner and lemmas
  #parse for pos, lemmas, trees, dependencies
  #nerparse for parsing with ner, pos, lemmas, dependencies
  #coref for coreference including constituent parsing

  #comment this to do coref
  trees = proc.parse_doc(engelhard2)
  #print(trees)

  #uncomment this to do coref
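The mode strings listed in the comments above select preset annotator pipelines; a minimal sketch of switching modes, reusing the jar path from this example:

# Sketch: the first positional argument picks the pipeline preset
# (ssplit, pos, ner, parse, nerparse, coref).
tagger = CoreNLP("pos", corenlp_jars=["stanford/stanford-corenlp-full-2015-04-20/*"])
out = tagger.parse_doc("Dogs bark loudly.")
for snt in out['sentences']:
    print zip(snt['tokens'], snt['pos'], snt['lemmas'])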
Example #22
File: hw3.py  Project: linshiu/courses
# match: Determine if the RE matches at the beginning of the string.
# ^ = beginning of string, $ = end of string so https://www.coursera.org is ignored
pattern = re.compile(r'^[A-Za-z]+[-]?[A-Za-z]+$')

# stopwords list: add "I" since Stanford CoreNLP does not lowercase "I" (it does lowercase "she"),
# while the nltk stopwords list only contains the lowercase "i"
stop = set(stopwords.words('english'))
stop.add("I")

for doc_name in doc_names:
    
    # Now it's ready to parse documents. You give it a string and it returns JSON-safe data structures
    # dictionary key = 'sentences', value = list of sentences
    # each sentence dictionary with key='lemmas', 'tokens', etc
    # key = 'lemmas', value = list of lemmas 
    parsed = proc.parse_doc(doc_dic[doc_name])["sentences"]
    sentences = [sentence["lemmas"] for sentence in parsed]
    #flatten nested list so each element is a token
    doc_dic_normalized[doc_name] = [lemma for sentence in sentences for lemma in sentence 
        if pattern.match(lemma) and lemma not in stop]
    
# count number of tokens: 5256
len([v for ls in doc_dic_normalized.values() for v in ls])
    
# save documents
for name in doc_dic:
    f = open(os.path.join(out_file_bios_folder_normalized , name + ".txt"), "w")
    f.write(" ".join(doc_dic_normalized[name]))
    f.close()
    
Example #23
"""
Input is multiple text files.  Each text file represents one document.
Output is stdout, a stream of 2-column TSV
  DocID  \t  JsonAnnotations
where the DocID is based on the filename.

USAGE
proc_text_files.py MODE  [files...]

e.g.
python proc_text_files_to_stdout.py pos *.txt > allpos.anno
"""

import sys, re, os
mode = sys.argv[1]

from stanford_corenlp_pywrapper import CoreNLP
ss = CoreNLP(mode)  # need to override corenlp_jars

for filename in sys.argv[2:]:
    docid = os.path.basename(filename)
    docid = re.sub(r'\.txt$', "", docid)

    text = open(filename).read().decode('utf8', 'replace')
    jdoc = ss.parse_doc(text, raw=True)
    print("%s\t%s" % (docid, jdoc))
Example #24
class Ordering(object):
    def __init__(self):
        self.proc = CoreNLP('ssplit')

    def check_tagfrequency(self, entitymap, template):
        tag_freq = {}
        for tag, entity in entitymap.items():
            tag_freq[tag] = len(re.findall(tag, template))

        if 0 not in tag_freq.values():
            return True
        return False

    # Fixing the tags for the correct order
    def generate_template(self, triples, template, entitymap):
        '''
        :param triples:
        :param template:
        :param entitymap:
        :return:
        '''
        new_entitymap, predicates = utils.map_entities(triples)
        new_entitymap = dict(
            map(lambda x: (x[1].name, x[0]), new_entitymap.items()))
        new_template = []
        for token in template:
            if token in entitymap:
                new_template.append(new_entitymap[entitymap[token].name])
            else:
                new_template.append(token)
        return ' '.join(new_template).replace('-LRB-',
                                              '(').replace('-RRB-',
                                                           ')').strip()

    def process(self, entry):
        '''
        :param entry:
        :return:
        '''
        self.entry = entry
        entitymap, predicates = utils.map_entities(self.entry.triples)

        training_set = []
        for lex in self.entry.texts:
            template = lex.template
            delex_type = lex.delex_type

            if self.check_tagfrequency(entitymap, template):
                sort_triples, triples = [], copy.deepcopy(entry.triples)
                out = self.proc.parse_doc(template)

                prev_tags = []
                for i, snt in enumerate(out['sentences']):
                    tags = []

                    # get tags in order
                    for token in snt['tokens']:
                        if token in entitymap:
                            tags.append(token)

                    # Ordering the triples in the sentence i
                    sort_snt_triples, triples = self.order(
                        triples, entitymap, prev_tags, tags)
                    sort_triples.extend(sort_snt_triples)

                # Extract template for the sentence
                if len(triples) == 0:
                    template = []
                    for snt in out['sentences']:
                        template.extend(snt['tokens'])
                    template = self.generate_template(sort_triples, template,
                                                      entitymap)
                    training_set.append({
                        'sorted_triples': sort_triples,
                        'triples': entry.triples,
                        'template': template,
                        'lexEntry': lex,
                        'semcategory': entry.category,
                        'delex_type': delex_type
                    })
        return training_set

    def order(self, triples, entitymap, prev_tags, tags):
        triples_sorted = []
        for i in range(1, len(tags)):
            tag = tags[i]
            prev_tags.insert(0, tags[i - 1])

            for prev_tag in prev_tags:
                if 'AGENT' in tag and 'PATIENT' in prev_tag:
                    f = filter(
                        lambda triple: triple.agent.name == entitymap[tag].name
                        and triple.patient.name == entitymap[prev_tag].name,
                        triples)
                elif 'PATIENT' in tag and 'AGENT' in prev_tag:
                    f = filter(
                        lambda triple: triple.patient.name == entitymap[tag].
                        name and triple.agent.name == entitymap[prev_tag].name,
                        triples)
                else:
                    f = filter(
                        lambda triple:
                        (triple.agent.name == entitymap[tag].name and triple.
                         patient.name == entitymap[prev_tag].name) or
                        (triple.patient.name == entitymap[tag].name and triple.
                         agent.name == entitymap[prev_tag].name), triples)

                if len(f) > 0:
                    triples_sorted.append(f[0])
                    triples = filter(lambda triple: triple != f[0], triples)
                    break
        return triples_sorted, triples

    def update_db(self, trainingset):
        '''
        :param trainingset: set with triples, ordered triples, lexical entry and updated template
        :return:
        '''
        for row in trainingset:
            # Update database with template with right entity order id and ordered triples
            dbop.save_template(category=row['semcategory'],
                               triples=row['sorted_triples'],
                               template=row['template'],
                               delex_type=row['delex_type'])

    def write(self, trainingset, fname):
        result = []

        for row in trainingset:
            lex, triples, sorted_triples, template = row['lexEntry'], row[
                'triples'], row['sorted_triples'], row['template']

            row['triples'] = map(
                lambda triple: triple.agent.name + ' | ' + triple.predicate.
                name + ' | ' + triple.patient.name, row['triples'])
            row['sorted_triples'] = map(
                lambda triple: triple.agent.name + ' | ' + triple.predicate.
                name + ' | ' + triple.patient.name, row['sorted_triples'])
            result.append({
                'triples': row['triples'],
                'sorted': row['sorted_triples'],
                'semcategory': row['semcategory']
            })

            print row['triples']
            print row['sorted_triples']
            print template
            print 10 * '-'

        json.dump(result, open(fname, 'w'), indent=4, separators=(',', ': '))
Example #25
File: ner.py  Project: Marsan-Ma/tnative
class ner(object):
  def __init__(self, lang='en', en_ner=False):
    # feature parameters
    self.lang = lang

    # [NLTK NERTagger wrapper for Stanford NER] (too slow, results so-so)
    if en_ner == 'nltk':
      self.entity_cols = ['PERSON', 'ORG', 'LOCATION', 'FACILITY', 'GPE']
      self.sner_root = '/home/marsan/workspace/stanford_nlp/stanford-ner-2015-04-20'
      self.sner_classifier = self.sner_root+'/classifiers/english.all.3class.distsim.crf.ser.gz'
      self.sner_main = self.sner_root+'/stanford-ner.jar'
      self.st = NERTagger(self.sner_classifier, self.sner_main, encoding='utf-8')

    # [Stanford CoreNLP pywrapper] (still slow, results too noisy)
    if en_ner == 'corenlp':
      self.entity_cols = ['LOCATION', 'TIME', 'PERSON', 'ORGANIZATION', 'MONEY', 'PERCENT', 'DATE']
      self.snlp = CoreNLP("ner", corenlp_jars=["%s/stanford-corenlp-full-2015-04-20/*" % snlp_path])


  #===========================================
  #   Stanford CoreNLP pywrapper
  #===========================================
  def get_ner_stanford_corenlp(self, txt):
    tree = self.snlp.parse_doc(txt.upper())
    ners = {n: [] for n in self.entity_cols}
    results = [list(zip(r['ner'], r['tokens'])) for r in tree['sentences']]
    results = [(k[0], k[1].lower()) for v in results for k in v if k[0] in self.entity_cols]
    ners = {k: [] for k in self.entity_cols}
    for k,v in results: ners[k].append(v)
    ners = {k: list(set(v)) for k,v in ners.items()}
    return ners

  # #===========================================
  # #   Stanford NER via NLTK (slow but better)
  # #===========================================
  def get_ner_tags(self, text):
    ners = {}
    terms = [(k,v) for k,v in self.st.tag(text.split()) if v != 'O']
    for t in self.entity_cols:
      ners[t] = list(set([re.sub('[^0-9a-zA-Z]+', ' ', k.lower()) for k,v in terms if v == t]))
    return ners

  #===========================================
  #   NLTK NER (very bad accuracy, a lot of garbage)
  #===========================================
  def get_ner_nltk(self, text):
    sents = nltk.sent_tokenize(text)  # sentences
    tokenized_sents = [nltk.word_tokenize(s) for s in sents]
    tagged_sents = [nltk.pos_tag(s) for s in tokenized_sents]
    chunked_sents = [x for x in nltk.ne_chunk_sents(tagged_sents)]
    raw = self.traverseTree(chunked_sents)
    ners = {}
    for n in self.entity_cols: ners[n] = []
    [ners[k].append(v.lower()) for k,v in raw]
    for n in self.entity_cols: ners[n] = list(set(ners[n]))
    return ners

  def traverseTree(self, tree):
    result = []
    for subtree in tree:
      if type(subtree) == nltk.tree.Tree:
        if subtree.label() in self.entity_cols:
          result += [(subtree.label(), subtree[0][0])]
        else:
          result += (self.traverseTree(subtree))
    return result
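A minimal usage sketch for the CoreNLP-backed path above, assuming snlp_path is defined and points at the directory holding the CoreNLP distribution:

# Sketch: run the CoreNLP NER path on a short text and print the entity lists.
tagger = ner(lang='en', en_ner='corenlp')
entities = tagger.get_ner_stanford_corenlp("Barack Obama visited Paris in April 2015.")
for label, values in entities.items():
    print label, values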
Example #26
		if left == right:
			outputList.append(char)
	output = ''.join(outputList)
	return output


PRPList = ["He", "he", "She", "she", "His", "his", "Her", "him", "her", "him,", "him.", "her,", "her."]
monthElement = "january|february|march|april|may|june|july|august|september|october|november|december"
dateElement = "1|2|3|4|5|6|7|8|9|0"
monthPattern = re.compile(monthElement, re.IGNORECASE)
datePattern = re.compile(dateElement, re.IGNORECASE)

procCOR = CoreNLP("coref", corenlp_jars=[jar_path])
readFile = (open(file_path)).read()
filteredFile = bracketProcess(readFile)
dictCOR = procCOR.parse_doc(filteredFile)
entitiesCOR = dictCOR['entities']
sentencesCOR = dictCOR['sentences']


replaceList = []
for i in entitiesCOR:
	mentionList = i['mentions']
	if not len(mentionList) == 1:
		catchList = []
		for j in mentionList:
			item = [j['sentence']]
			item.append(j['tokspan_in_sentence'])
			catchList.append(item)
		replaceList.append(catchList)
Example #27
    corenlp_jars=[
        "/Users/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"
    ])

data_lemmas = copy.deepcopy(
    data_names)  # deep copy otherwise change data_clean since list of objects

# lemmatize quotes and description
for row in data_lemmas:
    # Now it's ready to parse documents. You give it a string and it returns JSON-safe data structures
    # dictionary key = 'sentences', value = list of sentences
    # each sentence dictionary with key='lemmas', 'tokens', etc
    # key = 'lemmas', value = list of lemmas

    for field in ("quote", "description"):
        parsed = proc.parse_doc(row[field])["sentences"]
        sentences = [sentence["lemmas"] for sentence in parsed]

        # flatten nested list so each element is a token
        row_tokenized = [
            token.strip() for sentence in sentences for token in sentence
            if token.strip() not in stop and pattern.match(token.strip())
        ]
        row_string = " ".join(row_tokenized)
        row[field] = row_string
        #row[field] = row_tokenized

print(data_names[0])
print(data_lemmas[0])

df_data_lemmas = pandas.DataFrame(
Example #28
def main():
    lines = getALines()
    # format folds inputs
    fsplits = open("splits")
    slines = fsplits.readlines()
    splits = list()
    for i in range(0, len(slines)):
        parts = slines[i].strip().split(":")
        train = list()
        test = list()
        for s in parts[0][1:-1].split(", "):
            train.append(int(s))
        for s in parts[1][1:-1].split(", "):
            test.append(int(s))
        splits.append((train, test))
    fsplits.close()

    print("Number of folds: " + str(NUM_SPLITS))
    fdict = open("sentiment_dictionary", "r")
    cv = pickle.loads(fdict.read())
    fdict.close()
    foutput = open("nlu_scores", "w")

    for fold in range(0, NUM_SPLITS):
        #for evaluation
        scores = {
            j: {i: 0
                for i in ['correct', 'guessed', 'actual']}
            for j in ENT_TYPES
        }
        #get utterances
        in_utter = getUtterances(lines)
        take_utter = list()
        for i in range(0, len(in_utter)):
            if i in splits[fold][1]:
                take_utter.append(in_utter[i])
        in_utter = take_utter

        fclf = open("classifiers/sentiment_classifier" + str(fold), "r")
        clf = pickle.loads(fclf.read())
        fclf.close()

        proc = CoreNLP("pos", corenlp_jars=[PATH_TO_STANFORD_CORENLP])
        tagger = pycrfsuite.Tagger()
        tagger.open("taggers/advising_crf_tagger" + str(fold))

        #classify utterances
        for k in range(0, len(in_utter)):
            print("Current Utterance: " + in_utter[k][0])
            #get slots from utterance
            slots = getSlots(in_utter[k])
            print("Slots: " + str(slots))
            # POS-tag the utterance with CoreNLP (the 'pos' pipeline)
            parsed = proc.parse_doc(in_utter[k][0])
            #print(parsed)
            #print(str(list(parsed['sentences'][0]['tokens'])))
            print("\n\n\n")
            print("Number of parsed sentences: " +
                  str(len(parsed['sentences'])))
            spos_tlist = list()
            for i in range(0, len(parsed['sentences'])):
                spos_tuples = zip(parsed['sentences'][i]['tokens'],
                                  parsed['sentences'][i]['pos'])
                spos_tlist.append(spos_tuples)
            X_test = [
                crf_tagger.sent2featuresWithSent(s, in_utter[k][0])
                for s in spos_tlist
            ]
            y_pred = [tagger.tag(xseq) for xseq in X_test]

            print(parsed['sentences'][0]['tokens'])
            print(y_pred[0])

            ent_list = {i: [] for i in ENT_TYPES}
            for i in range(0, len(parsed['sentences'])):
                etemp = getEntities(parsed['sentences'][i]['tokens'],
                                    y_pred[i])
                for etype in ENT_TYPES:
                    ent_list[etype].extend(etemp[etype])

            for i in ENT_TYPES:
                print(i + ': ' + str(ent_list[i]))

            ent_outs = {i: [] for i in ENT_TYPES}
            for etype in ENT_TYPES:
                for i in range(len(ent_list[etype])):
                    ent_outs[etype].append(
                        getClassLabel(in_utter[k][0], ent_list[etype][i],
                                      y_pred[0],
                                      parsed['sentences'][0]['tokens'], cv,
                                      clf))

            for etype in ENT_TYPES:
                #generate tuples for comparison of classes
                tlist = list()
                seval = list()
                for q in range(len(slots[etype])):
                    scores[etype]['actual'] += 1
                    ent_t = {
                        k: v
                        for k, v in slots[etype][q].items()
                        if k in ENT_TYPES[etype]
                    }
                    tlist.append(ent_t)
                    seval.append(slots[etype][q]['sentiment'])
                for i in range(len(ent_outs[etype])):
                    scores[etype]['guessed'] += 1
                    if ent_list[etype][i] in tlist:
                        if seval[tlist.index(
                                ent_list[etype][i])] == ent_outs[etype][i]:
                            scores[etype]['correct'] += 1

            print('current scores: ' + str(scores))

            #print output
            print("\n\nInput: " + in_utter[k][0])
            print("Output: ")
            for etype in ENT_TYPES:
                for i in range(len(ent_list[etype])):
                    print(etype + ': ' + str(ent_list[etype][i]) + " - " +
                          str(ent_outs[etype][i]))

        precision = sum([scores[i]['correct'] for i in ENT_TYPES]) * 1.0 / sum(
            [scores[i]['guessed'] for i in ENT_TYPES])
        recall = sum([scores[i]['correct'] for i in ENT_TYPES]) * 1.0 / sum(
            [scores[i]['actual'] for i in ENT_TYPES])
        ith_row = [precision, recall]

        for i in ENT_TYPES:
            tprecision = scores[i]['correct'] * 1.0 / scores[i]['guessed']
            trecall = scores[i]['correct'] * 1.0 / scores[i]['actual']
            ith_row.append(tprecision)
            ith_row.append(trecall)

        foutput.write(str(ith_row) + '\n')
    foutput.close()
Example #29
File: ner.py  Project: edvisees/EDL2015
class NER:
    def __init__(self, lang):
        self.lang = lang
        self.config = ner_config

    def start_server(self):
        self.corenlp = CoreNLP(
            corenlp_jars=[
                os.path.join(self.config["CORENLP_HOME"], self.config[self.lang]["corenlp_jar"]),
                os.path.join(self.config["CORENLP_HOME"], self.config[self.lang]["corenlp_models_jar"]),
            ],
            server_port=self.config[self.lang]["port"],
            configdict=self.config[self.lang]["properties"],
        )
        print "Serving on http://%s:%s" % ("localhost", self.config[self.lang]["port"])

    # text = [paragraphs] (one per line)
    def query(self, text):
        if self.lang == "CMN":
            return self.stanford_ner(text)
        if self.lang == "SPA":
            return self.freeling_ner(text)
        if self.lang == "ENG":
            return self.stanford_ner(text)

    def stanford_ner(self, text):
        mentions = []
        for paragraph in text:
            paragraph_mentions = []
            response = self.corenlp.parse_doc(paragraph)
            sentences = response["sentences"]
            # print '\n\n', paragraph
            for sentence in sentences:
                paragraph_mentions.extend(self.process_stanford_sentence(sentence))
            mentions.append(paragraph_mentions)
        return mentions

    def process_stanford_sentence(self, sentence):
        mentions = []
        for index, word in enumerate(sentence["tokens"]):
            ner_type = sentence["ner"][index]
            if ner_type in stanford_good_entity_types:
                if index > 0 and sentence["ner"][index - 1] == ner_type:
                    # concat this token with the previous
                    mentions[-1].word += (
                        " " + word
                    )  # TODO: this is buggy, think of a better way (perhaps using the offsets and sentence.substring(start, end))
                    mentions[-1].end = sentence["char_offsets"][index][1]
                else:
                    mentions.append(
                        Mention(
                            word,
                            sentence["char_offsets"][index][0],
                            sentence["char_offsets"][index][1],
                            ner_type,
                            "name",
                            "link",
                        )
                    )
        return mentions

    def freeling_ner(self, text):
        print "\n\nINPUT TEXT:", text
        entities = get_entities(text)
        mentions = []
        # build Mentions
        for (form, count, classification) in entities:
            print "FREELING FOUND: %s: %d | %s" % (form, count, classification)
            # word, begin, end, ner, name, link
            mentions.append(Mention(form, 0, 1, classification, "name", "link"))
        return mentions
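A minimal driver for the class above, assuming ner_config supplies CORENLP_HOME, the jar names, the port and the properties for the ENG entry:

# Sketch: start the server and tag one paragraph; query() expects a list of paragraphs.
ner_eng = NER("ENG")
ner_eng.start_server()
for paragraph_mentions in ner_eng.query(["Barack Obama met Angela Merkel in Berlin."]):
    for mention in paragraph_mentions:
        print mention.word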
Example #30
class Postprocessing(object):
    def __init__(self, fdev, ftest):
        self.proc = CoreNLP('ssplit')

        self.get_results(fdev, ftest)

        # DEV
        dev_order, dev_gold = [], []
        DEV_DIR = u'../data/dev'
        for dir in os.listdir(DEV_DIR):
            if dir != u'.DS_Store':
                f = os.path.join(DEV_DIR, dir)
                for fname in os.listdir(f):
                    if fname != u'.DS_Store':
                        print os.path.join(f, fname)
                        _order, _gold = self.order(os.path.join(f, fname), u'dev')
                        dev_order.extend(_order)
                        dev_gold.extend(_gold)
        self.write_hyps(dev_order, fdev + '.ordered')

        utils.write_references('results/gold/dev.en', dev_gold)

        # TEST
        test_order, test_gold = [], []
        TEST_FILE = u'../data/test/triples/test.xml'
        _order, _gold = self.order(TEST_FILE, u'test')
        test_order.extend(_order)
        self.write_hyps(test_order, ftest + '.ordered')

        # save previous orders
        self.save_prev_order()

    def get_results(self, fdev, ftest):
        def read_file(fname):
            f = open(fname)
            doc = f.read()
            f.close()

            return doc.split('\n')

        # development set
        _set = u'dev'
        entries = Entry.objects(set=_set)

        devresults = read_file(fdev)

        self.dev_order, self.dev_key_order = {}, []
        self.dev_gold = {}
        for i, entry in enumerate(entries):
            semcategory = entry.category
            size = entry.size
            docid = entry.docid
            self.dev_order[(docid, size, semcategory, _set)] = devresults[i]
            self.dev_key_order.append([docid, size, semcategory, _set])

            texts = map(lambda x: x.text, entry.texts)
            self.dev_gold[(docid, size, semcategory, _set)] = texts

        # test set
        _set = u'test'
        entries = Entry.objects(set=_set)

        testresults = read_file(ftest)

        self.test_order, self.test_key_order = {}, []
        for i, entry in enumerate(entries):
            docid = entry.docid
            self.test_order[docid] = testresults[i]
            self.test_key_order.append(docid)

    def order(self, fname, _set):
        tree = ET.parse(fname)
        root = tree.getroot()

        entries = root.find('entries')

        order = []
        gold = []

        for _entry in entries:
            docid = _entry.attrib['eid']
            size = int(_entry.attrib['size'])
            semcategory = _entry.attrib['category']

            if _set == u'dev':
                # print (docid, size, semcategory, _set)
                # print self.dev_order[(docid, size, semcategory, _set)]
                # print 10 * '*'

                order.append(self.dev_order[(docid, size, semcategory, _set)])
                gold.append(self.dev_gold[(docid, size, semcategory, _set)])
            else:
                order.append(self.test_order[docid])
        return order, gold

    def save_prev_order(self):
        f = open('results/dev_prev_order.txt', 'w')
        for prev in self.dev_key_order:
            f.write('\t'.join(map(lambda x: str(x), prev)))
            f.write('\n')
        f.close()

        f = open('results/test_prev_order.txt', 'w')
        for prev in self.test_key_order:
            f.write(prev)
            f.write('\n')
        f.close()

    def write_hyps(self, order, fname):
        f = open(fname, 'w')
        for text in order:
            out = self.proc.parse_doc(text)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace('-RRB-', ')')
                text += ' '

            f.write(text.encode('utf-8'))
            f.write('\n')
        f.close()
Example #31
def write_references(fname, refs):
    proc = CoreNLP('ssplit')

    # One output file per reference position: fname1 ... fname7
    files = [open(fname + str(i), 'w') for i in range(1, 8)]

    for references in refs:
        for i, f in enumerate(files):
            if len(references) > i:
                out = proc.parse_doc(references[i].lower())
                text = ''
                for snt in out['sentences']:
                    text += ' '.join(snt['tokens']).replace('-LRB-',
                                                            '(').replace('-RRB-', ')')
                    text += ' '
                f.write(text.encode('utf-8'))
            f.write('\n')

    for f in files:
        f.close()
Example #32
#import re
import numpy as np
from stanford_corenlp_pywrapper import CoreNLP

#Loading the Stanford CoreNLP Lib

data = "./extracted-quest/quest-en.txt"
loc= "/people/panou/Stage/projet/stanford-corenlp-full-2015-04-20/*"


#stopwords is the list of tokens we'd like to discard from the output
stopwords =[".","?","!",',']

proc = CoreNLP("nerparse",corenlp_jars=[loc])
p=[]
i=1
with open(data,'r') as inp:
    for line in inp:
        print "traitement de la ligne " + str(i)
        p.append(proc.parse_doc(line))
        i+=1

with open('./phrases.txt','w') as out:
    for elmt in p:
        #print elmt["sentences"][0]["tokens"]
        for tok in elmt["sentences"][0]["lemmas"]:
            if tok not in stopwords:
                out.write(tok.encode('utf-8') + '\n')
        out.write('\n')

Example #33
class ManualDelexicalizer(object):
    def __init__(self, fname, _set='train'):
        self.proc = CoreNLP('parse')
        self._set = _set

        f = open(fname)
        doc = f.read()
        f.close()

        doc = doc.split((50 * '*') + '\n')

        print 'Doc size: ', len(doc)

        for entry in doc:
            entry = entry.split('\n\n')

            _, entryId, size, semcategory = entry[0].replace('\n', '').split()

            entity_map = dict(
                map(lambda entity: entity.split(' | '),
                    entry[2].replace('\nENTITY MAP\n', '').split('\n')))

            lexEntries = entry[3].replace('\nLEX\n', '').split('\n-')[:-1]

            for lex in lexEntries:
                if lex[0] == '\n':
                    lex = lex[1:]
                lex = lex.split('\n')

                lexId = lex[0]
                text = lex[1].replace('TEXT: ', '').strip()
                template = lex[2].replace('TEMPLATE: ', '')
                correct = lex[3].replace('CORRECT: ', '').strip()
                comment = lex[4].replace('COMMENT: ', '').strip()

                if comment in ['g', 'good']:
                    print template
                    print 10 * '-'
                    self.update_template(entryId, size, semcategory, _set,
                                         lexId, template)
                    references = self.process_references(
                        text, template, entity_map)
                    self.save_references(references)
                elif correct != '' and comment != 'wrong':
                    print correct
                    print 10 * '-'
                    self.update_template(entryId, size, semcategory, _set,
                                         lexId, correct)
                    references = self.process_references(
                        text, correct, entity_map)
                    self.save_references(references)

    def _get_references_info(self, out, entities):
        '''
        Get the syntactic position, text status and sentence status of the references based on the dependency parse
        :param out: stanford corenlp result
        :param entities: tag - wikipedia id mapping
        :return:
        '''
        references = []
        for tag_entity in entities.iteritems():
            tag, entity = tag_entity
            refs, entity_removals = ref_delex.get_references(out, tag, entity)

            references.extend(refs)

        references = sorted(references,
                            key=lambda x:
                            (x['entity'], x['sentence'], x['pos']))

        sentence_statuses = {}
        for i, reference in enumerate(references):
            if i == 0 or (reference['entity'] != references[i - 1]['entity']):
                reference['text_status'] = 'new'
            else:
                reference['text_status'] = 'given'

            if reference['sentence'] not in sentence_statuses:
                sentence_statuses[reference['sentence']] = []

            if reference['entity'] not in sentence_statuses[
                    reference['sentence']]:
                reference['sentence_status'] = 'new'
            else:
                reference['sentence_status'] = 'given'

            sentence_statuses[reference['sentence']].append(
                reference['entity'])

        references = sorted(references, key=lambda x: x['general_pos'])
        return references

    def _get_refexes(self, text, template, references):
        '''
        Extract the referring expression for each reference by aligning the original text with the template
        :param text: original text
        :param template: template (delexicalized text)
        :param references: references
        :return:
        '''
        text = 'BEGIN BEGIN BEGIN ' + text
        template = 'BEGIN BEGIN BEGIN ' + template

        isOver = False
        while not isOver:
            stemplate = template.split()

            tag = ''
            pre_tag, pos_tag, i = [], [], 0
            for token in stemplate:
                i += 1
                if token.split('-')[0] in ['AGENT', 'PATIENT', 'BRIDGE']:
                    tag = token
                    for pos_token in stemplate[i:]:
                        if pos_token.split('-')[0] in [
                                'AGENT', 'PATIENT', 'BRIDGE'
                        ]:
                            break
                        else:
                            pos_tag.append(pos_token)
                    break
                else:
                    pre_tag.append(token)

            if tag == '':
                isOver = True
            else:
                regex = re.escape(' '.join(
                    pre_tag[-3:]).strip()) + ' (.+?) ' + re.escape(' '.join(
                        pos_tag[:3]).strip())
                f = re.findall(regex, text)

                if len(f) > 0:
                    refex = f[0]
                    template = template.replace(tag, refex, 1)

                    ref_type = 'name'
                    if refex.lower().strip() in [
                            'he', 'his', 'him', 'she', 'hers', 'her', 'it',
                            'its', 'they', 'theirs', 'them'
                    ]:
                        ref_type = 'pronoun'
                    elif refex.lower().strip().split()[0] in [
                            'the', 'a', 'an'
                    ]:
                        ref_type = 'description'
                    elif refex.lower().strip().split()[0] in [
                            'this', 'these', 'that', 'those'
                    ]:
                        ref_type = 'demonstrative'

                    for ref in references:
                        if ref['tag'] == tag and 'refex' not in ref:
                            ref['refex'] = refex
                            ref['reftype'] = ref_type
                            break
                else:
                    template = template.replace(tag, ' ', 1)
        return references

    def update_template(self, entryId, size, semcategory, _set, lexId,
                        template):
        entry = Entry.objects(docid=entryId,
                              size=size,
                              category=semcategory,
                              set=_set).first()

        for lexEntry in entry.texts:
            if lexEntry.docid == lexId:
                dbop.insert_template(lexEntry, template, 'manual')
                break

    def save_references(self, references):
        '''
        Save references and referring expressions extracted from the manual annotation
        :param references:
        :return:
        '''
        for reference in references:
            if 'refex' in reference:
                ref = dbop.save_reference(
                    entity=reference['entity'],
                    syntax=reference['syntax'],
                    text_status=reference['text_status'],
                    sentence_status=reference['sentence_status'])

                dbop.add_refex(ref, reference['reftype'], reference['refex'],
                               'manual')

    def process_references(self, text, template, entities):
        '''
        Obtain information of references and their referring expressions
        :param text:
        :param template:
        :param entities:
        :return:
        '''
        out = self.proc.parse_doc(text)
        text = []
        for i, snt in enumerate(out['sentences']):
            text.extend(snt['tokens'])
        text = ' '.join(text).replace('-LRB- ', '(').replace(' -RRB-',
                                                             ')').strip()

        out = self.proc.parse_doc(template)['sentences']
        references = self._get_references_info(out, entities)
        references = self._get_refexes(text, template, references)
        return references
Example #34
from stanford_corenlp_pywrapper import CoreNLP
from nltk import *
import os

proc = CoreNLP("parse",
               corenlp_jars=["stanford/stanford-corenlp-full-2015-04-20/*"])

#correct subdirectory by coded type goes here
#comment all this to do a single text file instead of a directory
path = 'data/engelhard/A/'
for filename in os.listdir(path):
    print(filename)
    with open(path + filename, 'rU') as f:
        engelhard = f.read()
        engelhard2 = engelhard.decode('utf8', 'ignore')
        trees = proc.parse_doc(engelhard2)
        print(engelhard2)

    #this is set as parse (constituency parsing: pos, lemmas, trees, dependencies) but you can also change it to different options, like:
    #ssplit for tokenization and sentence splitting
    #pos for pos and lemmas
    #ner for pos and ner and lemmas
    #parse for pos, lemmas, trees, dependencies
    #nerparse for parsing with ner, pos, lemmas, dependencies
    #coref for coreference including constituent parsing

    #comment this to do coref
    trees = proc.parse_doc(engelhard2)
    #print(trees)

    #uncomment this to do coref
Example #35
class DBInit(object):
    def __init__(self):
        self.proc = CoreNLP('parse')
        self.ner = json.load(open('../data/delexicalization/ner_dict.json'))
        self.semcategory = json.load(
            open('../data/delexicalization/delex_dict.json'))
        self.descriptions = json.load(
            open('../data/delexicalization/descriptions.json'))

    def run(self, dir, typeset):
        self.typeset = typeset

        for fname in os.listdir(dir):
            if fname != '.DS_Store':
                self.proc_file(os.path.join(dir, fname))

    def extract_entity_type(self, entity):
        aux = entity.split('^^')
        if len(aux) > 1:
            return aux[-1]

        aux = entity.split('@')
        if len(aux) > 1:
            return aux[-1]

        return 'wiki'

    def get_entity_info(self, entity):
        fner = filter(lambda key: entity in self.ner[key], self.ner)
        fsemcategory = filter(lambda key: entity in self.semcategory[key],
                              self.semcategory)
        fdescription = filter(lambda key: entity in self.descriptions[key],
                              self.descriptions)

        if len(fner) > 0:
            fner = fner[0]
        else:
            fner = ''

        if len(fsemcategory) > 0:
            fsemcategory = fsemcategory[0]
        else:
            fsemcategory = ''

        if len(fdescription) > 0:
            fdescription = fdescription[0]
        else:
            fdescription = ''

        return fner, fsemcategory, fdescription

    def extract_parse_tree(self, text):
        out = self.proc.parse_doc(text)

        parse_trees = []
        for snt in out['sentences']:
            parse_trees.append(snt['parse'])

        if len(parse_trees) > 1:
            parse = '(MULTI-SENTENCE '
            for tree in parse_trees:
                parse += tree + ' '
            parse = parse.strip() + ')'
        else:
            parse = parse_trees[0]
        return parse

    def proc_file(self, fname):
        tree = ET.parse(fname)
        root = tree.getroot()

        entries = root.find('entries')

        for _entry in entries:
            entry = dbop.save_entry(docid=_entry.attrib['eid'],
                                    size=int(_entry.attrib['size']),
                                    category=_entry.attrib['category'],
                                    set=self.typeset)

            entities_type = []

            # Reading original triples to extract the entities type
            otripleset = _entry.find('originaltripleset')
            for otriple in otripleset:
                e1, pred, e2 = otriple.text.split(' | ')

                entity1_type = self.extract_entity_type(e1.strip())
                entity2_type = self.extract_entity_type(e2.strip())

                types = {'e1_type': entity1_type, 'e2_type': entity2_type}
                entities_type.append(types)

            # Reading modified triples to extract entities and predicate
            mtripleset = _entry.find('modifiedtripleset')
            for i, mtriple in enumerate(mtripleset):
                e1, pred, e2 = mtriple.text.split(' | ')

                ner, semcategory, description = self.get_entity_info(e1)
                entity1 = dbop.save_entity(name=e1.replace('\'', '').strip(),
                                           type=entities_type[i]['e1_type'],
                                           ner=ner,
                                           category=semcategory,
                                           description=description)

                predicate = dbop.save_predicate(pred)

                ner, semcategory, description = self.get_entity_info(e2)
                entity2 = dbop.save_entity(e2.replace('\'', '').strip(),
                                           entities_type[i]['e2_type'],
                                           ner=ner,
                                           category=semcategory,
                                           description=description)

                triple = dbop.save_triple(entity1, predicate, entity2)

                dbop.add_triple(entry, triple)

            # process lexical entries
            lexEntries = _entry.findall('lex')
            for lexEntry in lexEntries:
                text = lexEntry.text.strip()
                parse_tree = self.extract_parse_tree(text)
                lexEntry = dbop.save_lexEntry(
                    docid=lexEntry.attrib['lid'],
                    comment=lexEntry.attrib['comment'],
                    text=text,
                    parse_tree=parse_tree)

                dbop.add_lexEntry(entry, lexEntry)
Example #36
0
os.chdir(os.path.join(os.getcwd(),"bio_output"))

for i in range(len(doc)):
    files[file_names[i]] = doc[i].replace("\n"," ")
    file = open(file_names[i]+".txt", "w")
    file.write(doc[i])
    file.close()

#%% check set of non-word characters and stopword
os.chdir("/Users/apple/Documents/MSiA/Fall 2015/Text analytics/HW/hw3")

proc = CoreNLP("pos", corenlp_jars=["/Users/apple/corenlp/stanford-corenlp-full-2015-04-20/*"])

for i in files.keys():
    text = files[i]
    parsed = proc.parse_doc(text)
    to_flat = [x["lemmas"] for x in parsed["sentences"]]
    words = [item for sublist in to_flat for item in sublist]
    files[i] = words
#%%
#import the English stopword list from NLTK and add 'I', since the default list only contains the lowercase form
stopWord = set(stopwords.words('english'))
stopWord.add("I")
all_words = []
nonWords = re.compile(r"^\b[a-zA-Z]+-?[a-zA-Z]+$")  # keep only alphabetic tokens, optionally hyphenated

for i in files.keys():
    text = files[i]
    words = []
    for w in text:
        if nonWords.match(w):
Example #37
0
            if prev_tok_char_end < tok_char_start:
                output.append(u" ")
        assert isinstance(tok, unicode)
        output.append(tok)
    return u"".join(output)


with open ("corpora/" + args.corpus + "/raw/all.extract") as raw:
    count = 0
    for line in csv.reader(raw, delimiter="\t"):
        count += 1
        out = {}
        pubdate = line[1]
        headline = line[4]
        print headline
        text = proc.parse_doc(un_html_ify(line[5]))
        try:
            url = line[6]
        except IndexError:
            url = "unknown"
        for ln in text["sentences"]:
            ln["as_string"] = sent_to_string(ln)
            ln["phrases"] = get_phrases(ln["pos"], ln["tokens"])
        out["pubdate"] = pubdate
        out["headline"] = headline
        out["text"] = text
        out["url"] = url
        with open('corpora/' + args.corpus + '/processed/all.anno_plus', 'a') as outfile:
            json.dump(out, outfile)
            outfile.write("\n")
Example #38
0
from stanford_corenlp_pywrapper import CoreNLP
import os
proc = CoreNLP(
    "ner",
    corenlp_jars=["/Users/Jerry/Downloads/stanford-corenlp-full-2015-12-09/*"])
input_path = '/Users/Jerry/Documents/CMPS290H/Project/data/dataset'
output_path = '/Users/Jerry/Documents/CMPS290H/Project/data/dictionary/name.tsv'
#parse files
output = open(output_path, 'w')
for filename in os.listdir(input_path):
    try:
        input_file = open(os.path.join(input_path, filename), 'r')
        x = input_file.read()
        out = proc.parse_doc(x)
        ner_tags = out['sentences'][0]['ner']
        num_tokens = len(ner_tags)
        lemmas = out['sentences'][0]['lemmas']
        first_indexes = (i for i in xrange(num_tokens)
                         if ner_tags[i] == "PERSON" and (
                             i == 0 or ner_tags[i - 1] != "PERSON"))
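        # Worked example: for ner_tags = ['O', 'PERSON', 'PERSON', 'O', 'PERSON'],
        # first_indexes yields 1 and 4; the loop below extends each span over
        # consecutive PERSON tags, producing mentions at tokens [1..2] and [4..4].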
        for begin_index in first_indexes:
            # find the end of the PERSON phrase (consecutive tokens tagged as PERSON)
            end_index = begin_index + 1
            while end_index < num_tokens and ner_tags[end_index] == "PERSON":
                end_index += 1
            end_index -= 1
            mention_text = " ".join(
                map(lambda i: lemmas[i], xrange(begin_index, end_index + 1)))
            print("%s %s" % (filename, mention_text))
            output.write("%s\n" % mention_text)
    except IndexError:
Example #39
0
def get_core_nlp_parse(question):
    proc = CoreNLP("nerparse",
                   corenlp_jars=[stanford_corenlp_path + "/" + "*"])
    core_nlp_parse = proc.parse_doc(question)
    return core_nlp_parse
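# A hedged alternative sketch: building a CoreNLP pipeline is expensive, so a
# module-level instance (same jar path as above) could be reused across calls.
# The cached helper name below is hypothetical:
# _PROC = CoreNLP("nerparse", corenlp_jars=[stanford_corenlp_path + "/" + "*"])
# def get_core_nlp_parse_cached(question):
#     return _PROC.parse_doc(question)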
Example #40
0
p = CoreNLP(configdict={'annotators':'tokenize, ssplit, pos, parse, lemma, ner,entitymentions, dcoref'}, 
            #output_types=['pos','parse'],
            corenlp_jars=["/Users/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])
            
            
data_lemmas = copy.deepcopy(data_names) # deep copy, otherwise edits below would also modify data_names (a list of mutable rows)

# lemmatize quotes and description
for row in data_lemmas:
    # Now it's ready to parse documents. You give it a string and it returns JSON-safe data structures
    # dictionary key = 'sentences', value = list of sentences
    # each sentence dictionary with key='lemmas', 'tokens', etc
    # key = 'lemmas', value = list of lemmas 
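    # Illustrative shape only (keys as used below):
    # {'sentences': [{'tokens': ['The', 'cats', 'sat', '.'],
    #                 'lemmas': ['the', 'cat', 'sit', '.'], ...}, ...]}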
    
    for field in ("quote","description"):
        parsed = proc.parse_doc(row[field])["sentences"]
        sentences = [sentence["lemmas"] for sentence in parsed]
        
        # flatten nested list so each element is a token
        row_tokenized = [token.strip() for sentence in sentences for token in sentence
                         if token.strip() not in stop and pattern.match(token.strip())]
        row_string = " ".join(row_tokenized)
        row[field] = row_string
        #row[field] = row_tokenized           
        
print(data_names[0])
print(data_lemmas[0])

df_data_lemmas = pandas.DataFrame(data_lemmas, columns = ["time","character", "quote", "location", "description"])            
df_data_lemmas.to_csv("data_lemmas.csv", sep=",", header=True, index=False, encoding = 'utf-8')
      
Example #41
0
class Preprocessing(object):
    def __init__(self, in_train, in_dev, out_vocab, out_train, out_dev, out_test):
        self.proc = CoreNLP('ssplit')
        self.parser = CoreNLP('parse')
        self.in_train = in_train
        self.in_dev = in_dev

        self.out_vocab = out_vocab
        self.out_train = out_train
        self.out_dev = out_dev
        self.out_test = out_test

        self.text_id = 0
        self.trainset()
        self.testset()


    def trainset(self):
        input_vocab, output_vocab, character_vocab = set(), set(), set()
        train, dev = [], []
        train_info, dev_info = [], []

        dirs = filter(lambda x: x != '.DS_Store', os.listdir(self.in_train))
        for path in dirs:
            dirs2 = filter(lambda x: x != '.DS_Store', os.listdir(os.path.join(self.in_train, path)))
            for fname in dirs2:
                f = open(os.path.join(self.in_train, path, fname))

                data, in_vocab, out_vocab, c_vocab = self.annotation_parse(f)

                input_vocab = input_vocab.union(in_vocab)
                output_vocab = output_vocab.union(out_vocab)
                character_vocab = character_vocab.union(c_vocab)

                text_ids = list(set(map(lambda x: x['text_id'], data)))

                train_size = int(0.9 * len(text_ids))

                random.shuffle(text_ids)
                added_train = filter(lambda x: x['text_id'] in text_ids[:train_size], data)
                added_dev = filter(lambda x: x['text_id'] in text_ids[train_size:], data)
                train.extend(added_train)
                dev.extend(added_dev)

                # keep info parallel to train/dev: one entry per newly added instance
                train_info.extend(len(added_train) * [path + ' ' + fname])
                dev_info.extend(len(added_dev) * [path + ' ' + fname])

        self.write(self.out_train, train, train_info)
        self.write(self.out_dev, dev, dev_info)

        with open(os.path.join(self.out_vocab, 'input_vocab.txt'), 'w') as f:
            f.write(('\n'.join(list(input_vocab))).encode("utf-8"))

        with open(os.path.join(self.out_vocab, 'output_vocab.txt'), 'w') as f:
            f.write(('\n'.join(list(output_vocab))).encode("utf-8"))

        with open(os.path.join(self.out_vocab, 'character_vocab.txt'), 'w') as f:
            f.write(('\n'.join(list(character_vocab))).encode("utf-8"))


    def testset(self):
        test = []
        test_info = []

        dirs = filter(lambda x: x != '.DS_Store', os.listdir(self.in_dev))
        for path in dirs:
            dirs2 = filter(lambda x: x != '.DS_Store', os.listdir(os.path.join(self.in_dev, path)))
            for fname in dirs2:
                f = open(os.path.join(self.in_dev, path, fname))

                data, in_vocab, out_vocab, c_vocab = self.annotation_parse(f)

                test.extend(data)

                info = len(data) * [path + ' ' + fname]
                test_info.extend(info)

        self.write(self.out_test, test, test_info)

    def extract_entity_type(self, entity):
        aux = entity.split('^^')
        if len(aux) > 1:
            return aux[-1]

        aux = entity.split('@')
        if len(aux) > 1:
            return aux[-1]

        return 'wiki'

    def annotation_parse(self, doc):
        '''
        Parse an annotation document and extract references from the texts
        :param doc:
        :return:
        '''
        tree = ET.parse(doc)
        root = tree.getroot()

        data = []
        input_vocab, output_vocab, character_vocab = set(), set(), set()

        entries = root.find('entries')
        for entry in entries:
            entryId = entry.attrib['eid']
            size = entry.attrib['size']
            semcategory = entry.attrib['category']

            # get entity map
            entitymap_xml = entry.find('entitymap')
            entity_map = {}
            for inst in entitymap_xml:
                tag, entity = inst.text.split(' | ')
                entity_map[tag] = entity

            # Reading original triples to extract the entities type
            types = []
            otripleset = entry.find('originaltripleset')
            for otriple in otripleset:
                e1, pred, e2 = otriple.text.split(' | ')

                entity1_type = self.extract_entity_type(e1.strip())
                entity2_type = self.extract_entity_type(e2.strip())

                types.append({'e1_type':entity1_type, 'e2_type':entity2_type})

            # Reading modified triples to extract entities and classify them according to type
            mtripleset = entry.find('modifiedtripleset')
            entity_type = {}
            for i, mtriple in enumerate(mtripleset):
                e1, pred, e2 = mtriple.text.split(' | ')

                entity_type[e1.replace('\'', '')] = types[i]['e1_type']
                entity_type[e2.replace('\'', '')] = types[i]['e2_type']

            lexEntries = entry.findall('lex')

            for lex in lexEntries:
                try:
                    text = lex.find('text').text
                    template = lex.find('template').text

                    if template:
                        print('{}\r'.format(template))
                        text, template = self.stanford_parse(text, template)
                        references, in_vocab, out_vocab, c_vocab = self.get_refexes(text, template, entity_map, entity_type)
                        data.extend(references)
                        input_vocab = input_vocab.union(in_vocab)
                        output_vocab = output_vocab.union(out_vocab)
                        character_vocab = character_vocab.union(c_vocab)
                except Exception as e:
                    print('ERROR')
                    print(e.message)

        return data, input_vocab, output_vocab, character_vocab


    def stanford_parse(self, text, template):
        '''
        Tokenizing text and template
        :param text: original text
        :param template: original template
        :return: Tokenized text and template
        '''
        out = self.proc.parse_doc(text)
        text = []
        for i, snt in enumerate(out['sentences']):
            text.extend(snt['tokens'])
        text = ' '.join(text).replace('-LRB-', '(').replace('-RRB-', ')').strip()

        out = self.proc.parse_doc(template)
        temp = []
        for i, snt in enumerate(out['sentences']):
            temp.extend(snt['tokens'])
        template = ' '.join(temp).replace('-LRB-', '(').replace('-RRB-', ')').strip()

        return text, template


    def write(self, fname, instances, info):
        if not os.path.exists(fname):
            os.mkdir(fname)

        pre_context = '\n'.join(map(lambda x: x['pre_context'], instances)).encode('utf-8')
        with open(os.path.join(fname, 'pre_context.txt'), 'w') as f:
            f.write(pre_context)
        pos_context = '\n'.join(map(lambda x: x['pos_context'], instances)).encode('utf-8')
        with open(os.path.join(fname, 'pos_context.txt'), 'w') as f:
            f.write(pos_context)
        entity = '\n'.join(map(lambda x: x['entity'], instances)).encode('utf-8')
        with open(os.path.join(fname, 'entity.txt'), 'w') as f:
            f.write(entity)
        refex = '\n'.join(map(lambda x: x['refex'], instances)).encode('utf-8')
        with open(os.path.join(fname, 'refex.txt'), 'w') as f:
            f.write(refex)
        size = '\n'.join(map(lambda x: str(x['size']), instances))
        with open(os.path.join(fname, 'size.txt'), 'w') as f:
            f.write(size)
        info = '\n'.join(info).encode('utf-8')
        with open(os.path.join(fname, 'info.txt'), 'w') as f:
            f.write(info)

        p.dump(instances, open(os.path.join(fname, 'data.cPickle'), 'w'))


    def get_reference_info(self, template, tag):
        '''
        Get info about a reference, such as its syntactic role and position
        :param template: template containing the reference placeholder
        :param tag: placeholder token to locate (e.g. 'ENTITY')
        :return: dict with syntax, sentence, pos, general_pos and tag
        '''
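        # Hedged example: for the template 'ENTITY wrote the book .', a dependency
        # like ('nsubj', <wrote>, <ENTITY>) makes dep[0] contain 'nsubj', so the
        # reference is labelled 'np-subj'; 'nmod:poss'/'compound' relations map to
        # 'subj-det', and anything else falls back to 'np-obj'.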
        out = self.parser.parse_doc(template)['sentences']
        reference = {'syntax':'', 'sentence':-1, 'pos':-1, 'general_pos':-1, 'tag':tag}
        general_pos = 0
        for i, snt in enumerate(out):
            deps = snt['deps_cc']
            for dep in deps:
                # get syntax
                if snt['tokens'][dep[2]] == tag:
                    reference = {'syntax':'', 'sentence':i, 'pos':dep[2], 'general_pos':general_pos+dep[2], 'tag':tag}
                    if 'nsubj' in dep[0] or 'nsubjpass' in dep[0]:
                        reference['syntax'] = 'np-subj'
                    elif 'nmod:poss' in dep[0] or 'compound' in dep[0]:
                        reference['syntax'] = 'subj-det'
                    else:
                        reference['syntax'] = 'np-obj'
                    break
            general_pos += len(snt['tokens'])
        return reference


    def process_template(self, template):
        '''
        Return the tokens before and after the first AGENT/PATIENT/BRIDGE tag in a template
        :param template: delexicalized template
        :return: (pre_tag, tag, pos_tag)
        '''
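        # Worked example: for the template 'AGENT-1 was born in PATIENT-1 .' this
        # returns pre_tag = [], tag = 'AGENT-1', pos_tag = ['was', 'born', 'in'].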
        stemplate = template.split()

        tag = ''
        pre_tag, pos_tag, i = [], [], 0
        for token in stemplate:
            i += 1
            if token.split('-')[0] in ['AGENT', 'PATIENT', 'BRIDGE']:
                tag = token
                for pos_token in stemplate[i:]:
                    if pos_token.split('-')[0] in ['AGENT', 'PATIENT', 'BRIDGE']:
                        break
                    else:
                        pos_tag.append(pos_token)
                break
            else:
                pre_tag.append(token)
        return pre_tag, tag, pos_tag


    def process_context(self, context, entity_map):
        '''
        Return the wikified context before and after the first tag
        :param context: delexicalized context
        :param entity_map: mapping from tags to entities
        :return: (pre_context, pos_context), lowercased
        '''
        scontext = context.split()
        pre_context, pos_context, i = [], [], 0
        for token in scontext:
            i += 1
            if token.split('-')[0] in ['AGENT', 'PATIENT', 'BRIDGE']:
                pos_context = scontext[i:]
                break
            else:
                pre_context.append(token)

        pre_context = ' '.join(['EOS'] + pre_context)
        pos_context = ' '.join(pos_context + ['EOS'])
        for tag, entity in entity_map.iteritems():
            # pre_context = pre_context.replace(tag, entity_map[tag])
            # pos_context = pos_context.replace(tag, entity_map[tag])
            pre_context = pre_context.replace(tag, '_'.join(entity_map[tag].replace('\"', '').replace('\'', '').lower().split()))
            pos_context = pos_context.replace(tag, '_'.join(entity_map[tag].replace('\"', '').replace('\'', '').lower().split()))

        return pre_context.lower(), pos_context.lower()


    def classify(self, references):
        '''
        Classify referring expressions by their information status and referential form
        :param references: list of reference dicts
        :return: references with text_status, sentence_status and reftype filled in
        '''
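        # Worked example of the form rules below: a refex of 'eos she eos' becomes
        # 'pronoun', 'eos the british monarch eos' becomes 'description',
        # 'eos this city eos' becomes 'demonstrative', and e.g. 'eos alan bean eos'
        # keeps the default 'name'.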
        references = sorted(references, key=lambda x: (x['entity'], x['sentence'], x['pos']))

        sentence_statuses = {}
        for i, reference in enumerate(references):
            # text status
            if i == 0 or (reference['entity'] != references[i-1]['entity']):
                reference['text_status'] = 'new'
            else:
                reference['text_status'] = 'given'

            if reference['sentence'] not in sentence_statuses:
                sentence_statuses[reference['sentence']] = []

            # sentence status
            if reference['entity'] not in sentence_statuses[reference['sentence']]:
                reference['sentence_status'] = 'new'
            else:
                reference['sentence_status'] = 'given'

            sentence_statuses[reference['sentence']].append(reference['entity'])

            # referential form
            reg = reference['refex'].replace('eos', '').strip()
            reference['reftype'] = 'name'
            if reg.lower().strip() in ['he', 'his', 'him', 'she', 'hers', 'her', 'it', 'its', 'we', 'our', 'ours', 'they', 'theirs', 'them']:
                reference['reftype'] = 'pronoun'
            elif reg.lower().strip().split()[0] in ['the', 'a', 'an']:
                reference['reftype'] = 'description'
            elif reg.lower().strip().split()[0] in ['this', 'these', 'that', 'those']:
                reference['reftype'] = 'demonstrative'

        return references


    def get_refexes(self, text, template, entity_map, entity_type):
        '''
        Extract the referring expression for each reference by aligning the text with the template
        :param text: original text
        :param template: template (delexicalized text)
        :param entity_map: mapping from tags to entities
        :param entity_type: mapping from entities to their types
        :return: references, input vocabulary, output vocabulary, character vocabulary
        '''
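        # Hedged illustration of the n-gram matching below: if the template contains
        # '... is situated in AGENT-1 .' and the text contains '... is situated in
        # the united kingdom .', the regex built from the tokens around AGENT-1
        # captures 'the united kingdom' as that tag's referring expression.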
        context = copy.copy(template)

        data, input_vocab, output_vocab, character_vocab = [], set(), set(), set()

        isOver = False
        while not isOver:
            pre_tag, tag, pos_tag = self.process_template(template)
            pre_context, pos_context = self.process_context(context, entity_map)

            if tag == '':
                isOver = True
            else:
                # Look for reference from 5-gram to 2-gram
                i, f = 5, []
                while i > 1:
                    begin = ' '.join(i * ['BEGIN'])
                    text = begin + ' ' + text
                    template = begin + ' ' + template
                    pre_tag, tag, pos_tag = self.process_template(template)

                    regex = re.escape(' '.join(pre_tag[-i:]).strip()) + ' (.+?) ' + re.escape(' '.join(pos_tag[:i]).strip())
                    f = re.findall(regex, text)

                    template = template.replace('BEGIN', '').strip()
                    text = text.replace('BEGIN', '').strip()
                    i -= 1

                    if len(f) == 1:
                        break

                if len(f) > 0:
                    # DO NOT LOWER CASE HERE!!!!!!
                    template = template.replace(tag, f[0], 1)
                    refex = f[0]

                    # Do not include literals
                    entity = entity_map[tag]
                    if entity_type[entity] == 'wiki':
                        normalized = '_'.join(entity.replace('\"', '').replace('\'', '').lower().split())
                        aux = context.replace(tag, 'ENTITY', 1)
                        reference = self.get_reference_info(aux, 'ENTITY')

                        character = ['eos'] + list(refex) + ['eos']
                        refex = ['eos'] + refex.split() + ['eos']
                        row = {
                            'pre_context':pre_context.replace('@', ''),
                            'pos_context':pos_context.replace('@', ''),
                            'entity':normalized,
                            'refex':' '.join(refex),
                            'size':len(entity_map.keys()),
                            'syntax':reference['syntax'],
                            'text_id':self.text_id,
                            'general_pos':reference['general_pos'],
                            'sentence':reference['sentence'],
                            'pos':reference['pos'],
                            'text':text
                        }
                        data.append(row)
                        output_vocab = output_vocab.union(set(refex))
                        character_vocab = character_vocab.union(set(character))
                        input_vocab = input_vocab.union(set(pre_context.split()))
                        input_vocab = input_vocab.union(set(pos_context.split()))
                        input_vocab = input_vocab.union(set([normalized]))

                        context = context.replace(tag, normalized, 1)
                    else:
                        context = context.replace(tag, '_'.join(entity_map[tag].replace('\"', '').replace('\'', '').lower().split()), 1)
                else:
                    template = template.replace(tag, ' ', 1)
                    context = context.replace(tag, '_'.join(entity_map[tag].replace('\"', '').replace('\'', '').lower().split()), 1)

        self.text_id += 1
        data = self.classify(data)
        return data, input_vocab, output_vocab, character_vocab
Example #42
0
def get_parallel(set, delex=True, size=10, evaluation=False):
    entries = Entry.objects(size__lte=size, set=set)
    proc = CoreNLP('ssplit')

    de, en, entity_maps = [], [], []
    for entry in entries:
        entity_map, predicates = utils.map_entities(entry.triples)
        entity2tag = utils.entity2tag(entity_map)

        source = ''
        for triple in entry.triples:
            agent = triple.agent.name
            tag_agent = entity2tag[agent]

            predicate = triple.predicate.name

            patient = triple.patient.name
            tag_patient = entity2tag[patient]

            if delex:
                source += tag_agent
            else:
                source += agent
            source += ' '
            source += predicate
            source += ' '
            if delex:
                source += tag_patient
            else:
                source += patient
            source += ' '
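            # With delex=True each triple contributes a string such as
            # 'AGENT-1 birthPlace PATIENT-1 ' to the source (tag names assumed to
            # come from entity2tag); with delex=False the entity names are used.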

            if not DELEX and set in ['train', 'dev'] and not evaluation:
                de.append(agent)
                name = ' '.join(
                    agent.replace('\'', '').replace('\"', '').split('_'))
                out = proc.parse_doc(name)
                text = ''
                for snt in out['sentences']:
                    text += ' '.join(snt['tokens']).replace('-LRB-',
                                                            '(').replace(
                                                                '-RRB-', ')')
                    text += ' '
                en.append(text.strip())

                de.append(patient)
                name = ' '.join(
                    patient.replace('\'', '').replace('\"', '').split('_'))
                out = proc.parse_doc(name)
                text = ''
                for snt in out['sentences']:
                    text += ' '.join(snt['tokens']).replace('-LRB-',
                                                            '(').replace(
                                                                '-RRB-', ')')
                    text += ' '
                en.append(text.strip())

        target_list = []
        for lexEntry in entry.texts:
            if delex and not evaluation:
                target = lexEntry.template
            else:
                target = lexEntry.text
            out = proc.parse_doc(target)

            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace(
                    '-RRB-', ')')
                text += ' '
            target = text.strip()
            target_list.append(target)

            print source
            print target
            print 10 * '-'
            if not evaluation:
                entity_maps.append(entity_map)
                de.append(source.strip())
                en.append(target)
        if evaluation:
            entity_maps.append(entity_map)
            de.append(source.strip())
            en.append(target_list)
        elif set == 'test':
            entity_maps.append(entity_map)
            de.append(source.strip())
    return de, en, entity_maps
Example #43
0
    f.write(line) # no need to add "\n" because the newline is already in the string
f.close()

#%% Tokenize  ################################################################

# sentence segmentation of each line (each element is a list of sentences)
# Separating by sentence is fine because bigrams that cross sentence boundaries, like (".", "I"), will be ignored
tokenized_lines = []     # each element is a list with tokens of a line
tokenized_sentences = [] # each element is a list with tokens of sentence

for line in text:
    # Now it's ready to parse documents. You give it a string and it returns JSON-safe data structures
    # dictionary key = 'sentences', value = list of sentences
    # each sentence dictionary with key='lemmas', 'tokens', etc
    # key = 'lemmas', value = list of lemmas 
    parsed = proc.parse_doc(line)["sentences"]
    sentences = [sentence["lemmas"] for sentence in parsed]
    
    # flatten nested list so each element is a token
    line_tokenized = [token for sentence in sentences for token in sentence]
    # add to list where each element is a tokenized line
    if line_tokenized != []:
        tokenized_lines.append(line_tokenized)
    
    # add to list where each element is a tokenized sentence
    for sentence in sentences:
        tokenized_sentences.append(sentence)

# save to file
len(tokenized_lines) # 182 lines
f = open(out_file_name_normalized_line, "w")
Example #44
0
import glob
import json
from stanford_corenlp_pywrapper import CoreNLP

proc = CoreNLP(
    "pos", corenlp_jars=["/Users/ahandler/stanford-corenlp-full-2015-04-20/*"])

for fn in glob.glob("demos_congress/*txt"):
    with open(fn, "r") as inf:
        procd = proc.parse_doc(inf.read())
        with open(fn.replace(".txt", ".anno"), "w") as outf:
            outf.write(json.dumps(procd))

for fn in glob.glob("demos/*txt"):
    with open(fn, "r") as inf:
        procd = proc.parse_doc(inf.read())
        with open(fn.replace(".txt", ".anno"), "w") as outf:
            outf.write(json.dumps(procd))

for fn in glob.glob("demos_wilkerson/*txt"):
    with open(fn, "r") as inf:
        procd = proc.parse_doc(inf.read())
        with open(fn.replace(".txt", ".anno"), "w") as outf:
            outf.write(json.dumps(procd))
Example #45
0
File: token.py  Project: rpongsaj/reflector
from stanford_corenlp_pywrapper import CoreNLP
from pprint import pprint
import glob

proc = CoreNLP("ssplit", corenlp_jars=["stanford/stanford-corenlp-full-2015-04-20/*"])
path = 'data/engelhard/0/'
for filename in glob.glob(path+'*.txt'):
  print(filename)
  with open(filename, 'rU') as f:
    engelhard = f.read()
    engelhard2 = engelhard.decode('utf8', 'ignore')
    print(engelhard2)
    a = proc.parse_doc(engelhard2)
  pprint(a['sentences'][0]['tokens'])
"""
Input is multiple text files.  Each text file represents one document.
Output is just as many text files, with the ".anno" extension instead.
Each output file consists of one JSON object.

USAGE
proc_text_files.py MODE  [files...]

e.g.
python proc_text_files.py pos *.txt
"""

import sys, re
mode = sys.argv[1]

from stanford_corenlp_pywrapper import CoreNLP
ss = CoreNLP(mode, corenlp_jars=["/Users/Doctor_Einstein/Documents/stockMartket/analysis/nlp/stanford/*"])

for filename in sys.argv[2:]:
    outfile = re.sub(r'\.txt$',"", filename) + ".anno"
    print>>sys.stderr, "%s  ->   %s" % (filename, outfile)

    text = open(filename).read().decode('utf8', 'replace')
    jdoc = ss.parse_doc(text, raw=True)
    with open(outfile, 'w') as fp:
        print>>fp, jdoc


Example #47
0
File: stparser.py  Project: dkubo/legalNLP
def main():
	proc = CoreNLP("pos", corenlp_jars=["/home/is/daiki-ku/opt/stanford-corenlp-full-2016-10-31/*"])
	proc.parse_doc("hello world. how are you?")
Example #48
0
     current_nonos += sum([1 for tok in cur_mapped if tok != 'O'])
     #print('Non-Os is now: ' + str(current_nonos))
     #print(cur_mapped)
     #print('\n\n\n')
     assert len(cur_mapped) == len(cur_pos) and len(cur_pos) == len(
         cur_parsed)
     for i in range(0, len(cur_mapped)):
         out_file.write(cur_parsed[i] + '\t' + cur_pos[i] + '\t' +
                        cur_mapped[i] + '\n')
     out_file.write('\n')
     in_annotations = False
     continue
 if not in_annotations:
     cur_line = line.replace('/', ' / ').replace('EECS', ' EECS ').replace(
         'eecs', ' eecs ').replace('  ', ' ')
     cur_parsed = proc.parse_doc(cur_line)
     cur_pttok, cur_postk = [], []
     for sent in cur_parsed['sentences']:
         cur_pttok.extend(sent['tokens'])
         cur_postk.extend(sent['pos'])
     cur_parsed = cur_pttok
     cur_pos = cur_postk
     #print(line)
     #print(cur_parsed)
     cur_mapped = ['O'] * len(cur_pttok)
     current_nonos = 0
     in_annotations = True
 else:
     # parse annotations
     anno = line[1:] if line.startswith('<') else line
     anno = anno[:-1] if line.endswith('>') else anno
Example #49
0
from stanford_corenlp_pywrapper import CoreNLP
from pprint import pprint
import glob

proc = CoreNLP("ssplit",
               corenlp_jars=["stanford/stanford-corenlp-full-2015-04-20/*"])
path = 'data/engelhard/0/'
for filename in glob.glob(path + '*.txt'):
    print(filename)
    with open(filename, 'rU') as f:
        engelhard = f.read()
        engelhard2 = engelhard.decode('utf8', 'ignore')
        print(engelhard2)
        a = proc.parse_doc(engelhard2)
    pprint(a['sentences'][0]['tokens'])