Example #1
    def __init__(self, in_train, in_dev):
        self.proc = CoreNLP('ssplit')
        self.parser = CoreNLP('parse')
        self.in_train = in_train
        self.in_dev = in_dev

        self.text_id = 0
        self.trainset()
        self.testset()
Example #2
    def __init__(self, _set='train', save_references=True):
        self._set = _set
        self.proc = CoreNLP('coref')

        self.proc_parse = CoreNLP('parse')

        self.e2f = utils.get_e2f('../data/lex.e2f')

        self.save_references = save_references
        # referring expressions per entity
        self.refexes = {}
Example #3
    def __init__(self, in_train, in_dev, out_vocab, out_train, out_dev, out_test):
        self.proc = CoreNLP('ssplit')
        self.parser = CoreNLP('parse')
        self.in_train = in_train
        self.in_dev = in_dev

        self.out_vocab = out_vocab
        self.out_train = out_train
        self.out_dev = out_dev
        self.out_test = out_test

        self.text_id = 0
        self.trainset()
        self.testset()
Example #4
    def __init__(self):
        self.proc = CoreNLP('parse')
        self.ner = json.load(open('../data/delexicalization/ner_dict.json'))
        self.semcategory = json.load(
            open('../data/delexicalization/delex_dict.json'))
        self.descriptions = json.load(
            open('../data/delexicalization/descriptions.json'))
Example #5
File: parser.py Project: GauravG8/snowball
def get_parser():
    corenlp = CoreNLP(
        configdict={'annotators': 'tokenize,ssplit,pos,lemma,ner'},
        output_types=['ssplit', 'ner'],
        corenlp_jars=[config.STANFORD_CORENLP_DIR])

    return corenlp
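A rough usage sketch for the parser returned above (the sentence is invented for illustration; the per-sentence keys follow the requested output_types):

corenlp = get_parser()
out = corenlp.parse_doc("Stanford University is located in California.")
for snt in out['sentences']:
    print(snt['tokens'])  # token strings for this sentence
    print(snt['ner'])     # one NER label per token, e.g. ORGANIZATION or O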
Example #6
def get_test_references():
    de, en = [], []
    proc = CoreNLP('ssplit')

    # Insert test references in training data
    entries = Entry.objects(set='test')
    for entry in entries:
        for triple in entry.triples:
            agent = triple.agent.name
            patient = triple.patient.name

            de.append(agent)
            name = ' '.join(
                agent.replace('\'', '').replace('\"', '').split('_'))
            out = proc.parse_doc(name)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace(
                    '-RRB-', ')')
                text += ' '
            en.append(text.strip())

            de.append(patient)
            name = ' '.join(
                patient.replace('\'', '').replace('\"', '').split('_'))
            out = proc.parse_doc(name)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace(
                    '-RRB-', ')')
                text += ' '
            en.append(text.strip())
    return de, en
Example #7
def main():
    print 'Initializing...'
    proc = CoreNLP("coref")
    verb2noun, noun2verb, verb2actor, actor2verb = utils.noun_verb(
        'data/morph-verbalization-v1.01.txt')
    sub2word = utils.subgraph_word('data/verbalization-list-v1.06.txt')
    aligner = Aligner(verb2noun, noun2verb, verb2actor, actor2verb, sub2word,
                      proc)

    corpora = ['LDC2015E86', 'LDC2016E25']
    dir = 'data/LDC2016E25/data/alignments/split'

    print 'Processing...'
    train_set, dev_set, test_set = [], [], []

    train, dev, test = run(dir, aligner)

    train_set.extend(train)
    dev_set.extend(dev)
    test_set.extend(test)

    print 'Writing...'
    write('data/alignments/training', train_set)
    write('data/alignments/dev', dev_set)
    write('data/alignments/test', test_set)
Example #8
    def lemmaMapper(itr):
        pipeline = CoreNLP(
            configdict={'annotators': "tokenize,ssplit,pos,lemma"},
            corenlp_jars=["./stanford-corenlp-full-2015-04-20/*"])
        return map(
            lambda tc: (tc[0], plainTextToLemmas(tc[1], stopWords, pipeline)),
            itr)
Example #9
    def __init__(self, analysisType):
        self.analysisType = analysisType

        coreNLPPath = os.path.join(os.path.dirname(__file__), '../../lib/stanfordCoreNLP.jar')
        coreNLPModelsPath = os.path.join(os.path.dirname(__file__), '../../lib/stanfordCoreNLPModels.jar')
        if StanfordCoreNLP.proc == None:
            StanfordCoreNLP.proc = CoreNLP(configdict={'annotators':'tokenize, ssplit, pos, lemma, ner, parse, dcoref'}, corenlp_jars=[coreNLPPath, coreNLPModelsPath])
Example #10
    def run(self, fin, fout):
        self.proc = CoreNLP('ssplit')

        entity_maps = p.load(open(os.path.join(fin, 'eval1.cPickle')))

        f = open(os.path.join(fin, 'eval1.bpe.de.output.postprocessed.dev'))
        texts = f.read().lower().split('\n')
        f.close()

        print len(texts), len(entity_maps)

        for i, text in enumerate(texts[:-1]):
            entity_map = entity_maps[i]
            for tag in entity_map:
                name = ' '.join(entity_map[tag].name.lower().replace('\'', '').replace('\"', '').split('_'))
                texts[i] = texts[i].replace(tag.lower(), str(name))

        f = open(fout, 'w')
        for text in texts:
            out = self.proc.parse_doc(text)['sentences']

            text = []
            for i, snt in enumerate(out):
                text.extend(snt['tokens'])
            text = ' '.join(text).replace('-LRB- ', '(').replace(' -RRB-', ')').strip()

            f.write(text.encode('utf-8'))
            f.write('\n')
        f.close()
Example #11
    def __init__(self, fdev, ftest):
        self.proc = CoreNLP('ssplit')

        self.get_results(fdev, ftest)

        # DEV
        dev_order, dev_gold = [], []
        DEV_DIR = u'../data/dev'
        for dir in os.listdir(DEV_DIR):
            if dir != u'.DS_Store':
                f = os.path.join(DEV_DIR, dir)
                for fname in os.listdir(f):
                    if fname != u'.DS_Store':
                        print os.path.join(f, fname)
                        _order, _gold = self.order(os.path.join(f, fname), u'dev')
                        dev_order.extend(_order)
                        dev_gold.extend(_gold)
        self.write_hyps(dev_order, fdev + '.ordered')

        utils.write_references('results/gold/dev.en', dev_gold)

        # TEST
        test_order, test_gold = [], []
        TEST_FILE = u'../data/test/triples/test.xml'
        _order, _gold = self.order(TEST_FILE, u'test')
        test_order.extend(_order)
        self.write_hyps(test_order, ftest + '.ordered')

        # save previous orders
        self.save_prev_order()
Example #12
    def __init__(self, homedir='./'):
        from stanford_corenlp_pywrapper import CoreNLP
        self.corenlp = CoreNLP(
            configdict={
                'annotators': 'tokenize, ssplit, pos, lemma, parse, ner'
            },
            output_types=['pos', 'lemma', 'parse', 'ner'],
            corenlp_jars=[homedir + "lib/stanford-corenlp-full-2015-04-20/*"])
Example #13
def start_corenlp():
    proc = CoreNLP("pos",
                   corenlp_jars=[
                       osp.join(this_dir,
                                "3rdparty/stanford-corenlp-full-2015-04-20/*")
                   ],
                   comm_mode='SOCKET')
    return proc
Example #14
    def __init__(self):
        proc = CoreNLP("coref")
        verb2noun, noun2verb, verb2actor, actor2verb = utils.noun_verb(
            'data/morph-verbalization-v1.01.txt')
        sub2word = utils.subgraph_word('data/verbalization-list-v1.06.txt')

        self.aligner = Aligner(verb2noun, noun2verb, verb2actor, actor2verb,
                               sub2word, proc)
Example #15
    def __init__(self, analysisType):
        self.analysisType = analysisType

#        print("ANALYSIS: " + str(analysisType))

        if StanfordCoreNLP.proc == None:
            StanfordCoreNLP.proc = CoreNLP(configdict={'annotators':'tokenize, ssplit, pos, lemma, ner, parse, sentiment, dcoref, relation, natlog, openie'},
            corenlp_jars=[os.path.join(os.path.dirname(__file__), '../../lib/*')]) #, comm_mode='PIPE')
Example #16
    def __parse_text(self):
        if exists_in_s3('{}/{}'.format(s3_output_prefix, self.outfilename)):
            self.__load_parse_result()
            return
        ss = CoreNLP(
            'parse',
            corenlp_jars=['~/software/stanford-corenlp-full-2015-12-09/*'])
        self.parsed = ss.parse_doc(self.sentences)
        ss.cleanup()
Example #17
    def __init__(self):
        # self.proc = CoreNLP(configdict={'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'}, corenlp_jars=["/usr/local/lib/stanford-corenlp-full-2015-12-09/*"])
        self.proc = CoreNLP(
            configdict={
                'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'
            },
            corenlp_jars=[
                "/CORENLPDIRECTORY/stanford-corenlp-full-2015-12-09/*",
                "/Users/akira/stanford-corenlp-full-2015-12-09/sutime"
            ])
Example #18
def entity_ner():
    '''
    Distribution of named-entity types assigned to the entities' referring expressions
    :return:
    '''
    def get_stats(dataset, setname):
        stats = []
        for text, refex in dataset:
            refex_tokens = refex.split()
            out = proc.parse_doc(text)

            tokens, ners = [], []
            for snt in out['sentences']:
                tokens.extend(snt['tokens'])
                ners.extend(snt['ner'])

            for i, token in enumerate(tokens):
                found = True
                if refex_tokens[0] == token:
                    for j, refex_token in enumerate(refex_tokens):
                        if refex_token != tokens[i + j]:
                            found = False
                            break

                    if found:
                        ner = ners[i]
                        stats.append(ner)
                        break

        print setname
        freq = dict(nltk.FreqDist(stats))
        total = sum(freq.values())
        for name, freq in freq.iteritems():
            print name, freq, float(freq) / total
        print 10 * '-'

    proc = CoreNLP('ner')

    train_data = p.load(open(TRAIN_REFEX_FILE))
    dev_data = p.load(open(DEV_REFEX_FILE))
    test_data = p.load(open(TEST_REFEX_FILE))

    train_refex = map(
        lambda x: (x['text'], x['refex'].replace('eos', '').strip()),
        train_data)
    dev_refex = map(
        lambda x: (x['text'], x['refex'].replace('eos', '').strip()), dev_data)
    test_refex = map(
        lambda x: (x['text'], x['refex'].replace('eos', '').strip()),
        test_data)

    get_stats(train_refex, 'TRAIN')
    get_stats(dev_refex, 'DEV')
    get_stats(test_refex, 'TEST')
Example #19
    def __init__(self):
        #self.server = ServerProxy(JsonRpc20(),
        #                         TransportTcpIp(addr=("127.0.0.1", 8080)))
        corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2017-06-09/*"

        self.server = CoreNLP(configdict={
            'annotators':
            'tokenize,ssplit,pos,depparse,lemma,ner',
            'depparse.model':
            'edu/stanford/nlp/models/parser/nndep/english_SD.gz'
        },
                              corenlp_jars=[corenlp_dir])
Example #20
    def __init__(self, fname, _set='train'):
        self.proc = CoreNLP('parse')
        self._set = _set

        f = open(fname)
        doc = f.read()
        f.close()

        doc = doc.split((50 * '*') + '\n')

        print 'Doc size: ', len(doc)

        for entry in doc:
            entry = entry.split('\n\n')

            _, entryId, size, semcategory = entry[0].replace('\n', '').split()

            entity_map = dict(
                map(lambda entity: entity.split(' | '),
                    entry[2].replace('\nENTITY MAP\n', '').split('\n')))

            lexEntries = entry[3].replace('\nLEX\n', '').split('\n-')[:-1]

            for lex in lexEntries:
                if lex[0] == '\n':
                    lex = lex[1:]
                lex = lex.split('\n')

                lexId = lex[0]
                text = lex[1].replace('TEXT: ', '').strip()
                template = lex[2].replace('TEMPLATE: ', '')
                correct = lex[3].replace('CORRECT: ', '').strip()
                comment = lex[4].replace('COMMENT: ', '').strip()

                if comment in ['g', 'good']:
                    print template
                    print 10 * '-'
                    self.update_template(entryId, size, semcategory, _set,
                                         lexId, template)
                    references = self.process_references(
                        text, template, entity_map)
                    self.save_references(references)
                elif correct != '' and comment != 'wrong':
                    print correct
                    print 10 * '-'
                    self.update_template(entryId, size, semcategory, _set,
                                         lexId, correct)
                    references = self.process_references(
                        text, correct, entity_map)
                    self.save_references(references)
Example #21
def CoreNLP_tokenizer():
    proc = CoreNLP(configdict={'annotators': 'tokenize,ssplit'},
                   corenlp_jars=[path.join(CoreNLP_path(), '*')])

    def tokenize_context(context):
        parsed = proc.parse_doc(context)
        tokens = []
        char_offsets = []
        for sentence in parsed['sentences']:
            tokens += sentence['tokens']
            char_offsets += sentence['char_offsets']

        return tokens, char_offsets

    return tokenize_context
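A brief usage sketch for the closure returned above (the context string and printout are illustrative assumptions, not part of the original project):

tokenize_context = CoreNLP_tokenizer()
tokens, char_offsets = tokenize_context("Barack Obama was born in Hawaii. He was elected in 2008.")
print(tokens)        # flat list of token strings across both sentences
print(char_offsets)  # [start, end] character offsets aligned with the tokens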
Example #22
def write_hyps(hyps, fname):
    proc = CoreNLP('ssplit')

    f = open(fname, 'w')
    for hyp in hyps:
        out = proc.parse_doc(hyp)
        text = ''
        for snt in out['sentences']:
            text += ' '.join(snt['tokens']).replace('-LRB-',
                                                    '(').replace('-RRB-', ')')
            text += ' '

        f.write(text.encode('utf-8'))
        f.write('\n')
    f.close()
Example #23
def lemmatize(l):
    result = []

    from stanford_corenlp_pywrapper import CoreNLP
    proc = CoreNLP("pos", corenlp_jars=["stanford-corenlp-full-2015-04-20/*"], UnicodeDecodeError='skip')

    for doc_words in l:
        single_dict = proc.parse_doc(doc_words)
        row = []
        for each_dict in single_dict['sentences']:
            for word in each_dict['lemmas']:
                row.append(word)
        result.append(row)

    return result
Example #24
def CoreNLP_tokenizer():
    proc = CoreNLP(configdict={'annotators': 'tokenize,ssplit'},
                   corenlp_jars=[path.join(CoreNLP_path, '*')])

    def tokenize_with_offset(context):
        parsed = proc.parse_doc(context)
        return [(sentence['tokens'], sentence['char_offsets'][0][0],
                 sentence['char_offsets'][-1][-1])
                for sentence in parsed['sentences']]

    def tokenize(sentence):
        parsed = proc.parse_doc(sentence)
        tokens = []
        for sentence in parsed['sentences']:
            tokens += sentence['tokens']
        return tokens

    return tokenize_with_offset, tokenize
Example #25
def split_and_tokenize(doc):
    '''
    Reads a text document, splits sentences and tokenizes them with the Python wrapper for Stanford CoreNLP.
    More info: https://github.com/brendano/stanford_corenlp_pywrapper
    :param doc: path to the
    :return:
    '''
    parse_mode = "ssplit"  # tokenization and sentence splitting
    coreNlpPath = "/Users/ana/workspace/stanford_corenlp_pywrapper/stanford-corenlp-full-2017-06-09/*"

    parser = CoreNLP(parse_mode, corenlp_jars=[coreNlpPath])

    json_name = "database.mpqa.2.0/docs/" + doc.split("\n")[0] + ".json"
    if not os.path.exists(json_name):
        doc_path = "database.mpqa.2.0/docs/" + doc.split("\n")[0]
        document = codecs.open(doc_path, "r", encoding="utf-8").read()
        data_source_parse = parser.parse_doc(document)

        with open(json_name, 'w') as fp:
            json.dump(data_source_parse, fp, sort_keys=True, indent=2)
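For orientation, the dumped JSON roughly has the shape sketched below under 'ssplit' mode; this is an assumption inferred from the other examples in this collection, not the wrapper's documented schema:

# data_source_parse is a dict with one entry per sentence, e.g.:
# {"sentences": [
#     {"tokens": ["The", "first", "sentence", "."],
#      "char_offsets": [[0, 3], [4, 9], [10, 18], [18, 19]]},
#     ...
# ]}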
Example #26
def main():
    if not os.path.exists(IN_FILE + '_rf'):
        print('First reformatting file...')
        out_format = open(IN_FILE + '_rf', 'w')
        with open(IN_FILE) as handle:
            for line in tqdm(handle):
                tline = line.strip()
                if tline == '':
                    out_format.write('\n')
                else:
                    out_format.write(tline + ' ')

    print('Sentence tokenizer!')
    print('Loading Stanford CoreNLP...')
    proc = CoreNLP(configdict={
        'annotators': 'tokenize,ssplit',
        'tokenize.options': 'ptb3Escaping=False'
    },
                   output_types=['tokenize,ssplit'],
                   corenlp_jars=[CORENLP_PATH])

    out_file = open(IN_FILE + '_sts', 'w')
    sentence_count = 0

    print('Opening file ' + IN_FILE + '_rf' + '...')
    with open(IN_FILE + '_rf') as handle:
        lines = handle.readlines()
        for line in tqdm(lines):
            the_text = line.strip()
            # Use Stanford instead
            parsed = proc.parse_doc(the_text)

            sentence_count += len(parsed['sentences'])
            for sent in parsed['sentences']:
                the_tokens = [i.replace(' ', '') for i in sent['tokens']]
                the_sent = ' '.join(the_tokens)
                assert len(the_sent.split(' ')) == len(sent['tokens'])
                out_file.write(the_sent.encode('utf-8') + '\n')
    print('Number of sentences so far: ' + '{:,}'.format(sentence_count))

    out_file.close()
Example #27
    # CoreNLP
    # coreNlpPath="/home/mihaylov/research/TAC2016/tac2016-kbp-event-nuggets/corenlp/stanford-corenlp-full-2015-12-09/*;/home/mihaylov/research/TAC2016/tac2016-kbp-event-nuggets/corenlp/stanford-srparser-2014-10-23-models.jar"
    # #coreNlpPath="/home/mihaylov/research/TAC2016/tac2016-kbp-event-nuggets/corenlp/stanford-corenlp-full-2015-12-09/*;"
    #
    # # #server
    # # coreNlpPath="/home/mitarb/mihaylov/research/TAC2016/tac2016-kbp-event-nuggets/corenlp/stanford-corenlp-full-2015-12-09/*"
    # # coreNlpPath="/home/mitarb/mihaylov/research/TAC2016/tac2016-kbp-event-nuggets/corenlp/stanford-corenlp-full-2015-12-09/*;"

    coreNlpPath = "/Users/mihaylov/research/libs/corenlp_executables/stanford-corenlp-full-2015-12-09/*"
    if len(sys.argv) > 3:
        coreNlpPath = sys.argv[3]

    print "coreNlpPath:%s" % coreNlpPath

    parse_mode = "pos"  # "pos", "parse"
    parser = CoreNLP(parse_mode, corenlp_jars=coreNlpPath.split(';'))

    print("Processing %s input files.." % len(input_files_in_dir))
    for fid, file_name in enumerate(input_files_in_dir):
        print "-" * 10
        print "--- " + file_name + " ---"
        print "-" * 10
        try:
            file_base_name = get_file_base_name(file_name)
            print("File %s of %s:%s" %
                  (fid + 1, len(input_files_in_dir), file_name))
            output_dir_file = os.path.join(output_dir,
                                           file_base_name + "_prep")

            if not os.path.exists(output_dir_file):
                os.makedirs(output_dir_file)
Example #28
import sys

from settings import *
from stanford_corenlp_pywrapper import CoreNLP

with open('EECS_annotated_samples_anonymized') as handle:
    lines = handle.readlines()
    lines = [line.strip() for line in lines]

proc = CoreNLP('pos', corenlp_jars=[PATH_TO_STANFORD_CORENLP])

out_file = open('crf-input-data', 'w')
cur_line, cur_parsed, cur_mapped, cur_pos = [None] * 4
current_nonos = 0
in_annotations = False
in_type = None

for line in lines:
    if line == '':
        current_nonos += sum([1 for tok in cur_mapped if tok != 'O'])
        #print('Non-Os is now: ' + str(current_nonos))
        #print(cur_mapped)
        #print('\n\n\n')
        assert len(cur_mapped) == len(cur_pos) and len(cur_pos) == len(
            cur_parsed)
        for i in range(0, len(cur_mapped)):
            out_file.write(cur_parsed[i] + '\t' + cur_pos[i] + '\t' +
                           cur_mapped[i] + '\n')
        out_file.write('\n')
        in_annotations = False
        continue
Example #29
# stopwords list; add "I" since Stanford CoreNLP does not lowercase "I", while the
# stopwords from nltk include only the lowercase "i"
stop = set(stopwords.words('english'))
stop.add("I")

# regex: only keep tokens that are alphanumeric, alphanumeric words joined by "-" or a
# space (e.g. keep "data-driven"), or runs of ? or !
# parenthesis tokens (-rrb-, -lrb-) should be ignored, so use match instead of search
# (match determines whether the RE matches at the beginning of the string)
pattern = re.compile(r'^(?:[A-Za-z0-9]+[- ][A-Za-z0-9]+|[A-Za-z0-9]+|[?!]+)$')
#pattern_parenthesis = re.compile("-rrb-|-lrb-")
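# A hedged illustration of how stop and pattern might be applied to CoreNLP tokens
# (the filtering loop itself is not part of this excerpt):
#   kept = [tok for tok in snt['tokens'] if pattern.match(tok) and tok not in stop]
# e.g. "data-driven" and "models" are kept, while "-lrb-", "I" and "the" are dropped.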

proc = CoreNLP(
    "pos",
    corenlp_jars=[
        "/Users/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"
    ])

# You can also specify the annotators directly. For example, say we want to
# parse but don't want lemmas. This can be done with the configdict option;
# output_types no longer needs to be specified (the outputs to include are inferred from the annotators setting).
p = CoreNLP(
    configdict={
        'annotators':
        'tokenize, ssplit, pos, parse, lemma, ner,entitymentions, dcoref'
    },
    #output_types=['pos','parse'],
    corenlp_jars=[
        "/Users/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"
    ])
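A minimal sketch of exercising the pipeline built above, assuming the jar path is valid; the input sentence is invented and the per-sentence keys are the ones used by other examples in this collection:

doc = p.parse_doc("Stanford CoreNLP annotates text. It runs as a Java pipeline.")
for snt in doc['sentences']:
    print(snt['tokens'])  # token strings
    print(snt['lemmas'])  # lemma per token
    print(snt['pos'])     # part-of-speech tag per token
    print(snt['ner'])     # named-entity label per token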
Example #30
# -*- coding: utf-8 -*-

import SocketServer

from stanford_corenlp_pywrapper import CoreNLP


class MyTCPHandler(SocketServer.BaseRequestHandler):
    def handle(self):
        self.data = self.request.recv(1024).strip()
        print self.data
        if not isinstance(self.data, unicode):
            document = unicode(self.data, 'utf-8')
        jdoc = ss.parse_doc(document, raw=True)
        self.request.sendall(jdoc)


if __name__ == "__main__":
    HOST, PORT = "localhost", 9998
    # Enter FULL path to folder containing extracted Stanford Core NLP
    ss = CoreNLP(configdict={'annotators': 'tokenize, ssplit, pos, parse'},
                 corenlp_jars=["stanford-corenlp-full-2015-01-29/*"])
    print "model loaded"
    server = SocketServer.TCPServer((HOST, PORT), MyTCPHandler)
    server.serve_forever()