Example #1
File: clean.py Project: nitin7/WordBreak
def main(arg):
    dir = os.path.dirname(__file__)
    filename = os.path.join(dir, 'stanford-corenlp-python/stanford-corenlp-full-2014-08-27/*')
    configFileLoc = os.path.join(dir, 'config.ini')
    proc = CoreNLP(configfile=configFileLoc, corenlp_jars=[filename])
    with open(arg, "r") as file:
        data = removeHeadings(file)
        parsed = proc.parse_doc(data)
        data = []
        for s in parsed[u'sentences']:
            sent = str(' '.join(s[u'tokens']))
            data.append(sent.translate(string.maketrans("",""), string.punctuation))

        data1 = ".".join(data)
        data1 = data1.replace("..",".")
        data1 = data1.replace("  "," ")
        data1 = data1.replace(" .",". ")
        data2 = " ".join(data)
        data2 = data2.replace("  "," ")
        file_train1 = open("data/a1_train1.txt", "w")
        file_train1.write(data1)
        file_train1.close()
        
        file_train2 = open("data/a1_train2.txt", "w")
        file_train2.write(data2)
        file_train2.close()
        
        file_test1 = open("data/a1_test1.txt", "w")
        file_test1.write(clean1(data1))
        file_test1.close()

        file_test2 = open("data/a1_test2.txt", "w")
        file_test2.write(clean(data2))
        file_test2.close()
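The string.maketrans/str.translate combination above is Python 2 specific. A minimal sketch of the same punctuation-stripping step in Python 3, should the example be ported (the sample sentence is made up):

# Python 3 sketch of the punctuation-stripping step used above (illustrative only).
import string

sent = "Hello , world !"
clean = sent.translate(str.maketrans('', '', string.punctuation))
print(clean)  # -> "Hello  world " (punctuation characters removed)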
Example #2
    def __parse_text(self):
        if exists_in_s3('{}/{}'.format(s3_output_prefix, self.outfilename)):
            self.__load_parse_result()
            return
        ss = CoreNLP('parse', corenlp_jars=['~/software/stanford-corenlp-full-2015-12-09/*'])
        self.parsed = ss.parse_doc(self.sentences)
        ss.cleanup()
Example #3
    def __init__(self):
        self.proc = CoreNLP('parse')
        self.ner = json.load(open('../data/delexicalization/ner_dict.json'))
        self.semcategory = json.load(
            open('../data/delexicalization/delex_dict.json'))
        self.descriptions = json.load(
            open('../data/delexicalization/descriptions.json'))
Example #4
def phrases():
    # STOPWORDS is the list of tokens we'd like to discard from the output
    stopwords = [".", "?", "!", ","]
    proc = CoreNLP("nerparse",corenlp_jars=[java])
    p=[]
    i=1
    print "####  Traitement et mise en forme des questions extraites  ####"
    with open(quest,'r') as inp:
        for line in inp:
            print "traitement de la ligne " + str(i)
            p.append(proc.parse_doc(line))
            i+=1
    with open('./output/phrases.txt','w') as outp:
        with open('./output/ressources1.txt','w') as outr:
            for elmt in p:
                for tok in elmt["sentences"][0]["lemmas"]:
                    if tok not in stopwords:
                        print tok
                        outr.write(tok.encode('utf-8') + '\n')
                        outr.write('\n')
                for tok in elmt["sentences"][0]["tokens"]:
                    if tok not in stopwords:
                        outp.write(tok.encode('utf-8') + '\n')
                        outp.write('\n')
Example #5
def get_test_references():
    de, en = [], []
    proc = CoreNLP('ssplit')

    # Insert test references in training data
    entries = Entry.objects(set='test')
    for entry in entries:
        for triple in entry.triples:
            agent = triple.agent.name
            patient = triple.patient.name

            de.append(agent)
            name = ' '.join(
                agent.replace('\'', '').replace('\"', '').split('_'))
            out = proc.parse_doc(name)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace(
                    '-RRB-', ')')
                text += ' '
            en.append(text.strip())

            de.append(patient)
            name = ' '.join(
                patient.replace('\'', '').replace('\"', '').split('_'))
            out = proc.parse_doc(name)
            text = ''
            for snt in out['sentences']:
                text += ' '.join(snt['tokens']).replace('-LRB-', '(').replace(
                    '-RRB-', ')')
                text += ' '
            en.append(text.strip())
    return de, en
Example #6
    def run(self, fin, fout):
        self.proc = CoreNLP('ssplit')

        entity_maps = p.load(open(os.path.join(fin, 'eval1.cPickle')))

        f = open(os.path.join(fin, 'eval1.bpe.de.output.postprocessed.dev'))
        texts = f.read().lower().split('\n')
        f.close()

        print len(texts), len(entity_maps)

        for i, text in enumerate(texts[:-1]):
            entity_map = entity_maps[i]
            for tag in entity_map:
                name = ' '.join(entity_map[tag].name.lower().replace('\'', '').replace('\"', '').split('_'))
                texts[i] = texts[i].replace(tag.lower(), str(name))

        f = open(fout, 'w')
        for text in texts:
            out = self.proc.parse_doc(text)['sentences']

            text = []
            for i, snt in enumerate(out):
                text.extend(snt['tokens'])
            text = ' '.join(text).replace('-LRB- ', '(').replace(' -RRB-', ')').strip()

            f.write(text.encode('utf-8'))
            f.write('\n')
        f.close()
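To make the tag-substitution loop above concrete, here is a tiny self-contained sketch of the same replacement using plain strings instead of the pickled entity maps (the tags and names below are made up for illustration):

# Toy version of the entity_map substitution above; values are illustrative only.
text = "agent-1 was born in patient-1 ."
entity_map = {"AGENT-1": "Albert_Einstein", "PATIENT-1": "Ulm"}
for tag in entity_map:
    name = " ".join(entity_map[tag].lower().replace("'", "").replace('"', "").split("_"))
    text = text.replace(tag.lower(), name)
print(text)  # -> albert einstein was born in ulm .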
Example #7
    def __init__(self, fdev, ftest):
        self.proc = CoreNLP('ssplit')

        self.get_results(fdev, ftest)

        # DEV
        dev_order, dev_gold = [], []
        DEV_DIR = u'../data/dev'
        for dir in os.listdir(DEV_DIR):
            if dir != u'.DS_Store':
                f = os.path.join(DEV_DIR, dir)
                for fname in os.listdir(f):
                    if fname != u'.DS_Store':
                        print os.path.join(f, fname)
                        _order, _gold = self.order(os.path.join(f, fname), u'dev')
                        dev_order.extend(_order)
                        dev_gold.extend(_gold)
        self.write_hyps(dev_order, fdev + '.ordered')

        utils.write_references('results/gold/dev.en', dev_gold)

        # TEST
        test_order, test_gold = [], []
        TEST_FILE = u'../data/test/triples/test.xml'
        _order, _gold = self.order(TEST_FILE, u'test')
        test_order.extend(_order)
        self.write_hyps(test_order, ftest + '.ordered')

        # save previous orders
        self.save_prev_order()
Example #8
    def __init__(self, homedir='./'):
        from stanford_corenlp_pywrapper import CoreNLP
        self.corenlp = CoreNLP(
            configdict={
                'annotators': 'tokenize, ssplit, pos, lemma, parse, ner'
            },
            output_types=['pos', 'lemma', 'parse', 'ner'],
            corenlp_jars=[homedir + "lib/stanford-corenlp-full-2015-04-20/*"])
Example #9
    def __init__(self, in_train, in_dev):
        self.proc = CoreNLP('ssplit')
        self.parser = CoreNLP('parse')
        self.in_train = in_train
        self.in_dev = in_dev

        self.text_id = 0
        self.trainset()
        self.testset()
Example #10
    def __parse_text(self):
        if exists_in_s3('{}/{}'.format(s3_output_prefix, self.outfilename)):
            self.__load_parse_result()
            return
        ss = CoreNLP(
            'parse',
            corenlp_jars=['~/software/stanford-corenlp-full-2015-12-09/*'])
        self.parsed = ss.parse_doc(self.sentences)
        ss.cleanup()
Example #11
    def __init__(self):
        # self.proc = CoreNLP(configdict={'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'}, corenlp_jars=["/usr/local/lib/stanford-corenlp-full-2015-12-09/*"])
        self.proc = CoreNLP(
            configdict={
                'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'
            },
            corenlp_jars=[
                "/CORENLPDIRECTORY/stanford-corenlp-full-2015-12-09/*",
                "/Users/akira/stanford-corenlp-full-2015-12-09/sutime"
            ])
Example #12
    def __init__(self, _set='train', save_references=True):
        self._set = _set
        self.proc = CoreNLP('coref')

        self.proc_parse = CoreNLP('parse')

        self.e2f = utils.get_e2f('../data/lex.e2f')

        self.save_references = save_references
        # referring expressions per entity
        self.refexes = {}
Example #13
    def __init__(self):
        #self.server = ServerProxy(JsonRpc20(),
        #                         TransportTcpIp(addr=("127.0.0.1", 8080)))
        corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2017-06-09/*"

        self.server = CoreNLP(configdict={
            'annotators':
            'tokenize,ssplit,pos,depparse,lemma,ner',
            'depparse.model':
            'edu/stanford/nlp/models/parser/nndep/english_SD.gz'
        },
                              corenlp_jars=[corenlp_dir])
Example #14
    def __init__(self, in_train, in_dev, out_vocab, out_train, out_dev, out_test):
        self.proc = CoreNLP('ssplit')
        self.parser = CoreNLP('parse')
        self.in_train = in_train
        self.in_dev = in_dev

        self.out_vocab = out_vocab
        self.out_train = out_train
        self.out_dev = out_dev
        self.out_test = out_test

        self.text_id = 0
        self.trainset()
        self.testset()
Example #15
    def __init__(self, fname, _set='train'):
        self.proc = CoreNLP('parse')
        self._set = _set

        f = open(fname)
        doc = f.read()
        f.close()

        doc = doc.split((50 * '*') + '\n')

        print 'Doc size: ', len(doc)

        for entry in doc:
            entry = entry.split('\n\n')

            _, entryId, size, semcategory = entry[0].replace('\n', '').split()

            entity_map = dict(
                map(lambda entity: entity.split(' | '),
                    entry[2].replace('\nENTITY MAP\n', '').split('\n')))

            lexEntries = entry[3].replace('\nLEX\n', '').split('\n-')[:-1]

            for lex in lexEntries:
                if lex[0] == '\n':
                    lex = lex[1:]
                lex = lex.split('\n')

                lexId = lex[0]
                text = lex[1].replace('TEXT: ', '').strip()
                template = lex[2].replace('TEMPLATE: ', '')
                correct = lex[3].replace('CORRECT: ', '').strip()
                comment = lex[4].replace('COMMENT: ', '').strip()

                if comment in ['g', 'good']:
                    print template
                    print 10 * '-'
                    self.update_template(entryId, size, semcategory, _set,
                                         lexId, template)
                    references = self.process_references(
                        text, template, entity_map)
                    self.save_references(references)
                elif correct != '' and comment != 'wrong':
                    print correct
                    print 10 * '-'
                    self.update_template(entryId, size, semcategory, _set,
                                         lexId, correct)
                    references = self.process_references(
                        text, correct, entity_map)
                    self.save_references(references)
Example #16
def lemmatize(l):
    result = []

    from stanford_corenlp_pywrapper import CoreNLP
    proc = CoreNLP("pos", corenlp_jars=["stanford-corenlp-full-2015-04-20/*"], UnicodeDecodeError='skip')

    for doc_words in l:
        single_dict = proc.parse_doc(doc_words)
        row = []
        for each_dict in single_dict['sentences']:
            for word in each_dict['lemmas']:
                row.append(word)
        result.append(row)

    return result
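A minimal illustrative call of lemmatize() (it assumes the CoreNLP distribution is unpacked next to the script, as in the jar path above; the exact lemmas depend on the loaded models):

# Hypothetical usage of lemmatize(); the output is roughly what "pos" mode produces.
docs = ["The cats were running.", "She has two children."]
print(lemmatize(docs))
# e.g. [[u'the', u'cat', u'be', u'run', u'.'], [u'she', u'have', u'two', u'child', u'.']]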
Example #17
def write_hyps(hyps, fname):
    proc = CoreNLP('ssplit')

    f = open(fname, 'w')
    for hyp in hyps:
        out = proc.parse_doc(hyp)
        text = ''
        for snt in out['sentences']:
            text += ' '.join(snt['tokens']).replace('-LRB-',
                                                    '(').replace('-RRB-', ')')
            text += ' '

        f.write(text.encode('utf-8'))
        f.write('\n')
    f.close()
Example #18
    def lemmaMapper(itr):
        pipeline = CoreNLP(
            configdict={'annotators': "tokenize,ssplit,pos,lemma"},
            corenlp_jars=["./stanford-corenlp-full-2015-04-20/*"])
        return map(
            lambda tc: (tc[0], plainTextToLemmas(tc[1], stopWords, pipeline)),
            itr)
Example #19
def main():
    print 'Initializing...'
    proc = CoreNLP("coref")
    verb2noun, noun2verb, verb2actor, actor2verb = utils.noun_verb(
        'data/morph-verbalization-v1.01.txt')
    sub2word = utils.subgraph_word('data/verbalization-list-v1.06.txt')
    aligner = Aligner(verb2noun, noun2verb, verb2actor, actor2verb, sub2word,
                      proc)

    corpora = ['LDC2015E86', 'LDC2016E25']
    dir = 'data/LDC2016E25/data/alignments/split'

    print 'Processing...'
    train_set, dev_set, test_set = [], [], []

    train, dev, test = run(dir, aligner)

    train_set.extend(train)
    dev_set.extend(dev)
    test_set.extend(test)

    print 'Writing...'
    write('data/alignments/training', train_set)
    write('data/alignments/dev', dev_set)
    write('data/alignments/test', test_set)
Example #20
File: parser.py Project: GauravG8/snowball
def get_parser():
    corenlp = CoreNLP(
        configdict={'annotators': 'tokenize,ssplit,pos,lemma,ner'},
        output_types=['ssplit', 'ner'],
        corenlp_jars=[config.STANFORD_CORENLP_DIR])

    return corenlp
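A short usage sketch for the parser returned above (it assumes config.STANFORD_CORENLP_DIR points at an unpacked CoreNLP distribution; the sentence and printed tags are illustrative):

# Hypothetical usage of get_parser(); 'ner' is a list parallel to 'tokens' in each sentence.
corenlp = get_parser()
out = corenlp.parse_doc("Barack Obama was born in Hawaii.")
for sent in out['sentences']:
    for token, tag in zip(sent['tokens'], sent['ner']):
        print('%s\t%s' % (token, tag))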
Example #21
    def __init__(self, analysisType):
        self.analysisType = analysisType

        coreNLPPath = os.path.join(os.path.dirname(__file__), '../../lib/stanfordCoreNLP.jar')
        coreNLPModelsPath = os.path.join(os.path.dirname(__file__), '../../lib/stanfordCoreNLPModels.jar')
        if StanfordCoreNLP.proc == None:
            StanfordCoreNLP.proc = CoreNLP(configdict={'annotators':'tokenize, ssplit, pos, lemma, ner, parse, dcoref'}, corenlp_jars=[coreNLPPath, coreNLPModelsPath])
Example #22
def start_corenlp():
    proc = CoreNLP("pos",
                   corenlp_jars=[
                       osp.join(this_dir,
                                "3rdparty/stanford-corenlp-full-2015-04-20/*")
                   ],
                   comm_mode='SOCKET')
    return proc
Example #23
    def __init__(self, analysisType):
        self.analysisType = analysisType

#        print("ANALYSIS: " + str(analysisType))

        if StanfordCoreNLP.proc == None:
            StanfordCoreNLP.proc = CoreNLP(configdict={'annotators':'tokenize, ssplit, pos, lemma, ner, parse, sentiment, dcoref, relation, natlog, openie'},
            corenlp_jars=[os.path.join(os.path.dirname(__file__), '../../lib/*')]) #, comm_mode='PIPE')
Example #24
    def __init__(self):
        proc = CoreNLP("coref")
        verb2noun, noun2verb, verb2actor, actor2verb = utils.noun_verb(
            'data/morph-verbalization-v1.01.txt')
        sub2word = utils.subgraph_word('data/verbalization-list-v1.06.txt')

        self.aligner = Aligner(verb2noun, noun2verb, verb2actor, actor2verb,
                               sub2word, proc)
Example #25
class StanfordNLP:
    def __init__(self):
        #self.server = ServerProxy(JsonRpc20(),
        #                         TransportTcpIp(addr=("127.0.0.1", 8080)))
        corenlp_dir="/usr/local/lib/stanford-corenlp-full-2017-06-09/*"

        self.server = CoreNLP(configdict={'annotators': 'tokenize,ssplit,pos,depparse,lemma,ner','depparse.model':'edu/stanford/nlp/models/parser/nndep/english_SD.gz'}, corenlp_jars=[corenlp_dir])

    def parse(self, text):
        return self.server.parse_doc(text)
Example #26
File: ner.py Project: edvisees/EDL2015
    def start_server(self):
        self.corenlp = CoreNLP(
            corenlp_jars=[
                os.path.join(self.config["CORENLP_HOME"], self.config[self.lang]["corenlp_jar"]),
                os.path.join(self.config["CORENLP_HOME"], self.config[self.lang]["corenlp_models_jar"]),
            ],
            server_port=self.config[self.lang]["port"],
            configdict=self.config[self.lang]["properties"],
        )
        print "Serving on http://%s:%s" % ("localhost", self.config[self.lang]["port"])
Example #27
def entity_ner():
    '''
    Named entity types of the entities
    :return:
    '''
    def get_stats(dataset, setname):
        stats = []
        for text, refex in dataset:
            refex_tokens = refex.split()
            out = proc.parse_doc(text)

            tokens, ners = [], []
            for snt in out['sentences']:
                tokens.extend(snt['tokens'])
                ners.extend(snt['ner'])

            for i, token in enumerate(tokens):
                found = True
                if refex_tokens[0] == token:
                    for j, refex_token in enumerate(refex_tokens):
                        if refex_token != tokens[i + j]:
                            found = False
                            break

                    if found:
                        ner = ners[i]
                        stats.append(ner)
                        break

        print setname
        freq = dict(nltk.FreqDist(stats))
        total = sum(freq.values())
        for name, freq in freq.iteritems():
            print name, freq, float(freq) / total
        print 10 * '-'

    proc = CoreNLP('ner')

    train_data = p.load(open(TRAIN_REFEX_FILE))
    dev_data = p.load(open(DEV_REFEX_FILE))
    test_data = p.load(open(TEST_REFEX_FILE))

    train_refex = map(
        lambda x: (x['text'], x['refex'].replace('eos', '').strip()),
        train_data)
    dev_refex = map(
        lambda x: (x['text'], x['refex'].replace('eos', '').strip()), dev_data)
    test_refex = map(
        lambda x: (x['text'], x['refex'].replace('eos', '').strip()),
        test_data)

    get_stats(train_refex, 'TRAIN')
    get_stats(dev_refex, 'DEV')
    get_stats(test_refex, 'TEST')
Example #28
def split_and_tokenize(doc):
    '''
    Reads a text document, splits sentences and tokenizes them with the Python wrapper of the Stanford CoreNLP.
    More info: https://github.com/brendano/stanford_corenlp_pywrapper
    :param doc: name of the document to process (resolved under database.mpqa.2.0/docs/)
    :return:
    '''
    parse_mode = "ssplit"  # tokenization and sentence splitting
    coreNlpPath = "/Users/ana/workspace/stanford_corenlp_pywrapper/stanford-corenlp-full-2017-06-09/*"

    parser = CoreNLP(parse_mode, corenlp_jars=[coreNlpPath])

    json_name = "database.mpqa.2.0/docs/" + doc.split("\n")[0] + ".json"
    if not os.path.exists(json_name):
        doc_path = "database.mpqa.2.0/docs/" + doc.split("\n")[0]
        document = codecs.open(doc_path, "r", encoding="utf-8").read()
        data_source_parse = parser.parse_doc(document)

        with open(json_name, 'w') as fp:
            json.dump(data_source_parse, fp, sort_keys=True, indent=2)
Example #29
def main():
    if not os.path.exists(IN_FILE + '_rf'):
        print('First reformatting file...')
        out_format = open(IN_FILE + '_rf', 'w')
        with open(IN_FILE) as handle:
            for line in tqdm(handle):
                tline = line.strip()
                if tline == '':
                    out_format.write('\n')
                else:
                    out_format.write(tline + ' ')
        out_format.close()

    print('Sentence tokenizer!')
    print('Loading Stanford CoreNLP...')
    proc = CoreNLP(configdict={
        'annotators': 'tokenize,ssplit',
        'tokenize.options': 'ptb3Escaping=False'
    },
                   output_types=['tokenize,ssplit'],
                   corenlp_jars=[CORENLP_PATH])

    out_file = open(IN_FILE + '_sts', 'w')
    sentence_count = 0

    print('Opening file ' + IN_FILE + '_rf' + '...')
    with open(IN_FILE + '_rf') as handle:
        lines = handle.readlines()
        for line in tqdm(lines):
            the_text = line.strip()
            # Use Stanford instead
            parsed = proc.parse_doc(the_text)

            sentence_count += len(parsed['sentences'])
            for sent in parsed['sentences']:
                the_tokens = [i.replace(' ', '') for i in sent['tokens']]
                the_sent = ' '.join(the_tokens)
                assert len(the_sent.split(' ')) == len(sent['tokens'])
                out_file.write(the_sent.encode('utf-8') + '\n')
    print('Number of sentences so far: ' + '{:,}'.format(sentence_count))

    out_file.close()
Example #30
class StanfordPreprocessor(object):
    def __init__(self, homedir='./'):
        from stanford_corenlp_pywrapper import CoreNLP
        self.corenlp = CoreNLP(
            configdict={
                'annotators': 'tokenize, ssplit, pos, lemma, parse, ner'
            },
            output_types=['pos', 'lemma', 'parse', 'ner'],
            corenlp_jars=[homedir + "lib/stanford-corenlp-full-2015-04-20/*"])

    def parse(self, document):
        return self.corenlp.parse_doc(document)
Example #31
class SentenceDelimiter():
    def __init__(self, corenlp_path):
        self.proc = CoreNLP("ssplit", corenlp_jars=[os.path.join(corenlp_path, '*')])

    def get_sentences(self, text):
        res = self.proc.parse_doc(text)
        for sentence in res['sentences']:
            sentence_text = ' '.join(sentence['tokens']).encode('utf8')
            sentence_text = ' '.join(sentence_text.split())
            sentence_text = sentence_text.replace('-LRB-', '(').replace('-RRB-', ')')
            sentence_text = sentence_text.replace('-LSB-', '[').replace('-RSB-', ']')
            sentence_text = sentence_text.replace('-LCB-', '{').replace('-RCB-', '}')
            yield escape(sentence_text)
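A hedged usage sketch of SentenceDelimiter (the path below is a placeholder for a local CoreNLP distribution; the output spacing follows the token join in get_sentences):

# Hypothetical usage; get_sentences yields one escaped, tokenized sentence per line.
delimiter = SentenceDelimiter("/opt/stanford-corenlp-full-2015-04-20")
for sentence in delimiter.get_sentences("Dr. Smith arrived. He sat down (quietly)."):
    print(sentence)
# e.g. Dr. Smith arrived .
#      He sat down ( quietly ) .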
Example #32
def CoreNLP_tokenizer():
    proc = CoreNLP(configdict={'annotators': 'tokenize,ssplit'},
                   corenlp_jars=[path.join(CoreNLP_path(), '*')])

    def tokenize_context(context):
        parsed = proc.parse_doc(context)
        tokens = []
        char_offsets = []
        for sentence in parsed['sentences']:
            tokens += sentence['tokens']
            char_offsets += sentence['char_offsets']

        return tokens, char_offsets

    return tokenize_context
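A quick illustrative call of the closure returned above (CoreNLP_path() is assumed to resolve to the directory holding the CoreNLP jars):

# Hypothetical usage; tokens and char_offsets are parallel lists across all sentences.
tokenize_context = CoreNLP_tokenizer()
tokens, char_offsets = tokenize_context("Hello world. How are you?")
print(tokens)        # e.g. [u'Hello', u'world', u'.', u'How', u'are', u'you', u'?']
print(char_offsets)  # one [begin, end] character span per token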
Example #33
class BioNLPEnrichment(BaseEnrichment):
    """
    """
    def __init__(self):
        """
        Load and initialize any external models or data here
        """
        self.corenlp = CoreNLP("pos", corenlp_jars=["./enrichments/stanford-corenlp-full-2015-12-09/*"])
    def enrichment_value(self,tweet):
        """ Calculate enrichment value """
        rep = self.corenlp.parse_doc(tweet["actor"]["summary"])
        return rep

    def __repr__(self):
        """ Add a description of the class's function here """
        return("Stanford core NLP applied to user bio")
Example #34
File: ner.py Project: Marsan-Ma/tnative
  def __init__(self, lang='en', en_ner=False):
    # feature parameters
    self.lang = lang

    # [NLTK wrapper for Stanford CoreNLP] (too slow, results so-so)
    if en_ner == 'nltk':
      self.entity_cols = ['PERSON', 'ORG', 'LOCATION', 'FACILITY', 'GPE']
      self.sner_root = '/home/marsan/workspace/stanford_nlp/stanford-ner-2015-04-20'
      self.sner_classifier = self.sner_root+'/classifiers/english.all.3class.distsim.crf.ser.gz'
      self.sner_main = self.sner_root+'/stanford-ner.jar'
      self.st = NERTagger(self.sner_classifier, self.sner_main, encoding='utf-8')

    # [Stanford CoreNLP pywrapper] (still slow, results too noisy)
    if en_ner == 'corenlp':
      self.entity_cols = ['LOCATION', 'TIME', 'PERSON', 'ORGANIZATION', 'MONEY', 'PERCENT', 'DATE']
      self.snlp = CoreNLP("ner", corenlp_jars=["%s/stanford-corenlp-full-2015-04-20/*" % snlp_path])
Example #35
class kawata_corenlp_handler:
    def __init__(self):
        # self.proc = CoreNLP(configdict={'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'}, corenlp_jars=["/usr/local/lib/stanford-corenlp-full-2015-12-09/*"])
        self.proc = CoreNLP(
            configdict={
                'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse'
            },
            corenlp_jars=[
                "/CORENLPDIRECTORY/stanford-corenlp-full-2015-12-09/*",
                "/Users/akira/stanford-corenlp-full-2015-12-09/sutime"
            ])

    def __join_text_date(self, text, date):
        '''
        Join text and date.
        '''
        date_s = dt.date2str(date)
        return '[<date>{0}</date>]\n{1}'.format(date_s, text)

    def get_words(self, text, date):
        n_text = unidecode.unidecode(text)
        joint_text = self.__join_text_date(n_text, date)
        joint_text = n_text

        p = self.proc.parse_doc(joint_text)["sentences"][0]
        # print p
        words = list()
        words = zip(p["ner"], p["tokens"], p["ner"])
        stop = stopwords.words("english")
        words = filter(lambda x: x[1] not in stop, words)
        words = map(lambda x: (x[0], x[1].lower(), x[2]), words)
        # I am not sure which of these forms is most suitable in the line above.
        ws = list()
        w = ("", "", "")
        for v in words:
            if v[0] != 'O' and v[0] == w[0]:
                w = (w[0], w[1] + " " + v[1], w[2])
            else:
                ws.append(w)
                w = v
        if w[0] != "":
            ws.append(w)
        words = ws

        return words[1:]
Example #36
def CoreNLP_tokenizer():
    proc = CoreNLP(configdict={'annotators': 'tokenize,ssplit'},
                   corenlp_jars=[path.join(CoreNLP_path, '*')])

    def tokenize_with_offset(context):
        parsed = proc.parse_doc(context)
        return [(sentence['tokens'], sentence['char_offsets'][0][0],
                 sentence['char_offsets'][-1][-1])
                for sentence in parsed['sentences']]

    def tokenize(sentence):
        parsed = proc.parse_doc(sentence)
        tokens = []
        for sentence in parsed['sentences']:
            tokens += sentence['tokens']
        return tokens

    return tokenize_with_offset, tokenize
Example #37
class BodyNLPEnrichment(BaseEnrichment):
    """
    """

    def __init__(self):
        """
        Load and initialize any external models or data here
        """
        self.corenlp = CoreNLP("pos", corenlp_jars=["/home/jkolb/stanford-corenlp-full-2015-04-20/*"])

    def enrichment_value(self, tweet):
        """ Calculate enrichment value """
        rep = self.corenlp.parse_doc(tweet["body"])
        return rep

    def __repr__(self):
        """ Add a description of the class's function here """
        return "Stanford core NLP applied to tweet body"
Example #38
class StanfordParser(object):
    """
    Stanford parser
    """
    def __init__(self, corenlp_jars):
        self.proc = CoreNLP("parse", corenlp_jars=corenlp_jars)

    def parse(self, text):
        # {u'sentences':
        #     [
        #         {u'parse': u'(ROOT (S (VP (NP (INTJ (UH hello)) (NP (NN world)))) (. !)))'
        #          u'tokens': [u'hello', u'world', u'.'],
        #          u'lemmas': [u'hello', u'world', u'.'],
        #          u'pos': [u'UH', u'NN', u'.'],
        #          u'char_offsets': [[0, 5], [6, 11], [11, 12]]
        #          },
        #         ...
        #     ]
        # }
        json_rst = self.proc.parse_doc(text)
        if json_rst:
            for sent in json_rst['sentences']:
                parse_tree= sent['parse']
                yield parse_tree
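A usage sketch based on the output format documented in the comment above (the jar path is a placeholder for a local CoreNLP distribution):

# Illustrative only: parse() yields one constituency tree string per sentence.
parser = StanfordParser(corenlp_jars=["/path/to/stanford-corenlp-full-2015-04-20/*"])
for tree in parser.parse("hello world!"):
    print(tree)  # e.g. (ROOT (S (VP (NP (INTJ (UH hello)) (NP (NN world)))) (. !)))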
Example #39
File: hw3.py Project: linshiu/courses
# key: name, value = index number
map_name_index = {name: index for (name,index) in zip(doc_names, range(len(doc_names)))}

# create dictionary key: doc name, value =docment
doc_dic = {name: doc for (name,doc) in zip(doc_names,documents)}
    
# save documents
for name in doc_dic:
    f = open(os.path.join(out_file_bios_folder,name + ".txt"), "w")
    f.write(doc_dic[name])
    f.close()

#%% Text Processing ##########################################################

proc = CoreNLP("pos", corenlp_jars=["/Users/Steven/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])

# You can also specify the annotators directly. For example, say we want to 
# parse but don't want lemmas. This can be done with the configdict option:
p = CoreNLP(configdict={'annotators':'tokenize, ssplit, pos, parse'}, 
            output_types=['pos','parse'],
            corenlp_jars=["/Users/Steven/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])

doc_dic_normalized = {} # key: document name, value = list of lemmas 
# note: remove stopwords, punctuation, numbers, websites, -lrb-, -rrb-

# this pattern is only going to match two cases of words: data, data-driven
# so ignores punctuation, numbers, parenthesis -rrb-, -lrb-, special characters
# ignore so use match instead of search
# match: Determine if the RE matches at the beginning of the string.
# ^ = beginning of string, $ = end of string so https://www.coursera.org is ignored
Example #40
from __future__ import print_function

from stanford_corenlp_pywrapper import CoreNLP
from nltk import *
import os

proc = CoreNLP("parse", corenlp_jars=["stanford/stanford-corenlp-full-2015-04-20/*"])

#correct subdirectory by coded type goes here
#comment all this to do a single text file instead of a directory
path = 'data/engelhard/A/'
for filename in os.listdir(path):
  print(filename)
  with open(path+filename, 'rU') as f:
    engelhard = f.read()
    engelhard2 = engelhard.decode('utf8', 'ignore')
    trees = proc.parse_doc(engelhard2)
    print(engelhard2)

  #this is set as parse (pos, lemmas, trees, dependencies) but you can also change it to different options, like:
  #ssplit for tokenization and sentence splitting
  #pos for pos and lemmas
  #ner for pos and ner and lemmas
  #parse for pos, lemmas, trees, dependencies
  #nerparse for parsing with ner, pos, lemmas, dependencies
  #coref for coreference including constituent parsing

  #comment this to do coref
  trees = proc.parse_doc(engelhard2)
  #print(trees)
# Note: Stanford NLP renders parentheses as -rrb- and -lrb-

# stopwords list: add "I" since Stanford NLP does not lowercase I, but the stopwords
# list from nltk only includes "i"
stop = set(stopwords.words('english'))
stop.add("I")

# regex: only keep words composed of alphanumeric characters or alphanumeric or ! or ?
# words joined by "-" (e.g. keep data-driven)
# ignore parenthesis -rrb-, -lrb- so use match instead of search
# match: Determine if the RE matches at the beginning of the string.
pattern = re.compile(r'^(?:[A-Za-z0-9]+[- ][A-Za-z0-9]+|[A-Za-z0-9]+|[?!]+)$')
#pattern_parenthesis = re.compile("-rrb-|-lrb-")


proc = CoreNLP("pos", corenlp_jars=["/Users/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])

# You can also specify the annotators directly with the configdict option.
# There is no longer a need to specify output_types (the outputs to include
# are inferred from the annotators setting).
p = CoreNLP(configdict={'annotators':'tokenize, ssplit, pos, parse, lemma, ner,entitymentions, dcoref'}, 
            #output_types=['pos','parse'],
            corenlp_jars=["/Users/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])
            
            
data_lemmas = copy.deepcopy(data_names) # deep copy otherwise change data_clean since list of objects

# lemmatize quotes and description
for row in data_lemmas:
    # Now it's ready to parse documents. You give it a string and it returns JSON-safe data structures
    # dictionary key = 'sentences', value = list of sentences
Example #42
    def __init__(self):
        """
        Load and initialize any external models or data here
        """
        self.corenlp = CoreNLP("pos", corenlp_jars=["/home/jkolb/stanford-corenlp-full-2015-04-20/*"])
Example #43
    def __init__(self):
        global CACHEDIR
        CoreNLP.__init__(self, "parse", corenlp_jars=[CORENLP_JARS_DIR + "*"])
Example #44
File: ner.py Project: Marsan-Ma/tnative
class ner(object):
  def __init__(self, lang='en', en_ner=False):
    # feature parameters
    self.lang = lang

    # [NLTK wrapper for Stanford CoreNLP] (too slow, results so-so)
    if en_ner == 'nltk':
      self.entity_cols = ['PERSON', 'ORG', 'LOCATION', 'FACILITY', 'GPE']
      self.sner_root = '/home/marsan/workspace/stanford_nlp/stanford-ner-2015-04-20'
      self.sner_classifier = self.sner_root+'/classifiers/english.all.3class.distsim.crf.ser.gz'
      self.sner_main = self.sner_root+'/stanford-ner.jar'
      self.st = NERTagger(self.sner_classifier, self.sner_main, encoding='utf-8')

    # [Stanford CoreNLP pywrapper] (still slow, results too noisy)
    if en_ner == 'corenlp':
      self.entity_cols = ['LOCATION', 'TIME', 'PERSON', 'ORGANIZATION', 'MONEY', 'PERCENT', 'DATE']
      self.snlp = CoreNLP("ner", corenlp_jars=["%s/stanford-corenlp-full-2015-04-20/*" % snlp_path])


  #===========================================
  #   Stanford CoreNLP pywrapper
  #===========================================
  def get_ner_stanford_corenlp(self, txt):
    tree = self.snlp.parse_doc(txt.upper())
    ners = {n: [] for n in self.entity_cols}
    results = [list(zip(r['ner'], r['tokens'])) for r in tree['sentences']]
    results = [(k[0], k[1].lower()) for v in results for k in v if k[0] in self.entity_cols]
    ners = {k: [] for k in self.entity_cols}
    for k,v in results: ners[k].append(v)
    ners = {k: list(set(v)) for k,v in ners.items()}
    return ners

  # #===========================================
  # #   Stanford CoreNLP (slow but better)
  # #===========================================
  def get_ner_tags(self, text):
    ners = {}
    terms = [(k,v) for k,v in self.st.tag(text.split()) if v != 'O']
    for t in self.entity_cols:
      ners[t] = list(set([re.sub('[^0-9a-zA-Z]+', ' ', k.lower()) for k,v in terms if v == t]))
    return ners

  #===========================================
  #   NLTK NER (very bad accuracy, a lot of garbage)
  #===========================================
  def get_ner_nltk(self, text):
    sents = nltk.sent_tokenize(text)  # sentences
    tokenized_sents = [nltk.word_tokenize(s) for s in sents]
    tagged_sents = [nltk.pos_tag(s) for s in tokenized_sents]
    chunked_sents = [x for x in nltk.ne_chunk_sents(tagged_sents)]
    raw = self.traverseTree(chunked_sents)
    ners = {}
    for n in self.entity_cols: ners[n] = []
    [ners[k].append(v.lower()) for k,v in raw]
    for n in self.entity_cols: ners[n] = list(set(ners[n]))
    return ners

  def traverseTree(self, tree):
    result = []
    for subtree in tree:
      if type(subtree) == nltk.tree.Tree:
        if subtree.label() in self.entity_cols:
          result += [(subtree.label(), subtree[0][0])]
        else:
          result += (self.traverseTree(subtree))
    return result
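A hedged usage sketch of the CoreNLP branch of this class (snlp_path must point at the directory holding the CoreNLP distribution; note that get_ner_stanford_corenlp upper-cases the input before tagging, so the entities actually found may differ from this illustration):

# Hypothetical usage; en_ner='corenlp' selects the Stanford CoreNLP pywrapper backend.
tagger = ner(lang='en', en_ner='corenlp')
entities = tagger.get_ner_stanford_corenlp("Barack Obama visited Paris in July.")
print(entities)
# e.g. PERSON -> ['barack', 'obama'], LOCATION -> ['paris'], DATE -> ['july'];
# the remaining entity columns map to empty lists.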
Example #45
		elif char == ')' or char == ']':
			right += 1
			continue
		if left == right:
			outputList.append(char)
	output = ''.join(outputList)
	return output


PRPList = ["He", "he", "She", "she", "His", "his", "Her", "him", "her", "him,", "him.", "her,", "her."]
monthElement = "january|february|march|april|may|june|july|august|september|october|november|december"
dateElement = "1|2|3|4|5|6|7|8|9|0"
monthPattern = re.compile(monthElement, re.IGNORECASE)
datePattern = re.compile(dateElement, re.IGNORECASE)

procCOR = CoreNLP("coref", corenlp_jars=[jar_path])
readFile = (open(file_path)).read()
filteredFile = bracketProcess(readFile)
dictCOR = procCOR.parse_doc(filteredFile)
entitiesCOR = dictCOR['entities']
sentencesCOR = dictCOR['sentences']


replaceList = []
for i in entitiesCOR:
	mentionList = i['mentions']
	if not len(mentionList) == 1:
		catchList = []
		for j in mentionList:
			item = [j['sentence']]
			item.append(j['tokspan_in_sentence'])
Example #46
File: stparser.py Project: dkubo/legalNLP
def main():
	proc = CoreNLP("pos", corenlp_jars=["/home/is/daiki-ku/opt/stanford-corenlp-full-2016-10-31/*"])
	proc.parse_doc("hello world. how are you?")
Example #47
 TODO: Implement different parameters for the phrase extraction from sentences.
Actual parameters are "He", the usual NER from Stanford CoreNLP and the unigram model without stopwords """
# import re
import numpy as np
from stanford_corenlp_pywrapper import CoreNLP

#Loading the Stanford CoreNLP Lib

data = "./extracted-quest/quest-en.txt"
loc= "/people/panou/Stage/projet/stanford-corenlp-full-2015-04-20/*"


# STOPWORDS is the list of tokens we'd like to discard from the output
stopwords = [".", "?", "!", ","]

proc = CoreNLP("nerparse",corenlp_jars=[loc])
p=[]
i=1
with open(data,'r') as inp:
    for line in inp:
        print "traitement de la ligne " + str(i)
        p.append(proc.parse_doc(line))
        i+=1

with open('./phrases.txt','w') as out:
    for elmt in p:
        #print elmt["sentences"][0]["tokens"]
        for tok in elmt["sentences"][0]["lemmas"]:
            if not tok in stopwords: 
                out.write(tok+'\n')
        out.write('\n')
Example #48
#!/usr/bin/python

# This script will extract a single article (all paragraphs) on the launcher grid.

"""
118238@10\tSen.~^~Barack~^~Obama~^~and~^~his~^~wife~^~,~^~Michelle~^~Obama~^~,~^~have~^~released~^~eight~^~years~^~of~^~joint~^~returns~^~.\tO~^~PERSON~^~PERSON~^~O~^~O~^~O~^~O~^~PERSON~^~PERSON~^~O~^~O~^~O~^~DURATION~^~DURATION~^~O~^~O~^~O~^~O
"""

from stanford_corenlp_pywrapper import CoreNLP
import os
import sys

# Prepare the parser
proc = CoreNLP(configdict={'annotators':'tokenize, ssplit, pos, parse, lemma, ner'},
               output_types=["pos","parse"], 
               corenlp_jars=["/work/02092/vsochat/wrangler/SOFTWARE/stanford-corenlp-full-2015-04-20/*"])

input_file = sys.argv[1]
output_file = sys.argv[2]

# Any errors will have entries written to an error file for inspection
error_file = output_file.replace(".txt",".err")
filey = open(input_file,"rb")
lines = filey.readlines()[0]
filey.close()

# Format expected to be:
# "12345|<text><p>hello this is text, sentence one!</p><p>sentence two!</p></text>"
article_id,text = lines.split("|")
text =  text.replace("</text>","").replace("<text>","").strip("\n").replace('"',"")
paragraphs = text.split("<p>")
Example #49
in_file_name = 'classbios.txt'
split = in_file_name.split(".")
out_file_name_lines = split[0] + "_lines." + split[1]
out_file_name_normalized_line = split[0] + "_normalized_line ." + split[1]
out_file_name_normalized_sentence = split[0] + "_normalized_sentence ." + split[1]
out_file_name_normalized_tokens = split[0] + "_normalized_tokens ." + split[1]

k = 20 # top k unigrams and bigrams

out_file_name_unigrams = "{0}_top{1}{2}.json".format(split[0],k,"Unigrams")
out_file_name_bigrams = "{0}_top{1}{2}.json".format(split[0],k,"Bigrams")

os.chdir(path)
os.listdir('.') # see if file is in directory

proc = CoreNLP("pos", corenlp_jars=["/Users/Steven/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])

# You can also specify the annotators directly. For example, say we want to 
# parse but don't want lemmas. This can be done with the configdict option:
p = CoreNLP(configdict={'annotators':'tokenize, ssplit, pos, parse'}, 
            output_types=['pos','parse'],
            corenlp_jars=["/Users/Steven/Documents/corenlp/stanford-corenlp-full-2015-04-20/*"])
            
#%% Functions  ##############################################################


def getFrequency(ls, ignore = set(), pattern = re.compile(r'.') ):
    """Gets the frequency of elements in list, ignoring elements in ignore set
    and matching the pattern
    
    Args:
Example #50
from stanford_corenlp_pywrapper import CoreNLP
import os
proc = CoreNLP("ner", corenlp_jars=["/Users/Jerry/Downloads/stanford-corenlp-full-2015-12-09/*"])
input_path = '/Users/Jerry/Documents/CMPS290H/Project/data/dataset'
output_path = '/Users/Jerry/Documents/CMPS290H/Project/data/dictionary/name.tsv'
#parse files
output = open(output_path,'w')
for filename in os.listdir(input_path):
	try:
		input_file = open(os.path.join(input_path, filename), 'r')
		x = input_file.read()
		out = proc.parse_doc(x)
		ner_tags = out['sentences'][0]['ner']
		num_tokens = len(ner_tags)
		lemmas = out['sentences'][0]['lemmas']
		first_indexes = (i for i in xrange(num_tokens) if ner_tags[i] == "PERSON" and (i == 0 or ner_tags[i-1] != "PERSON"))
		for begin_index in first_indexes:
		    # find the end of the PERSON phrase (consecutive tokens tagged as PERSON)
		    end_index = begin_index + 1
		    while end_index < num_tokens and ner_tags[end_index] == "PERSON":
		    	end_index += 1
		    end_index -= 1
		    mention_text = " ".join(map(lambda i: lemmas[i], xrange(begin_index, end_index + 1)))
		    print("%s %s" % (filename, mention_text))
		    output.write("%s\n" % mention_text)
	except IndexError:
		pass
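The consecutive-PERSON grouping in the loop above can also be expressed with itertools.groupby over the parallel ner/lemma lists; a minimal sketch under the same assumptions (the function name is made up):

from itertools import groupby

def person_mentions(ner_tags, lemmas):
    # Group runs of consecutive indices sharing the PERSON tag into one mention.
    mentions = []
    for tag, indices in groupby(range(len(ner_tags)), key=lambda i: ner_tags[i]):
        if tag == "PERSON":
            mentions.append(" ".join(lemmas[i] for i in indices))
    return mentions

# person_mentions(["PERSON", "PERSON", "O"], ["Barack", "Obama", "smile"]) -> ["Barack Obama"]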
Example #51
    def __init__(self):
        """
        Load and initialize any external models or data here
        """
        self.corenlp = CoreNLP("pos", corenlp_jars=["./enrichments/stanford-corenlp-full-2015-12-09/*"])
Example #52
File: token.py Project: rpongsaj/reflector
from stanford_corenlp_pywrapper import CoreNLP
from pprint import pprint
import glob

proc = CoreNLP("ssplit", corenlp_jars=["stanford/stanford-corenlp-full-2015-04-20/*"])
path = 'data/engelhard/0/'
for filename in glob.glob(path+'*.txt'):
  print(filename)
  with open(filename, 'rU') as f:
    engelhard = f.read()
    engelhard2 = engelhard.decode('utf8', 'ignore')
    print(engelhard2)
    a = proc.parse_doc(engelhard2)
  pprint(a['sentences'][0]['tokens'])
Example #53
File: ner.py Project: edvisees/EDL2015
class NER:
    def __init__(self, lang):
        self.lang = lang
        self.config = ner_config

    def start_server(self):
        self.corenlp = CoreNLP(
            corenlp_jars=[
                os.path.join(self.config["CORENLP_HOME"], self.config[self.lang]["corenlp_jar"]),
                os.path.join(self.config["CORENLP_HOME"], self.config[self.lang]["corenlp_models_jar"]),
            ],
            server_port=self.config[self.lang]["port"],
            configdict=self.config[self.lang]["properties"],
        )
        print "Serving on http://%s:%s" % ("localhost", self.config[self.lang]["port"])

    # text = [paragraphs] (one per line)
    def query(self, text):
        if self.lang == "CMN":
            return self.stanford_ner(text)
        if self.lang == "SPA":
            return self.freeling_ner(text)
        if self.lang == "ENG":
            return self.stanford_ner(text)

    def stanford_ner(self, text):
        mentions = []
        for paragraph in text:
            paragraph_mentions = []
            response = self.corenlp.parse_doc(paragraph)
            sentences = response["sentences"]
            # print '\n\n', paragraph
            for sentence in sentences:
                paragraph_mentions.extend(self.process_stanford_sentence(sentence))
            mentions.append(paragraph_mentions)
        return mentions

    def process_stanford_sentence(self, sentence):
        mentions = []
        for index, word in enumerate(sentence["tokens"]):
            ner_type = sentence["ner"][index]
            if ner_type in stanford_good_entity_types:
                if index > 0 and sentence["ner"][index - 1] == ner_type:
                    # concat this token with the previous
                    mentions[-1].word += (
                        " " + word
                    )  # TODO: this is buggy, think of a better way (perhaps using the offsets and sentence.substring(start, end))
                    mentions[-1].end = sentence["char_offsets"][index][1]
                else:
                    mentions.append(
                        Mention(
                            word,
                            sentence["char_offsets"][index][0],
                            sentence["char_offsets"][index][1],
                            ner_type,
                            "name",
                            "link",
                        )
                    )
        return mentions

    def freeling_ner(self, text, name):
        print "\n\nINPUT TEXT:", text
        entities = get_entities(text)
        mentions = []
        # build Mentions
        for (form, count, classification) in entities:
            print "FREELING FOUND: %s: %d | %s" % (form, count, classification)
            # word, begin, end, ner, name, link
            mentions.append(Mention(form, 0, 1, classification, "name", "link"))
        return mentions
Example #54
files = dict()

os.makedirs("bio_output")
os.chdir(os.path.join(os.getcwd(),"bio_output"))

for i in range(len(doc)):
    files[file_names[i]] = doc[i].replace("\n"," ")
    file = open(file_names[i]+".txt", "w")
    file.write(doc[i])
    file.close()

#%% check set of non-word characters and stopword
os.chdir("/Users/apple/Documents/MSiA/Fall 2015/Text analytics/HW/hw3")

proc = CoreNLP("pos", corenlp_jars=["/Users/apple/corenlp/stanford-corenlp-full-2015-04-20/*"])

for i in files.keys():
    text = files[i]
    parsed = proc.parse_doc(text)
    to_flat = [x["lemmas"] for x in parsed["sentences"]]
    words = [item for sublist in to_flat for item in sublist]
    files[i] = words
#%%
#import stopwords from nltk stopwords, add 'I' since it is also counted as a stopword
stopWord = set(stopwords.words('english'))
stopWord.add("I")
all_words = []
nonWords = re.compile(r"^\b[a-zA-Z]+-?[a-zA-Z]+$")

for i in files.keys():
"""
Input is multiple text files.  Each text file represents one document.
Output is just as many text files, with the ".anno" extension instead.
Each output file consists of one JSON object.

USAGE
proc_text_files.py MODE  [files...]

e.g.
python proc_text_files.py pos *.txt
"""

import sys, re
mode = sys.argv[1]

from stanford_corenlp_pywrapper import CoreNLP
ss = CoreNLP(mode, corenlp_jars=["/Users/Doctor_Einstein/Documents/stockMartket/analysis/nlp/stanford/*"])

for filename in sys.argv[2:]:
    outfile = re.sub(r'\.txt$',"", filename) + ".anno"
    print>>sys.stderr, "%s  ->   %s" % (filename, outfile)

    text = open(filename).read().decode('utf8', 'replace')
    jdoc = ss.parse_doc(text, raw=True)
    with open(outfile, 'w') as fp:
        print>>fp, jdoc


Example #56
import csv
import json
import sys
import os
import fstphrases

from stanford_corenlp_pywrapper import CoreNLP

import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--corpus', help='the thing in the middle of corpus/{}/raw', required=True)
parser.add_argument('--nlpjar', help='where is core nlp?', required=True)
parser.add_argument('--tagset', help='np fst tag set', required=False)
args = parser.parse_args()

proc = CoreNLP("parse", corenlp_jars=[args.nlpjar + '/*'])

try:
    os.remove('corpora/' + args.corpus + '/processed/all.anno_plus')
except:
    pass


if args.tagset is None:
    print "[*] using default tagset for npfst"


def get_phrases(tags, toks):
    '''extract phrases with npfst'''
    phrases = fstphrases.extract_from_poses(tags, 'NP', tagset=args.tagset)
    phrases_deets = []