Example #1
import pickle
import os.path as osp

from corenlp.corenlp import StanfordCoreNLP


def parse_each_sent(worker_id, refs):
    # `parser_path` and `tmp_folder` are module-level globals defined elsewhere
    # in the script (see the __main__ block in Example #3 for how `parser_path`
    # is typically built).
    parser = StanfordCoreNLP(parser_path)
    parse_result = {}
    for ref in refs:
        for sent in ref['sentences']:
            sent_id = sent['sent_id']
            to_be_parsed = sent['sent']
            # raw_parse wraps the text in a document; keep the first
            # (and only) sentence's parse.
            parse_result[sent_id] = parser.raw_parse(to_be_parsed)['sentences'][0]
            print('mpId_%s, refId_%s, sentId_%s done.' % (worker_id, ref['ref_id'], sent_id))
    # pickle needs a binary file handle
    with open(osp.join(tmp_folder, 'parse_result_' + str(worker_id) + '.p'), 'wb') as outfile:
        pickle.dump(parse_result, outfile)
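
A minimal sketch of how such a worker might be driven, assuming `refs` is already loaded and that `parser_path` and `tmp_folder` are set as module globals; the `run_workers` helper and its `num_workers` parameter are illustrative additions, not part of the original script:

import multiprocessing as mp

def run_workers(refs, num_workers=4):
    # Deal refs out round-robin, one chunk per worker (hypothetical helper).
    chunks = [refs[i::num_workers] for i in range(num_workers)]
    procs = [mp.Process(target=parse_each_sent, args=(wid, chunk))
             for wid, chunk in enumerate(chunks)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    # Each worker pickled its own parse_result_<id>.p file; merge them.
    merged = {}
    for wid in range(num_workers):
        with open(osp.join(tmp_folder, 'parse_result_%d.p' % wid), 'rb') as f:
            merged.update(pickle.load(f))
    return merged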
Example #2
from os import path

# Assumed import: the `stanfordcorenlp` wrapper package (its constructor takes
# a local CoreNLP install path or a server host); IPlugin comes from the host
# plugin framework.
from stanfordcorenlp import StanfordCoreNLP


class Plugin(IPlugin):

    PATH_OR_HOST = path.abspath(path.dirname(__file__)) + "/resources"

    def __init__(self, document, pipeline):
        self._document = document

        self._processors = pipeline["tools"]["stanfordcorenlp"]["processors"]
        self._lang = pipeline["lang"]

        self.nlp = StanfordCoreNLP(self.PATH_OR_HOST)

    def run(self):
        from deepnlpf.core.boost import Boost
        # Fan `wrapper` out over the document in parallel threads.
        doc = Boost().multithreading(self.wrapper, self._document)
        self.nlp.close()
        return doc

    def wrapper(self, sentence):
        """
            @param annotators : more: https://stanfordnlp.github.io/CoreNLP/annotators.html
            @param pipelineLanguage : en, zh, ar, fr, de, es
            @param outputFormat : json, xml, text, more: https://stanfordnlp.github.io/CoreNLP/human-languages.html
            @param memory : 8g
        """

        props = {
            "timeout": "1500000",
            "annotators": ", ".join(self._processors),
            "pipelineLanguage": "en",
            "outputFormat": "json",
        }

        return self.nlp.annotate(sentence, properties=props)

    def out_format(self, doc):
        # Output-formatting hook; intentionally left unimplemented here.
        pass
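
A sketch of how this plugin might be instantiated, assuming the `pipeline` dict layout implied by `__init__` and that `document` is a list of sentences handed one at a time to `wrapper` (both are assumptions; deepnlpf's actual driver code is not shown here):

if __name__ == "__main__":
    # Hypothetical pipeline config matching the keys read in __init__.
    pipeline = {
        "lang": "en",
        "tools": {"stanfordcorenlp": {"processors": ["tokenize", "ssplit", "pos", "lemma"]}},
    }
    document = ["The quick brown fox jumps over the lazy dog."]
    plugin = Plugin(document, pipeline)
    annotations = plugin.run()  # expected: one JSON string per input sentence
    print(annotations)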
Example #3
    #     self.r5 = ['none'] if len(self.r5) == 0 else self.r5
    #     self.r6 = ['none'] if len(self.r6) == 0 else self.r6
    #     self.r7 = ['none'] if len(self.r7) == 0 else self.r7
    #
    #     # left words -> r8
    #     left_wds = [word[0] for word in self.leftWords()]
    #     self.r8 = ['none'] if len(left_wds) == 0 else left_wds
    #
    #     return {'r1': self.r1, 'r2': self.r2, 'r3': self.r3, 'r4': self.r4, 'r5': self.r5, 'r6': self.r6, 'r7': self.r7, 'r8': self.r8}


if __name__ == '__main__':
    import sys
    from pprint import pprint
    import os.path as osp
    ROOT_DIR = osp.abspath('/playpen/licheng/Documents/referit')
    sys.path.insert(0, osp.join(ROOT_DIR, 'lib', 'utils'))
    from corenlp.corenlp import StanfordCoreNLP
    parser_path = osp.join(ROOT_DIR, 'lib', 'utils', 'corenlp',
                           'stanford-corenlp-full-2015-01-30')
    stanfordParser = StanfordCoreNLP(parser_path)

    sent = 'woman in red shirt'
    parse = stanfordParser.raw_parse(sent)['sentences'][0]
    pprint(parse['dependencies'])

    attParser = ClefParser()
    attParser.reset(parse)
    pprint(attParser.decompose())
    pprint(attParser.leftWords())
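
With this older wrapper, each entry of `parse['dependencies']` is a [relation, governor, dependent] triple, so a small helper can pull out specific relations; the function below is an illustrative addition, not part of the original script:

def deps_with_relation(parse, relation):
    # Each dependency is a [relation, governor, dependent] triple.
    return [(gov, dep) for rel, gov, dep in parse['dependencies'] if rel == relation]

# e.g. adjectival modifiers, such as ('shirt', 'red') for the sentence above:
pprint(deps_with_relation(parse, 'amod'))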