def tagging(file):
    qstn = []
    qstn_ner = []
    qstn_pos = []
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    with open(file) as f:
        for line in f:
            qstn.append(line)
            # Save the POS tags for this line
            qstn_pos.append(pos_tagger.tag(line.split()))
            # Save the NER tags for this line
            qstn_ner.append(ner_tagger.tag(line.split()))
    return qstn, qstn_ner, qstn_pos
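All of these examples assume a Stanford CoreNLP server is already listening at the URL passed to CoreNLPParser (typically http://localhost:9000). A minimal sketch for starting one from Python with NLTK, assuming the CoreNLP 3.9.2 distribution is unpacked in the working directory (paths are placeholders):

import os

from nltk.parse import CoreNLPParser
from nltk.parse.corenlp import CoreNLPServer

# Assumed local paths; point them at your own CoreNLP download.
STANFORD = "stanford-corenlp-full-2018-10-05"
server = CoreNLPServer(
    os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
    os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"),
)
server.start()  # listens on http://localhost:9000 by default

tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
print(list(tagger.tag("The quick brown fox jumps over the lazy dog".split())))

server.stop()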
def pos_tag_text(text):
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None

    #print("ORIGINAL TEXT ---------------",text)
    #tagged_text = tag(text)
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    tagged_text = pos_tagger.tag(text.split())

    #tagged_text = nltk.pos_tag(text)

    #print("TAGGED TEXT ---------",tagged_text)

    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in tagged_text]

    return tagged_lower_text
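A quick usage sketch of pos_tag_text (server assumed running); the WordNet constants wn.ADJ, wn.VERB, wn.NOUN, wn.ADV are the single-letter tags 'a', 'v', 'n', 'r':

print(pos_tag_text("Dogs are running quickly"))
# e.g. [('dogs', 'n'), ('are', 'v'), ('running', 'v'), ('quickly', 'r')]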
Example #3
def parseSentenceStructure(data):

    #Tokenize sent.
    tokens = nltk.word_tokenize(data)

    #Tag sent.
    tagged = nltk.pos_tag(tokens)

    #Parser
    parser = CoreNLPParser(
        url='http://localhost:9000'
    )  #https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk/51981566#51981566
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

    #Parse w/ Stanford
    tree = parser.raw_parse(data)
    #print(list(tree))

    #list(tree)[0].pretty_print()
    #print(list(tree))

    #Provide N-V-N relationships w/ all N combinations

    #Traverse for NP root
    tree_recurse_find(list(tree)[0])
Example #4
def PVD(document):
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    sentence = sent_tokenize(document)
    word = 'se'
    lemma = 'definir'
    for i in range(len(sentence)):
        #print('--------------------')
        pattern = list()
        postaglist = list()
        tokens = nltk.word_tokenize(sentence[i])
        tag = pos_tagger.tag(tokens)
        for t in tag:
            if ('se' in tokens):
                pos = tokens.index('se')
                front = tokens[pos + 1:pos + 2]
                tag = pos_tagger.tag(front)

                doc = nlp(t[0])
                lemlist = [tok.lemma_ for tok in doc]
                #lem=''.join(lemlist)
                #lemmas_list.append(lem)
                #print(lemma, '-', lemlist)
                if ('definir' in lemlist or 'entender' in lemlist
                        or 'denominar' in lemlist):
                    #print(sentence[i])
                    front = tokens[pos + 2:pos + 5]
            if (t[1] == 'PUNCT'):
                pos = tokens.index(t[0])
                print(t[0], pos, tag[pos + 1])
            # if (t[1] == 'AUX'):
 def init_server(self):
     STANFORD = os.path.join("stanford-corenlp-full-2018-10-05")
     self.server = CoreNLPServer(
         os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
         os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"))
     self.server.start()
     self.parser = CoreNLPParser()
Example #6
def _parseSentences(sentences, parseCacheFile):
    def cacheKey(text):
        return text.strip().encode('utf-8')

    cache = shelve.open(parseCacheFile)

    toParse = []
    for sentence in sentences:
        if cacheKey(sentence) not in cache:
            toParse.append(sentence)

    if toParse:
        p = Pool(10)

        parser = CoreNLPParser(
            url=os.getenv("CORENLP_HOST", "http://localhost:9000"))

        parseIterator = p.imap(lambda s: parser.parse_one(s.split()), toParse)

        progress = ProgressBar(len(toParse))
        for i, parse in enumerate(parseIterator):
            cache[cacheKey(toParse[i])] = parse
            progress.done(i)
        progress.complete()

    parses = [cache[cacheKey(s)] for s in sentences]
    cache.close()

    return parses
Example #7
    def __init__(self, w2v_path, corpus_dict_path, port=9000):
        # corenlp client
        self.parser = CoreNLPParser(url='http://localhost:' + str(port))
        self.dep_parser = CoreNLPDependencyParser(url='http://localhost:' +
                                                  str(port))
        # w2v
        self.word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
            w2v_path, binary=True)
        print('w2v model loaded')
        # training corpus for one hot features
        corpus_dict = pickle.load(open(corpus_dict_path, 'rb'))

        self.dep_tuple_vectorizer = DictVectorizer(sparse=False)
        self.dep_tuple_vectorizer = self.dep_tuple_vectorizer.fit(
            corpus_dict['dep_tuple'])

        self.unigram_vectorizer = DictVectorizer(sparse=False)
        self.unigram_vectorizer = self.unigram_vectorizer.fit(
            corpus_dict['unigram'])

        self.bigram_vectorizer = DictVectorizer(sparse=False)
        self.bigram_vectorizer = self.bigram_vectorizer.fit(
            corpus_dict['bigram'])

        self.trigram_vectorizer = DictVectorizer(sparse=False)
        self.trigram_vectorizer = self.trigram_vectorizer.fit(
            corpus_dict['trigram'])

        self.lexical_vectorizer = DictVectorizer(sparse=False)
        self.lexical_vectorizer = self.lexical_vectorizer.fit(
            corpus_dict['lexical'])
Example #8
def dependency_parse(raw_data):
    from nltk.parse.corenlp import CoreNLPServer

    # The server needs to know the location of the following files:
    #   - stanford-corenlp-X.X.X.jar
    #   - stanford-corenlp-X.X.X-models.jar
    STANFORD = os.path.join("..", "stanford-corenlp-full-2020-04-20")

    # Create the server
    server = CoreNLPServer(
        os.path.join(STANFORD, "stanford-corenlp-4.0.0.jar"),
        os.path.join(STANFORD, "stanford-corenlp-4.0.0-models.jar"),
    )

    # Start the server in the background
    server.start()
    from nltk.parse import CoreNLPParser
    parser = CoreNLPParser()

    new_data = []
    for example in raw_data:
        sentence, features_seq = example[0], example[-1]
        parse = next(parser.raw_parse(sentence))
        # get a few "important" neighboring words

    server.stop()
def check_triples_by_pos(triples):
    pos_tagger = CoreNLPParser(url='http://39.98.186.125:9000', tagtype='pos')
    ret_triples = []
    for triple in triples:
        source = triple[0]
        relation = triple[1]
        target = triple[2]
        source_pos = ",".join(
            [e[1] for e in pos_tagger.tag(source.split(" "))])
        relation_pos = ",".join(
            [e[1] for e in pos_tagger.tag(relation.split(" "))])
        target_pos = ",".join(
            [e[1] for e in pos_tagger.tag(target.split(" "))])

        if "VB" in source_pos or "VB" in target_pos:
            continue
        if "NN" not in source_pos or "NN" not in target_pos:
            continue
        if "NN" in relation_pos:
            if " at" in relation.lower():
                relation = "at"
            elif "of" not in relation.split(" ") and len(
                    relation.split(" ")) > 1:
                continue

        ret_triples.append([source, relation, target])

    return ret_triples
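A hypothetical call to the filter above, with made-up input; only triples whose source and target are noun-tagged (and not verb-tagged) survive:

# Hypothetical input data; each triple is [source, relation, target].
triples = [["the company", "acquired in 2019", "a small startup"]]
print(check_triples_by_pos(triples))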
Example #10
def pos_tagger_lemma(document, listterms):
    print('Definition via POS tagger and lemma, searching for 3-, 2- and 1-grams')
    text = str()
    definiendums = list()
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    for i in document:
        if (len(i) > 1):
            tag = pos_tagger.tag(i.split(' '))
            for t in tag:
                if (t[1] == 'VERB'):
                    doc = nlp(t[0])
                    for tok in doc:
                        l = tok.lemma_
                        if (l == 'ser'):
                            text = i
                            indverb = i.index(t[0])
                            front = i[indverb:]
                            back = i[:indverb + len(t[0]) + 1]
                            tagfront = pos_tagger.tag(front.split(' '))
                            tagback = pos_tagger.tag(back.split(' '))
                            definiendum_definition(t[0], text, listterms)

                elif (t[1] == 'NOUN' and t[0] != '=RRB='):
                    text = i
                    if (len(t[0]) > 1):
                        #definiendum_definition(t[0], text, listterms)
                        pass

    return (text)
Example #11
def text_2_triple_list(text, strength):
    nlp = spacy.load("en")
    neuralcoref.add_to_pipe(nlp)
    api = CoreNLPParser(url='http://39.98.186.125:9000')
    api.parser_annotator = "tokenize,ssplit,coref,openie"
    parser = CoreNLPParser(url='http://39.98.186.125:9000')

    text = clean_text(text)
    text = remove_adjective_possessive_pronoun(text)
    doc = nlp(text)
    text = doc._.coref_resolved
    entities = []
    entities_labels = []
    for e in doc.ents:
        if e.label_ in supported_entity_types:
            entities.append(e.text)
            entities_labels.append(e.label_)

    json_text = api.api_call(text)
    openie_sentences = ssplit_article_into_sentences(json_text, step=-1)
    syntax_sentences = ssplit_article_into_sentences(json_text, step=1)
    triples = []
    for sentence in openie_sentences:
        json_sentence = api.api_call(sentence)
        triples += extract_triples_by_openie(json_sentence)
    syntax_triples = []
    for sentence in syntax_sentences:
        syntax_tree = list(parser.raw_parse(sentence))[0]
        cur_syntax_triples = []
        parse_tree_v2(syntax_tree, cur_syntax_triples)
        syntax_triples += cur_syntax_triples

    triples = filter_triples_by_entities(triples, entities, strength)
    triples = beautify_triples(triples)
    triples = remove_meaningless_triples(triples)
    triples = check_triples_by_pos(triples)
    triples = remove_duplicate_triples(triples)

    syntax_triples = beautify_triples(syntax_triples)
    syntax_triples = remove_meaningless_triples(syntax_triples)
    syntax_triples = check_triples_by_pos(syntax_triples)
    syntax_triples = remove_duplicate_triples(syntax_triples)

    triples = normalize_entities(triples, syntax_triples)
    triples = remove_duplicate_triples(triples)

    return generate_structured_triples(triples, entities, entities_labels)
Example #12
def pruebas():
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    tag = pos_tagger.tag(
        'tengo que ir Por el contrato de compra y venta uno de los contratantes se obliga a entregar una cosa determinada y el otro a pagar por ella un precio cierto, en dinero o signo que lo represente'
        .split(' '))
    doc = nlp('considerará')
    lemlist = [tok.lemma_ for tok in doc]
    print(lemlist)
Example #13
 def __init__(self,
              host='services.loadbalancer.api.questo.ai',
              port=9000,
              separator='|'):
     self.parser = CoreNLPParser(url=f'http://{host}:{port}')
     self.separator = separator
def scorer(title, speaker):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    pos_tag_list = list(pos_tagger.tag(title[0].split()))
    s = 0
    for i in pos_tag_list:
        if 'NN' in i[1] or 'NP' in i[1]:
            s += 1
    return -s + abs(title[1] - speaker[1]) * 0.3
Example #15
 def __init__(self, config):
     self.ontology_tagging = OntologyTagging()
     self.config = config
     self.word_dictionary = self.compute_all_embeddings()
     self.server_url = 'http://localhost:9000'
     self.parser = CoreNLPParser(url=self.server_url)
     self.core_nlp_dependency_parser = CoreNLPDependencyParser(
         url=self.server_url)
Example #16
 def clear_data(self):
     self.parser = CoreNLPParser(url='http://localhost:9000')
     self.first_NP = ''
     self.first_VP = ''
     self.parse_tree = None
     self.subject = RDF_Triple.RDF_SOP('subject')
     self.predicate = RDF_Triple.RDF_SOP('predicate', 'VB')
     self.Object = RDF_Triple.RDF_SOP('object')      
Example #17
File: tree.py  Project: sushe-shakya/RNTN
def parse(text):
    parser = CoreNLPParser("http://localhost:9000")
    result = parser.raw_parse(text.lower())
    trees = [tree for tree in result]
    for tree in trees:
        tree.chomsky_normal_form()
        tree.collapse_unary(collapseRoot=True, collapsePOS=True)
    trees = [ParentedTree.convert(tree) for tree in trees]
    return trees
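For reference, a small usage sketch of the parse helper above (server on port 9000 assumed); the returned trees are already binarized:

trees = parse("The food was great but the service was slow.")
trees[0].pretty_print()  # prints the CNF-binarized constituency tree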
def verb_stats(data):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    verb_count = 0
    for _, value in data.items():
        pos = list(pos_tagger.tag(value.split()))
        for _, second in pos:
            if second.startswith("V"):
                verb_count += 1
    print(verb_count)
Example #19
File: tree.py  Project: tTeha/MRMARS
def parse(text):
    parser = CoreNLPParser(CORENLP_SERVER)
    result = parser.raw_parse(text)
    trees = [tree for tree in result]
    for tree in trees:
        tree.chomsky_normal_form()
        tree.collapse_unary(collapseRoot=True, collapsePOS=True)
    trees = [ParentedTree.convert(tree) for tree in trees]
    return trees
Example #20
 def syntax_tree_parser(self):
     """
     get syntax tree
     :return: syntax tree
     """
     if self.syntax_tree is not None:
         return self.syntax_tree
     parser = CoreNLPParser(url='http://localhost:8999')
     self.syntax_tree = list(parser.parse(nltk.word_tokenize(self.text)))[0]
     return self.syntax_tree
def get_probable_title(titles):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    score = []
    for title in titles:
        pos_tag_list = list(pos_tagger.tag(title[0].split()))
        s = 0
        for i in pos_tag_list:
            if 'NN' in i[1] or 'NP' in i[1]:
                s += 1
        score.append((s, len(title[0]), title[0], title[1]))
    return max(score)
 def add_pos_tag(self, words, tech_pair):
     tagged_words = CoreNLPParser(url='http://localhost:9000',
                                  tagtype='pos').tag(words)
     # print tagged_words
     tag_list = []
     for (word, tag) in tagged_words:
         if word in tech_pair.split("\t"):
             tag_list.append("TECH")
         else:
             tag_list.append(tag)
     return tag_list
Example #23
 def __init__(self):
     """
     Initialize the SVO Methods
     """
     self.noun_types = ["NN", "NNP", "NNPS", "NNS", "PRP"]
     self.verb_types = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
     self.adjective_types = ["JJ", "JJR", "JJS"]
     self.pred_verb_phrase_siblings = None
     jar = r'D:\data'
     model = r'your_path/stanford-postagger-full-2016-10-31/models/english-left3words-distsim.tagger'
     self.parser = CoreNLPParser(url='http://localhost:9000')
     self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
Example #24
def get_speaker_salutaion(text, persons, speakers):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    text = text.split()
    pos_tag_list = list(pos_tagger.tag(text))
    for i in range(len(pos_tag_list) - 1):
        if pos_tag_list[i][1] == 'NNP' and pos_tag_list[i][0] not in persons:
            if pos_tag_list[i + 1][0] in persons:
                for j, speak in enumerate(speakers):
                    if pos_tag_list[i + 1][0] in speak:
                        # Prepend the salutation/title word to the matching speaker entry
                        speakers[j] = pos_tag_list[i][0] + ' ' + speak
                        break
    return speakers
Example #25
def run():
    rule = TimeExpressions()
    parser = CoreNLPParser(url='http://localhost:9000')

    input = "They will play against a team of North Koreans on Wednesday , which is believed to be Kim 's birthday ."
    exp_output = "They will play against a team of North Koreans ."
    one_test(rule, parser, input, exp_output)

    input = "The pitch was effective ."
    one_test(rule, parser, input)

    input = "Patti Davis , a former first daughter , wrote a public letter to Malia and Sasha on Sunday ."
    exp_output = "Patti Davis , a former first daughter , wrote a public letter to Malia and Sasha ."
    one_test(rule, parser, input, exp_output)
Example #26
def get_stanford_pos_tags(line):
    """
    Get part of speech tags using the Stanford POS tagger
    """

    st_pos = CoreNLPParser(url="http://localhost:9000", tagtype="pos")
    tokenized_line = cnf.TOKENIZER.tokenize(line)
    line_tagged_initial = st_pos.tag(tokenized_line)
    line_tagged_output = []

    for item in line_tagged_initial:
        line_tagged_output.append((item[0], item[1]))

    return line_tagged_output
Example #27
 def createGrammar(self, userMessages, ctx):
     parser = CoreNLPParser(url='http://localhost:9000')
     parse_trees = []
     for message in userMessages:
         tokenized = nltk.sent_tokenize(message)
         for sentence in tokenized:
             parse_trees.append(list(parser.raw_parse(sentence))[0])
     grammar_rules = set()
     for tree in parse_trees:
         for production in tree.productions():
             grammar_rules.add(production)
     start = nltk.Nonterminal('S')
     grammar = nltk.induce_pcfg(start, grammar_rules)
     return (' '.join((self.generate_sentence(grammar))))
Example #28
    def __init__(self, tool, file):

        self.t_name = tool
        self.t_synset = wn.synset(self.t_name)
        self.hypernyms = []
        self.level = 0

        self.file = file

        self.parser = CoreNLPParser(url='http://localhost:9000')
        self.dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

        self.telic_roles = []
        self.first_words = []
Example #29
def get_entities(tweets):
    entities = []
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    for atweet in tweets:
        for sent in atweet:
            sent = list(pos_tagger.tag(normalize(sent)))
            #             sent = pos_tag(normalize(sent))
            trees = ne_chunk(sent)
            for tree in trees:
                if hasattr(tree, 'label'):
                    if tree.label() in labels:
                        entities.append(' '.join(
                            [child[0].lower() for child in tree]))
    return entities
Example #30
def run():
    rule = PreposedAdjuncts()
    parser = CoreNLPParser(url='http://localhost:9000')

    input = "The pitch was effective ."
    one_test(rule, parser, input)

    input = "According to a now finalized blueprint described by U.S. officials and other sources, " \
           "the Bush administration plans to take complete, unilateral control of a post-Saddam Hussein Iraq."
    exp_output = "the Bush administration plans to take complete , unilateral control of a post-Saddam Hussein Iraq ."
    one_test(rule, parser, input, exp_output)

    input = "While never forgetting her Chinese roots , Yee has grown to love America over the years ."
    exp_output = "Yee has grown to love America over the years ."
    one_test(rule, parser, input, exp_output)