Example No. 1
def PVD(document):
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    sentence = sent_tokenize(document)
    word = 'se'
    lemma = 'definir'
    for i in range(len(sentence)):
        #print('--------------------')
        pattern = list()
        postaglist = list()
        tokens = nltk.word_tokenize(sentence[i])
        tag = pos_tagger.tag(tokens)
        for t in tag:
            if ('se' in tokens):
                pos = tokens.index('se')
                front = tokens[pos + 1:pos + 2]
                # tag the word right after 'se' without overwriting the full-sentence tags
                front_tag = pos_tagger.tag(front)

                doc = nlp(t[0])
                lemlist = [tok.lemma_ for tok in doc]
                #lem=''.join(lemlist)
                #lemmas_list.append(lem)
                #print(lemma, '-', lemlist)
                if ('definir' in lemlist or 'entender' in lemlist
                        or 'denominar' in lemlist):
                    #print(sentence[i])
                    front = tokens[pos + 2:pos + 5]
            if (t[1] == 'PUNCT'):
                pos = tokens.index(t[0])
                print(t[0], pos, tag[pos + 1])
            # if (t[1] == 'AUX'): ...  (truncated in the source)
Example No. 2
def pos_tagger_lemma(document, listterms):
    print('Definición por pos tagger y lemma, busqueda de 3,2 y 1 gram')
    text = str()
    definiendums = list()
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    for i in document:
        if (len(i) > 1):
            tag = pos_tagger.tag(i.split(' '))
            for t in tag:
                if (t[1] == 'VERB'):
                    doc = nlp(t[0])
                    for tok in doc:
                        l = tok.lemma_
                        if (l == 'ser'):
                            text = i
                            indverb = i.index(t[0])
                            front = i[indverb:]
                            back = i[:indverb + len(t[0]) + 1]
                            tagfront = pos_tagger.tag(front.split(' '))
                            tagback = pos_tagger.tag(back.split(' '))
                            definiendum_definition(t[0], text, listterms)

                elif (t[1] == 'NOUN' and t[0] != '=RRB='):
                    text = i
                    if (len(t[0]) > 1):
                        #definiendum_definition(t[0], text, listterms)
                        pass

    return (text)
def check_triples_by_pos(triples):
    pos_tagger = CoreNLPParser(url='http://39.98.186.125:9000', tagtype='pos')
    ret_triples = []
    for triple in triples:
        source = triple[0]
        relation = triple[1]
        target = triple[2]
        source_pos = ",".join(
            [e[1] for e in pos_tagger.tag(source.split(" "))])
        relation_pos = ",".join(
            [e[1] for e in pos_tagger.tag(relation.split(" "))])
        target_pos = ",".join(
            [e[1] for e in pos_tagger.tag(target.split(" "))])

        if "VB" in source_pos or "VB" in target_pos:
            continue
        if "NN" not in source_pos or "NN" not in target_pos:
            continue
        if "NN" in relation_pos:
            if " at" in relation.lower():
                relation = "at"
            elif "of" not in relation.split(" ") and len(
                    relation.split(" ")) > 1:
                continue

        ret_triples.append([source, relation, target])

    return ret_triples
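A minimal usage sketch for check_triples_by_pos (the triples below are hypothetical, and the CoreNLP server hard-coded inside the function is assumed to be reachable):

example_triples = [
    ["Stanford University", "is located in", "California"],
    ["running", "is", "fun"],
]
# The second triple should be dropped because its source is tagged as a verb;
# the exact result depends on the tagger model behind the server.
print(check_triples_by_pos(example_triples))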
Example No. 4
def tagging(file):
    qstn = []
    qstn_ner = []
    qstn_pos = []
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    with open(file) as f:
        for line in f:
            qstn.append(line)
            # Saving POS tagging to list
            qstn_pos.append(pos_tagger.tag(line.split()))
            # Saving NER to list
            qstn_ner.append(ner_tagger.tag(line.split()))
    return qstn, qstn_ner, qstn_pos
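A small usage sketch for tagging(); 'questions.txt' is a hypothetical file with one question per line, and a CoreNLP server is assumed to be running on localhost:9000.

questions, questions_ner, questions_pos = tagging('questions.txt')
print(questions_pos[0])  # (word, POS) pairs for the first question
print(questions_ner[0])  # (word, NER label) pairs for the first question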
def pos_tag_text(text):
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None

    #print("ORIGINAL TEXT ---------------",text)
    #tagged_text = tag(text)
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    tagged_text = pos_tagger.tag(text.split())

    #tagged_text = nltk.pos_tag(text)

    #print("TAGGED TEXT ---------",tagged_text)

    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in tagged_text]

    return tagged_lower_text
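A usage sketch for pos_tag_text(), again assuming a CoreNLP server on localhost:9000; the commented output is only indicative, since the tags come from the server's model.

tagged = pos_tag_text('The quick brown fox jumps over the lazy dog')
print(tagged)
# e.g. [('the', None), ('quick', 'a'), ('brown', 'a'), ('fox', 'n'), ('jumps', 'v'), ...]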
Example No. 6
def pruebas():
    pos_tagger = CoreNLPParser('http://localhost:9003', tagtype='pos')
    tag = pos_tagger.tag(
        'tengo que ir Por el contrato de compra y venta uno de los contratantes se obliga a entregar una cosa determinada y el otro a pagar por ella un precio cierto, en dinero o signo que lo represente'
        .split(' '))
    doc = nlp('considerará')
    lemlist = [tok.lemma_ for tok in doc]
    print(lemlist)
def scorer(title, speaker):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    pos_tag_list = list(pos_tagger.tag(title[0].split()))
    s = 0
    for i in pos_tag_list:
        if 'NN' in i[1] or 'NP' in i[1]:
            s += 1
    return -s + abs(title[1] - speaker[1]) * 0.3
def verb_stats(data):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    verb_count = 0
    for _, value in data.items():
        pos = list(pos_tagger.tag(value.split()))
        for _, second in pos:
            if second.startswith("V"):
                verb_count += 1
    print(verb_count)
def get_probable_title(titles):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    score = []
    for title in titles:
        pos_tag_list = list(pos_tagger.tag(title[0].split()))
        s = 0
        for i in pos_tag_list:
            if 'NN' in i[1] or 'NP' in i[1]:
                s += 1
        score.append((s, len(title[0]), title[0], title[1]))
    return max(score)
Example No. 10
def get_speaker_salutaion(text, persons, speakers):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    text = text.split()
    pos_tag_list = list(pos_tagger.tag(text))
    for i in range(len(pos_tag_list) - 1):
        if pos_tag_list[i][1] == 'NNP' and pos_tag_list[i][0] not in persons:
            if pos_tag_list[i + 1][0] in persons:
                for idx, speak in enumerate(speakers):
                    if pos_tag_list[i + 1][0] in speak:
                        # prepend the salutation (e.g. a title) to the matching speaker entry
                        speakers[idx] = pos_tag_list[i][0] + ' ' + speak
                        break
    return speakers
Example No. 11
def get_stanford_pos_tags(line):
    """
    Get part of speech tags using the Stanford POS tagger
    """

    st_pos = CoreNLPParser(url="http://localhost:9000", tagtype="pos")
    tokenized_line = cnf.TOKENIZER.tokenize(line)
    line_tagged_initial = st_pos.tag(tokenized_line)
    line_tagged_output = []

    for item in line_tagged_initial:
        line_tagged_output.append((item[0], item[1]))

    return line_tagged_output
Example No. 12
def get_entities(tweets):
    entities = []
    for atweet in tweets:
        for sent in atweet:
            pos_tagger = CoreNLPParser(url='http://localhost:9000',
                                       tagtype='pos')
            sent = list(pos_tagger.tag(normalize(sent)))
            #             sent = pos_tag(normalize(sent))
            trees = ne_chunk(sent)
            for tree in trees:
                if hasattr(tree, 'label'):
                    if tree.label() in labels:
                        entities.append(' '.join(
                            [child[0].lower() for child in tree]))
    return entities
Example No. 13
def sfNERTagger(rawText):
	'''(sf = stanford) get the raw text from a file and convert that to a list with tuples of each word with a StanFord annotated NER-tag'''
	parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
	tupleList = list(parser.tag(rawText.split()))
	# convert the list of tuples to a list of lists, so we can change tags we don't need
	NERList = [list(tuple) for tuple in tupleList]

	# change tags we don't need
	for item in NERList:
		if item[1] == 'COUNTRY': item[1] = 'COU'
		elif item[1] == 'PERSON': item[1] = 'PER'
		elif item[1] == 'CITY': item[1] = 'CIT'
		elif item[1] == 'ORGANIZATION': item[1] = 'ORG'
		else: item[1] = ''

	return NERList
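A usage sketch for sfNERTagger(); it assumes the CoreNLP server at localhost:9000 was started with the NER annotator preloaded.

raw = 'Rami Eid is studying at Stony Brook University in NY'
print(sfNERTagger(raw))
# e.g. [['Rami', 'PER'], ['Eid', 'PER'], ['is', ''], ..., ['Stony', 'ORG'], ['Brook', 'ORG'], ...]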
Example No. 14
def ed_rip(word: str):
    NV = False
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    nlpinfo = nlp(word.lower())
    ripword = nlpinfo.sentences[0].words[0].lemma
    # Recapitalization
    if re.search('^[A-Z]', word) != None:
        ripword = ripword.capitalize()
    # Return information needed to determine NV Passive.
    riptoken = nltk.word_tokenize(ripword)
    riptag = pos_tagger.tag(riptoken)[0][1]
    print(riptoken, riptag)
    if riptag.startswith('V') is True:
        NV = False
    elif riptag.startswith('N') is True:
        NV = True
    return (ripword, NV)
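ed_rip() relies on a global nlp object whose API (nlpinfo.sentences[0].words[0].lemma) looks like a stanza pipeline; a sketch under that assumption (a CoreNLP server on localhost:9000 is also required by the function):

import stanza

# Hypothetical setup for the global `nlp` used by ed_rip; the original project
# may configure it differently. stanza.download('en') may be needed first.
nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma')
print(ed_rip('Painted'))  # e.g. ('Paint', False) or ('Paint', True), depending on how the lemma is tagged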
Example No. 15
def punctuation_funct(document, listterms):
    print(
        'Definición por punctuación [:, termino seguido de coma y acabando en coma verbo'
    )
    text = str()
    definiendum = str()
    definiendums = list()
    for i in document:
        for j in listterms:
            term = j[:-1]
            if (len(i) > 1):
                if (term + ':' in i):
                    ind = i.index(':')
                    after = i[ind + 1:]
                    if (len(after) > 1 and term not in definiendums):
                        definiendum = term
                        definition = after
                        definiendums.append(definiendum)
                        print(definiendum, '---->', definition)

                elif (term + ',' in i):
                    indterm = i.index(term)
                    if (',' in i[indterm + len(term):indterm + len(term) + 1]):
                        #print('-')
                        front = i[indterm:-1]
                        pos_tagger = CoreNLPParser('http://localhost:9003',
                                                   tagtype='pos')
                        tag = pos_tagger.tag(i.split(' '))
                        for t in tag:
                            if (t[1] == 'VERB'):
                                #print(front)
                                if (t[0] in i):
                                    #print(t[0])
                                    indverb = i.index(t[0])
                                    if (i[indverb - 2] == ','):
                                        definiendum = term
                                        definition = i[indterm + len(term) +
                                                       1:indverb]
                                        if (len(definiendum) > 1
                                                and len(definition) > 1
                                                and definiendum
                                                not in definiendums):
                                            definiendums.append(definiendum)
                                            print(definiendum, '---->',
                                                  definition)
Example No. 16
def test(sent_pool, t):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    for s in sent_pool:
        result = NV_Passive(pos_tagger.tag(word_tokenize(s)))
        print(result)
        if t == 'f':
            if result[1] == False and result[2] == False:
                print('False test passed.')
            else:
                print('▇▇False test failed.')
        elif t == 'p':
            if result[1] == True and result[2] == False:
                print('Passive test passed.')
            else:
                print('▇▇Passive test failed.')
        elif t == 't':
            if result[1] == True and result[2] == True:
                print('True test passed.')
            else:
                print('▇▇True test failed.')
Example No. 17
def get_stanford_named_entities(line):
    """
	Get named entities from the Stanford NER tagger
	"""

    entity_item = ""
    entity_list = []
    previous_type = ""

    st_ner = CoreNLPParser(url="http://localhost:9000", tagtype="ner")
    line_tagged_ner = st_ner.tag(line.split())
    line_tagged = []

    # Tag the input using the tagger
    # The tagger returns in the order (entity, type). Change the
    # order to be consistent with the other taggers (type, entity)
    for item_ner in line_tagged_ner:
        if item_ner[1] != "O":
            line_tagged.append((item_ner[1], item_ner[0]))

    # Consolidate multi-word entities
    if len(line_tagged) == 1:
        entity_list = line_tagged
    elif len(line_tagged) > 1:
        # "O" items were filtered out above, so an entity ends whenever the
        # entity type changes (or at the end of the tagged line).
        for index, item in enumerate(line_tagged):
            if item[0] == previous_type:
                entity_item += " " + item[1]
            else:
                if entity_item:
                    entity_list.append((previous_type, entity_item))
                entity_item = item[1]

            if index == (len(line_tagged) - 1):
                entity_list.append((item[0], entity_item))

            previous_type = item[0]

    return entity_list
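A usage sketch for get_stanford_named_entities() (the labels depend on the server's NER model, so the output is only indicative):

line = 'Rami Eid is studying at Stony Brook University in NY'
print(get_stanford_named_entities(line))
# e.g. [('PERSON', 'Rami Eid'), ('ORGANIZATION', 'Stony Brook University'), ('STATE_OR_PROVINCE', 'NY')]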
Example No. 18
def find_title_mentioned(title_lines):
    titles = []
    for line in title_lines:
        pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        pos_tag_list = list(pos_tagger.tag(line[0].split()))
        m = len(pos_tag_list)
        preposition_list = []
        for j in range(m):
            if pos_tag_list[j][0] == 'on':
                preposition_list.append(j)
        preposition_list.append(m)
        preposition_combinations = list(combinations(preposition_list, 2))
        for j in preposition_combinations:
            word_list = [j[0] for j in pos_tag_list]
            title = ' '.join(word_list[j[0] + 1:j[1] + 1])
            x = 0
            for k in range(len(title)):
                if title[k].lower().isalpha() == True:
                    x = k
                    break
            title = title[x:]
            titles.append((title, line[1]))
        preposition_list = []
        for j in range(m):
            if pos_tag_list[j][0] == 'is':
                preposition_list.append(j)
        preposition_list.append(m)
        preposition_combinations = list(combinations(preposition_list, 2))
        for j in preposition_combinations:
            word_list = [j[0] for j in pos_tag_list]
            title = ' '.join(word_list[j[0] + 1:j[1] + 1])
            x = 0
            for k in range(len(title)):
                if title[k].lower().isalpha() == True:
                    x = k
                    break
            title = title[x:]
            titles.append((title, line[1]))
    return titles
Example No. 19
File: ner.py Project: lstrgiang/NER
    def en_ner(self):
        ner_tagger = CoreNLPParser(url=DEFAULT_LOCAL_ADDRESS + ":" +
                                   DEFAULT_EN_NER_PORT,
                                   tagtype='ner')
        for line in self.textMap.keys():
            taggedText = ner_tagger.tag((line.split()))
            try:
                for text, value in taggedText:
                    if value in ['PERSON']:
                        self.textMap[line][self.PER_KEY] += 1
                    if value in ['LOCATION']:
                        self.textMap[line][self.LOC_KEY] += 1
                    if value in ['ORGANIZATION']:
                        self.textMap[line][self.ORG_KEY] += 1
                    if value in ['TITLE']:
                        self.textMap[line][self.TIT_KEY] += 1
                    if value in ['NUMBER']:
                        continue

            except Exception as e:
                print("Unable to anotate " + str(line))
                print(e)
                return e
Example No. 20
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 16 00:28:43 2020

@author: mingxi
"""

from nltk.parse import CoreNLPParser

pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')

# extract pos tags for the Tedlium corpus
base_text = open('/Users/mingxi/Desktop/TEMP/DISS/Grammar/Base/Base_all.txt',
                 'r').read()
base_pos = []
for i in base_text.split('.'):
    base_pos.append(list(pos_tagger.tag((i + '.').split())))

base_pos2 = []
for i in base_pos:
    if i[0][0] != '.':
        for j in i:
            base_pos2.append(j[1])

out = open('/Users/mingxi/Desktop/TEMP/DISS/Grammar/base_pos.txt', 'w')
out.write('\n'.join(base_pos2))
out.close()
Example No. 21
def question_pipeline(question):

    lemmatizer = WordNetLemmatizer()
    porter = PorterStemmer()
    # stanford corenlp is expected to run at localhost:9000
    dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
    ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    corpus_dict = {}
    count = 0
    sent_text = question
    tokenized_text = nltk.word_tokenize(sent_text)
    question_types = ['who', 'when', 'where', 'Who', 'When', 'Where']
    type_of_question = [i for i in question_types if i in tokenized_text]
    lemma = [lemmatizer.lemmatize(word) for word in tokenized_text]
    stemmed = [porter.stem(word)
               for word in tokenized_text]  # Stemming the words
    # POS tagging the words to extract POS features
    tagged = nltk.pos_tag(tokenized_text)
    parse, = dep_parser.raw_parse(question)
    # Dependency parsing to parse tree based patters as features
    dependency_parse = list(parse.triples())
    # LESK to extract best sense of a word
    best_sense = [lesk(question, word) for word in tokenized_text]
    # tokenized_text_ner = nltk.word_tokenize(sent_text) #Tokenizing sentences into words
    ner_tag = ner_tagger.tag(tokenized_text)
    head_list = []
    striped_sentence = sent_text.strip(" '\"")
    if striped_sentence != "":
        dependency_parser = dep_parser.raw_parse(striped_sentence)
        parsetree = list(dependency_parser)[0]
        head_word = ""
        head_word = [
            k["word"] for k in parsetree.nodes.values() if k["head"] == 0
        ][0]
        if head_word != "":
            head_list.append([head_word])
        else:
            for i, pp in enumerate(tagged):
                if pp[1].startswith("VB"):
                    head_list.append([tokenized_text[i]])
                    break
            if head_word == "":
                for i, pp in enumerate(tagged):
                    if pp[1].startswith("NN"):
                        head_list.append([tokenized_text[i]])
                        break
    else:
        head_list.append([""])
    synonym_list = []
    hypernym_list = []
    hyponym_list = []
    meronym_list = []
    holonym_list = []
    for t in tokenized_text:
        best_sense = lesk(sent_text, t)  # LESK to extract best sense of a word
        if best_sense is not None:
            this_synonym = t
            if best_sense.lemmas()[0].name() != t:
                this_synonym = best_sense.lemmas()[0].name()
            synonym_list.append(this_synonym)
            if best_sense.hypernyms() != []:
                hypernym_list.append(
                    best_sense.hypernyms()[0].lemmas()[0].name())
            if best_sense.hyponyms() != []:
                hyponym_list.append(
                    best_sense.hyponyms()[0].lemmas()[0].name())
            if best_sense.part_meronyms() != []:
                meronym_list.append(
                    best_sense.part_meronyms()[0].lemmas()[0].name())
            if best_sense.part_holonyms() != []:
                holonym_list.append(
                    best_sense.part_holonyms()[0].lemmas()[0].name())
        else:
            synonym_list.append(t)

    count = count + 1
    corpus_dict[count] = {}
    corpus_dict[count]["sentence"] = {}
    corpus_dict[count]["sentence"] = sent_text
    corpus_dict[count]["type_of_question"] = {}
    corpus_dict[count]["type_of_question"] = type_of_question
    corpus_dict[count]["tokenized_text"] = {}
    corpus_dict[count]["tokenized_text"] = tokenized_text
    corpus_dict[count]["lemma"] = {}
    corpus_dict[count]["lemma"] = lemma
    corpus_dict[count]["stemmed"] = {}
    corpus_dict[count]["stemmed"] = stemmed
    corpus_dict[count]["tagged"] = {}
    corpus_dict[count]["tagged"] = tagged
    corpus_dict[count]["dependency_parse"] = {}
    corpus_dict[count]["dependency_parse"] = dependency_parse
    corpus_dict[count]["synonyms"] = {}
    corpus_dict[count]["synonyms"] = synonym_list
    corpus_dict[count]["hypernyms"] = {}
    corpus_dict[count]["hypernyms"] = hypernym_list
    corpus_dict[count]["hyponyms"] = {}
    corpus_dict[count]["hyponyms"] = hyponym_list
    corpus_dict[count]["meronyms"] = {}
    corpus_dict[count]["meronyms"] = meronym_list
    corpus_dict[count]["holonyms"] = {}
    corpus_dict[count]["holonyms"] = holonym_list
    corpus_dict[count]["ner_tag"] = {}
    corpus_dict[count]["ner_tag"] = dict(ner_tag)
    corpus_dict[count]["head_word"] = {}
    corpus_dict[count]["head_word"] = head_list[0]
    return corpus_dict
Example No. 22
            trees = ne_chunk(sent)
            for tree in trees:
                if hasattr(tree, 'label'):
                    if tree.label() in labels:
                        entities.append(' '.join(
                            [child[0].lower() for child in tree]))
    return entities


# To run this, you first need the CoreNLP server API running:
# go to dir - stanford-corenlp-full-2018-02-27
# then type the two lines below in a terminal as one command
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
# -preload tokenize,ssplit,pos,lemma,ner,parse,depparse -status_port 9000 -port 9000 -timeout 15000 &
# (a simple reachability check is sketched after this example)

from nltk.parse import CoreNLPParser
parser = CoreNLPParser(url='http://localhost:9000')
list(parser.parse(doc))  # for sentence tokenized doc
list(parser.raw_parse(doc))  # for non tokenized docs

# on tokenized list of words
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
list(pos_tagger.tag(doc))

ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
list(ner_tagger.tag(doc))

from nltk.parse.corenlp import CoreNLPDependencyParser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
list(dep_parser.parse(doc))
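The server started with the command above takes a few seconds to come up; a small sketch (not part of the original snippet) that polls the HTTP port before the parsers are constructed:

import time
import urllib.error
import urllib.request

def wait_for_corenlp(url='http://localhost:9000', retries=30, delay=1.0):
    """Return True once the CoreNLP server answers on its HTTP port."""
    for _ in range(retries):
        try:
            urllib.request.urlopen(url, timeout=2)
            return True
        except urllib.error.HTTPError:
            return True  # the server answered, even if with an error status
        except (urllib.error.URLError, OSError):
            time.sleep(delay)
    return False

if not wait_for_corenlp():
    raise RuntimeError('CoreNLP server is not reachable at http://localhost:9000')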
Example No. 23
def extractFeatures():
    stop_words = stopwords.words('english') + list(string.punctuation)
    file_loc='wikiTest/'
    os.chdir('/Users/ranjithreddykommidi/NLP/Project/wikiTest')
    file_names = glob.glob('*.txt')
    
    #Read every wikipedia articles given in the input fileList
    for file in file_names:
        readfile = open(file, 'r')
        text = readfile.read()
        corpus = {}
        sent_text = nltk.sent_tokenize(text)
        dep_parser = CoreNLPDependencyParser(url='http://localhost:9010')
        ner_tagger = CoreNLPParser(url='http://localhost:9010', tagtype='ner')
        count = 0
        for sentence in sent_text:
            tokenized_text = [i for i in nltk.word_tokenize(sentence.lower()) if i not in stop_words]  
            lemma = [WordNetLemmatizer().lemmatize(word) for word in tokenized_text]
            stemmed = [PorterStemmer().stem(word) for word in tokenized_text]
            tagged = nltk.pos_tag(tokenized_text)
            parse, = dep_parser.raw_parse(sentence)
            dependency_parse = list(parse.triples())
            tokenized_text_ner = nltk.word_tokenize(sentence) 
            try:
                ner_tag = ner_tagger.tag(tokenized_text_ner)
            except:
                ner_tag = ner_tagger.tag(tokenized_text)
            
            Synonym = []
            Hypernym = []
            Hyponym = []
            Meronym = []
            Holonym = []
            Heads = []
        
            for t in tokenized_text:
                Nyms = lesk(sentence, t)
                if Nyms is not None:
                    this_synonym = t
                    if Nyms.lemmas()[0].name() != t:this_synonym = Nyms.lemmas()[0].name()
                    Synonym.append(this_synonym)
                    if Nyms.hypernyms() != []:Hypernym.append(Nyms.hypernyms()[0].lemmas()[0].name())
                    if Nyms.hyponyms() != []:Hyponym.append(Nyms.hyponyms()[0].lemmas()[0].name())
                    if Nyms.part_meronyms() != []:Meronym.append(Nyms.part_meronyms()[0].lemmas()[0].name())
                    if Nyms.part_holonyms() != []:Holonym.append(Nyms.part_holonyms()[0].lemmas()[0].name())
                else:
                    Synonym.append(t)
        
            striped_sentence = sentence.strip(" '\"")
            if striped_sentence != "":
                dependency_parser = dep_parser.raw_parse(striped_sentence)
                parsetree = list(dependency_parser)[0]
                head_word = ""
                head_word = [k["word"]
                         for k in parsetree.nodes.values() if k["head"] == 0][0]
                if head_word != "":
                    Heads.append([head_word])
                else:
                    for i, pp in enumerate(tagged):
                        if pp[1].startswith("VB"):
                            Heads.append([tokenized_text[i]])
                            break
                    if head_word == "":
                        for i, pp in enumerate(tagged):
                            if pp[1].startswith("NN"):
                                Heads.append([tokenized_text[i]])
                                break
            else:
                Heads.append([""])

            count = count + 1
            corpus[count] = {}
            corpus[count]["sentence"] = {}
            corpus[count]["sentence"] = sentence
            corpus[count]["tokenized_text"] = {}
            corpus[count]["tokenized_text"] = tokenized_text
            corpus[count]["lemma"] = {}
            corpus[count]["lemma"] = lemma
            corpus[count]["stem"] = {}
            corpus[count]["stem"] = stemmed
            corpus[count]["tag"] = {}   
            corpus[count]["tag"] = tagged
            corpus[count]["dependency_parse"] = {}
            corpus[count]["dependency_parse"] = dependency_parse
            corpus[count]["synonyms"] = {}
            corpus[count]["synonyms"] = Synonym
            corpus[count]["hypernyms"] = {}
            corpus[count]["hypernyms"] = Hypernym
            corpus[count]["hyponyms"] = {}
            corpus[count]["hyponyms"] = Hyponym
            corpus[count]["meronyms"] = {}
            corpus[count]["meronyms"] = Meronym
            corpus[count]["holonyms"] = {}
            corpus[count]["holonyms"] = Holonym
            corpus[count]["ner_tag"] = {}
            corpus[count]["ner_tag"] = str(dict(ner_tag))
            corpus[count]["head_word"] = {}
            corpus[count]["head_word"] = Heads[0]
            corpus[count]["file_name"] = {}
            corpus[count]["file_name"] = file[len(file_loc):]

        outputName = file[len(file_loc):]
        json_object = json.dumps(corpus, indent = 4) 
        with open(outputName, "w") as f:
            f.write(json_object)
Example No. 24
     if i not in stop_words
 ]  # Tokenizing sentences into words
 # Lemmatizing the words to extract lemmas as features
 lemma = [lemmatizer.lemmatize(word) for word in tokenized_text]
 stemmed = [porter.stem(word)
            for word in tokenized_text]  # Stemming the words
 # POS tagging the words to extract POS features
 tagged = nltk.pos_tag(tokenized_text)
 parse, = dep_parser.raw_parse(sentence)
 # Dependency parsing to parse tree based patters as features
 dependency_parse = list(parse.triples())
 # best_sense = [lesk(sentence, word) for word in tokenized_text] #LESK to extract best sense of a word
 tokenized_text_ner = nltk.word_tokenize(
     sentence)  # Tokenizing sentences into words
 try:
     ner_tag = ner_tagger.tag(tokenized_text_ner)
 except:
     ner_tag = ner_tagger.tag(tokenized_text)
 head_list = []
 striped_sentence = sentence.strip(" '\"")
 if striped_sentence != "":
     dependency_parser = dep_parser.raw_parse(striped_sentence)
     parsetree = list(dependency_parser)[0]
     head_word = ""
     head_word = [
         k["word"] for k in parsetree.nodes.values() if k["head"] == 0
     ][0]
     if head_word != "":
         head_list.append([head_word])
     else:
         for i, pp in enumerate(tagged):
Example No. 25
    driver.execute_script("arguments[0].click();",
                          button2.find_elements_by_class_name('_ni9axhe')[1])

    all_text = ""
    #add description details
    for paragraph in button.find_elements_by_class_name('_6z3til'):
        all_text = all_text + " " + paragraph.text

    #add neighborhood info
    for paragraph in button2.find_elements_by_class_name('_6z3til'):
        all_text = all_text + " " + paragraph.text

    print("Tagging text...")
    #get all locations/cities from text on website
    the_list = [
        x[0] for x in ner_tagger.tag(all_text.split())
        if x[1] == 'LOCATION' or x[1] == 'CITY'
    ]

    #find the borough
    borough = find_borough(the_list)
    print("\n%s is in %s" % (url, borough.upper()))

    #get crime data for borough
    links_to_crime[url] = get_crime_data(borough)

#sort the listings by crimes that have occurred
ordered_listings = sorted(links_to_crime.items(), key=lambda x: x[1])
print("\n\nFINAL ORDER IS (also found in ordered.txt): ", ordered_listings)
write_listings_ordered(ordered_listings)
Example No. 26
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 16 00:28:43 2020

@author: mingxi
"""

from nltk.parse import CoreNLPParser

pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')

# extract pos tags for the Tedlium corpus
ted_text = open(
    '/Users/mingxi/Desktop/TEMP/DISS/Grammar/TEDLIUM_release2/stm_processed_final.txt',
    'r').read()
ted_pos = []
for i in ted_text.split('.'):
    ted_pos.append(list(pos_tagger.tag((i.capitalize() + '.').split())))

ted_pos2 = []
for i in ted_pos:
    if i[0][0] != '.':
        for j in i:
            ted_pos2.append(j[1])

out = open('/Users/mingxi/Desktop/TEMP/DISS/Grammar/ted_pos.txt', 'w')
out.write('\n'.join(ted_pos2))
out.close()
Example No. 27
def process(text):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    tags = pos_tagger.tag(text)
    return tags
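CoreNLPParser.tag() expects a sequence of tokens; if text is passed as a raw string, the NLTK wrapper joins its characters with spaces and tags each character separately. A hedged variant that splits the string first (the original may well be called with a token list already):

def process_tokens(text):
    pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    # split on whitespace so the server receives word tokens, not single characters
    return pos_tagger.tag(text.split())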
Example No. 28
# query = "Is Kubrick a director?"
# query = "Was Birdman the best movie in 2015?"
query = "Who directed Schindler's List?"
# query = "Who won the oscar for best actor in 2005?"
# query = "Which movie won the oscar in 2000?"
# query = "Who directed the best movie in 2010?"
# query = "Did Allen direct Mighty Aphrodite?"
finalquery = query
# queryNers = ner_tagger.tag((query.split()))
# for i in queryNers:
# 	if i[1]=="DATE":
# 		query = query.replace(i[0],"Date")
# print(query)

personName,movieName =[],[]
pos_tags=list(pos_tagger.tag(query.split()))
pos_tags.append(('last','$$$'))
print(pos_tags)
nnpTags,year = [],0
for i,k in enumerate(pos_tags):
	if pos_tags[i+1][1] == '$$$':
		break
	elif pos_tags[i][1]=='NNP' and pos_tags[i+1][1]!='NNP':
		nnpTags.append(pos_tags[i][0])
	elif pos_tags[i][1]=='NNP' and pos_tags[i+1][1]=='NNP':
		nnpTags.append(pos_tags[i+1][0])
	if pos_tags[i][1]=='CD':
		year=pos_tags[i][0]
print(nnpTags)

for i in nnpTags:
    pass  # body not shown in the source

# Dependency Parser
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
parses = dep_parser.parse('What is the airspeed of an unladen swallow ?'.split())
print([[(governor, dep, dependent)
        for governor, dep, dependent in parse.triples()] for parse in parses])
print(
    "\nExpected: [[(('What', 'WP'), 'cop', ('is', 'VBZ')), (('What', 'WP'), 'nsubj', ('airspeed', 'NN')), (('airspeed', 'NN'), 'det', ('the', 'DT')), (('airspeed', 'NN'), 'nmod', ('swallow', 'VB')), (('swallow', 'VB'), 'case', ('of', 'IN')), (('swallow', 'VB'), 'det', ('an', 'DT')), (('swallow', 'VB'), 'amod', ('unladen', 'JJ')), (('What', 'WP'), 'punct', ('?', '.'))]]\n"
)

# Tokenizer
parser = CoreNLPParser(url='http://localhost:9000')
print(list(parser.tokenize('What is the airspeed of an unladen swallow?')))
print(
    "\nExpected: ['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?']\n"
)

# POS Tagger
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
print(
    list(pos_tagger.tag(
        'What is the airspeed of an unladen swallow ?'.split())))
print(
    "\nExpected: [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]\n"
)

# NER Tagger
ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
print(
    list(
        ner_tagger.tag(
            ('Rami Eid is studying at Stony Brook University in NY'.split()))))
print(
    "\nExpected: [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')]\n"
)
Example No. 30
    def read_from_textgrid(self, file_list):
        pos_tagger = CoreNLPParser('http://localhost:9002', tagtype='pos')
        lex_table = read_lex_table(lex_table_path)
        variant_match = dict()
        for r in zip(lex_table['word_variant'], lex_table['word_standard'],
                     lex_table['word_vars'], lex_table['POS_tag']):
            # dict with variant as key.
            # if no match tag the thing
            v_pattern = compile_pattern(r[0])
            if v_pattern not in variant_match.keys():
                variant_match[v_pattern] = []
            else:
                print(v_pattern)  # add it? no
            variant_match[v_pattern].append(r)
        gehen_variants = set()
        locations = lex_table.loc[lex_table['word_lemma'] == 'gehen']
        for gehen_var in zip(locations['word_variant'],
                             locations['word_vars']):
            if "SAF5" not in gehen_var[1]:
                g_pattern = compile_pattern(gehen_var[0])
                gehen_variants.add(g_pattern)
        # for gehen_row in lex_table.loc[lex_table['word_lemma'] == 'gehen']['word_variant']:
        #     # check the word_vars
        #     if not any("SAF5" in wv for wv in lex_table.loc[lex_table['word_variant'] == gehen_row]['word_vars']):
        #         g_pattern = compile_pattern(gehen_row)
        #         gehen_variants.add(g_pattern)
        for each_file_name in file_list:
            # now combine the files of the same speakers
            print(each_file_name)
            interval_num = 0
            file_path = self.tg_path + each_file_name
            try:
                file_textgrid_obj = textgrid.TextGrid.fromFile(file_path)
            except UnicodeDecodeError:
                print(each_file_name +
                      ': the encoding is weird, not utf-8 or ansi')
                continue

            tier_list = file_textgrid_obj.tiers

            for each_tier in tier_list:
                if each_tier.name == 'SWG':  # read from swg tier
                    tier_swg = each_tier
                    intervals_swg = tier_swg.intervals

            try:
                clauses = []
                clause_annotation = []
                time_segment = dict()
                skip = False
                begin_tag = ''
                for each_annotation in intervals_swg:
                    annotation_mark = each_annotation.mark
                    beg_hms = timestamp_convert(each_annotation.minTime)
                    if not annotation_mark.strip(): continue
                    punct = [',', '.', '!', '?']  # maybe just . ! ?
                    tokens = annotation_mark.split()
                    time_segment[beg_hms] = tokens
                    for token in tokens:
                        if any(p in token for p in punct
                               ):  # function that turn segments into clauses
                            if all(c in string.punctuation for c in token
                                   ):  # this is for token like ... --- and ???
                                if not clause_annotation:
                                    time_stamp = beg_hms
                                clause_annotation.append(token)
                                if len(
                                        token
                                ) > 3 or token in punct:  # why do I do this again, still don't know
                                    clause_annotation.append(time_stamp)
                                    clauses.append(clause_annotation)
                                    clause_annotation = []
                                continue

                            word_punct_split = re.findall(
                                r"[^\w\d\s,.!?]*\w+[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*|[^\w\s]",
                                token,
                                re.UNICODE)  # separate word with punctuation

                            for wp in word_punct_split:  # maybe to split annotations into clauses
                                if not clause_annotation:
                                    time_stamp = beg_hms
                                clause_annotation.append(wp)
                                if all(c in punct for c in wp):
                                    clause_annotation.append(time_stamp)
                                    clauses.append(clause_annotation)
                                    clause_annotation = []
                        else:
                            if not clause_annotation:
                                time_stamp = beg_hms
                            clause_annotation.append(token)
                for cl in clauses:
                    if '[ANT]' in cl or '[REL]' in cl:
                        # print("clause", cl)
                        beg_hms = cl[-1]
                        # print("time", beg_hms)
                        cl = cl[:-1]
                        # print("cl", cl)
                        if cl[0] not in time_segment[
                                beg_hms]:  # closer  remaining is the punctuation problem
                            segment_annotation = []
                            for token in time_segment[beg_hms]:
                                segment_annotation += re.findall(
                                    r"[^\w\d\s,.!?]*\w+[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*\w*[^\w\d\s,.!?]*|[^\w\s]",
                                    token, re.UNICODE)
                            if cl[0] not in segment_annotation:
                                print(segment_annotation)
                                print(cl[0])
                        else:
                            segment_annotation = time_segment[beg_hms]
                        sym_seq = segment_annotation.index(cl[0]) + 1
                        words_std = []
                        ddm_tags = []
                        pos_sent = []

                        # get ddm
                        for i, word in enumerate(cl):
                            if word:  # empty word check
                                # match w with word_variant
                                std_list = set()
                                ddm_list = set()
                                pos_list = set()
                                no_match = True
                                rel = False
                                # check for var: REL
                                if i + 1 < len(
                                        cl):  # make sure next word exist
                                    w_next = cl[i + 1]
                                    if "[REL]" in w_next:
                                        rel = True
                                        if "wo" in word:
                                            rel_var = " RELd"
                                        elif "als" in word or word.startswith(
                                                "d") or word.startswith(
                                                    "wel") or word.startswith(
                                                        "jed"):
                                            rel_var = " RELs"
                                        elif ("was" in word) or (
                                                "wie" in word) or ("wer"
                                                                   in word):
                                            rel_var = " RLOs"
                                        else:
                                            rel_var = " UNK"
                                for p in variant_match.keys():
                                    if p.search(word) is not None:  # .lower()
                                        no_match = False
                                        for values in variant_match[p]:
                                            swg = values[0].replace("*", "")
                                            # rum[ge]draat
                                            if "ge" in swg and "ge" not in word:
                                                swg = swg.replace(
                                                    "ge", "g"
                                                )  # for gespielt gspielt
                                            std = values[1].replace("*", "")
                                            std_list.add(std)
                                            if isinstance(
                                                    values[2], float
                                            ) and math.isnan(
                                                    values[2]
                                            ):  # check for empty var_code
                                                pass  # do nothing
                                            else:
                                                ddm_list.add(
                                                    values[2])  # should be set
                                            if isinstance(
                                                    values[3],
                                                    float) and math.isnan(
                                                        values[3]):
                                                pos_list.add('*')
                                            else:
                                                pos_list.add(values[3])
                                if no_match:
                                    standard = word
                                    ddm = "*"
                                    pos = pos_tagger.tag([word])[0][1]
                                    if "$" in pos:
                                        pos = "*"
                                else:
                                    standard = " ".join(std_list)
                                    ddm = " ".join(str(d) for d in ddm_list)
                                    if any("SAF5" in d for d in ddm_list):
                                        for g_pattern in gehen_variants:
                                            if g_pattern.search(
                                                    word) is not None:
                                                print(ddm)
                                                print(word)
                                                print(
                                                    "!"
                                                )  # gegang* [ge]gang* will be taged as SAF5
                                                # k as prefix
                                                ddm = ddm.replace("SAF5d", "")
                                                ddm = ddm.replace("SAF5s", "")
                                                print(ddm)
                                    pos = " ".join(str(p) for p in pos_list)
                                if rel:
                                    if ddm != "*":
                                        ddm = ddm + rel_var
                                    else:
                                        ddm = rel_var
                                    ddm = ddm.strip()
                                words_std.append(standard)
                                ddm_tags.append(ddm)
                                pos_sent.append(pos)
                        # columns
                        self.output_as_csv(
                            each_file_name[each_file_name.rfind("_") + 1:-9],
                            beg_hms, sym_seq, " ".join(cl), " ".join(ddm_tags),
                            " ".join(pos_sent))
            except AttributeError as e:
                print(each_file_name +
                      ': tier words is empty or does not exist ')
                traceback.print_tb(e.__traceback__)