Python tree2conlltags示例，nltk.chunk.tree2conlltags Python示例

示例#1

0

显示文件

文件： Chunker.py 项目： ilius/hazm

def tree2brackets(tree):
    """Render a chunk tree as a bracketed string.

    The tree is flattened with ``tree2conlltags`` into ``(word, pos, iob)``
    triples.  Each chunk is written as ``[word word ... LABEL]`` and words
    outside any chunk are written unchanged; everything is separated by
    single spaces.

    Parameters
    ----------
    tree: chunk tree accepted by ``tree2conlltags``

    Returns
    -------
    str, the bracketed text without leading or trailing whitespace
    """
    parts = []  # output fragments, joined once at the end
    label = ''  # label of the chunk currently open ('' when none is open)
    for item in tree2conlltags(tree):
        word, iob = item[0], item[2]
        # A new chunk ('B-...') or an outside token ('O') closes the open chunk.
        if iob[0] in {'B', 'O'} and label:
            parts.append(label + '] ')
            label = ''

        if iob[0] == 'B':
            label = iob.split('-')[1]
            parts.append('[')
        parts.append(word + ' ')

    # Close a chunk that runs up to the last token.
    if label:
        parts.append(label + '] ')

    return ''.join(parts).strip()

示例#2

0

显示文件

文件： keyphrase_extraction.py 项目： titipata/keyphrase_extraction

def generate_candidate(texts, method='word', remove_punctuation=False):
    """
    Generate candidate words or phrases from given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidates, either 'word' or 'phrase'
    remove_punctuation: bool, if True every match of ``punct_re`` in a
        sentence is replaced by a space before the sentence is tokenized

    Returns
    -------
    candidates: list, candidate words (method='word') or space-joined
        phrases (method='phrase'); empty when ``method`` is anything else
    """
    candidates = []

    # tokenize texts to list of sentences of lower-cased words
    tokenized_sentences = []
    for sentence in sent_tokenize(texts):
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
        tokenized_sentences.append([w.lower() for w in word_tokenize(sentence)])
    tagged_sentences = pos_tag_sents(tokenized_sentences)  # POS tagging

    if method == 'word':
        tags = {'JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'}
        for word, tag in chain.from_iterable(tagged_sentences):
            # words were already lower-cased above
            if tag in tags and word not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        # Group sentence by sentence: grouping the chained tags of all
        # sentences would glue a phrase that ends one sentence to a phrase
        # that starts the next one (no 'O' token in between).
        for tagged_sentence in tagged_sentences:
            iob_tags = tree2conlltags(chunker.parse(tagged_sentence))
            for in_chunk, group in groupby(iob_tags, lambda tag: tag[2] != 'O'):
                candidate = ' '.join([word for (word, pos, chunk) in group])
                if in_chunk and candidate not in stop_words:
                    candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' in method")
    return candidates

示例#3

0

显示文件

文件： transformers.py 项目： yokeyong/atap

 def extract_keyphrases(self, document):
     """
     Yield the lower-cased key phrases of a document.

     ``document`` is iterated as groups of sentences. Each sentence is
     normalized with ``self.normalize``, parsed with ``self.chunker`` and
     flattened to tagged tokens; every run of consecutive tokens whose
     last field is not 'O' is joined with spaces into one phrase.
     """
     def inside_chunk(term):
         return term[-1] != 'O'

     for sentence_group in document:
         for sentence in sentence_group:
             normalized = self.normalize(sentence)
             if not normalized:
                 continue
             tagged = tree2conlltags(self.chunker.parse(normalized))
             found = []
             for is_phrase, run in groupby(tagged, inside_chunk):
                 if is_phrase:
                     joined = " ".join(word for word, pos, chunk in run)
                     found.append(joined.lower())
             for phrase in found:
                 yield phrase

示例#4

0

显示文件

文件： keyphrase.py 项目： bbengfort/minke

def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extracts key chunks based on a grammar for a list of tokenized sentences.
    If the sentences are already tokenized and tagged, pass in: tagged=True

    Extra keyword arguments are forwarded to ``Normalizer``.
    """
    normalizer = Normalizer(**kwargs)
    chunker    = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Parse with the chunker if we have a tagged sentence
        if not sent: continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract candidate phrases from our parsed chunks.
        # The key function takes the whole (word, pos, chunk) triple:
        # tuple-unpacking lambda parameters are a syntax error on Python 3.
        chunks = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]
        # NOTE(review): the visible excerpt ends here; the phrases are not
        # yet returned or yielded in the code shown.

示例#5

0

显示文件

文件： 01-ntlk-pyramid.py 项目： SamVanhoutte/python-musings

from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conllstr, conllstr2tree, conlltags2tree, tree2conlltags
import nltk

# Demo: tokenize, POS-tag and named-entity-chunk one sentence.
text = "Fly me from Seattle to Tampa"
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)
ner_tree = ne_chunk(tagged_tokens)
print(ner_tree)
# Flatten the chunk tree into CoNLL-style tags and print them.
iob_tagged = tree2conlltags(ner_tree)
print(iob_tagged)

示例#6

0

显示文件

def preprocess(sent):
    """Tokenize ``sent`` with NLTK and return the POS-tagged tokens."""
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)


# POS-tag the two example texts (ex1 / ex2 are defined outside this excerpt).
sent1 = preprocess(ex1)
sent2 = preprocess(ex2)

# Chunk both tagged sentences with the parser `cp` (defined outside this
# excerpt) and print the resulting trees.
cs1 = cp.parse(sent1)
cs2 = cp.parse(sent2)
print(cs1)
print(cs2)

# Flatten each chunk tree into CoNLL-style tags.
iob_tagged1 = tree2conlltags(cs1)
pprint(iob_tagged1)

iob_tagged2 = tree2conlltags(cs2)
pprint(iob_tagged2)

# NLTK named-entity chunking of the same texts, tokenized and tagged again here.
ne_tree1 = nltk.ne_chunk(pos_tag(word_tokenize(ex1)))
print(ne_tree1)

ne_tree2 = nltk.ne_chunk(pos_tag(word_tokenize(ex2)))
print(ne_tree2)

# Run ex1 through `nlp` (defined outside this excerpt; presumably a spaCy
# pipeline, TODO confirm) and print each entity's text and label.
doc1 = nlp(ex1)
print('Named Entities for scentence1:')
pprint([(X.text, X.label_) for X in doc1.ents])

示例#7

0

显示文件

 def chunkparser(self, pattern='NP: {<DT>?<JJ>*<NN>}'):
     """Chunk ``self.sent`` with a regexp grammar and store the result.

     ``pattern`` is the grammar handed to ``nltk.RegexpParser``. The parsed
     tree is flattened with ``tree2conlltags`` and kept in
     ``self.iob_tagged``; nothing is returned.
     """
     parsed_tree = nltk.RegexpParser(pattern).parse(self.sent)
     self.iob_tagged = tree2conlltags(parsed_tree)

示例#8

0

显示文件

文件： ner_nltk.py 项目： Sarfarazsajjad/Plantagg-NLP

def fn_preprocess(art):
    """Tokenize the text ``art`` with NLTK and return its POS-tagged tokens."""
    tokens = nltk.word_tokenize(art)
    return nltk.pos_tag(tokens)


# POS-tag the input text (`article` is defined outside this excerpt).
art_processed = fn_preprocess(article)

# Named-entity chunk tree; only referenced by the commented-out code below.
results = ne_chunk(art_processed)

# for x in str(results).split('\n'):
#     if '/NN' in x:
#         print(x)

# Chunk noun phrases: optional determiner, any adjectives, then one noun.
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(art_processed)
# print(cs)

# Flatten the chunk tree into (word, pos, tag) triples.
iob_tagged = tree2conlltags(cs)
# pprint(iob_tagged)

# Collect the third field of every triple.
# NOTE(review): no filtering is applied, so the length printed below equals
# the number of tokens, including those tagged 'O'.
namedEntities = []
for word, pos, ner in iob_tagged:
    namedEntities.append(ner)
#     print(word, pos, ner)

print('Named Entites in Document')
print(len(namedEntities))

示例#9

0

显示文件

文件： crfsuite.py 项目： gergoszita/text-classification-benchmark

    sent = ""
    labels = []
    # try:
    for word in tsvin:
        word = word.split("\t")
        word = [w.replace("\n", "") for w in word]

        if word[0] == '':
            splitted = sent.split(" ")
            splitted = [str.strip(w) for w in splitted]
            # splitted = [re.sub('[^A-Za-z0-9]+', '', w) for w in splitted]
            splitted = [w for w in splitted if len(w) >= 1]
            # print splitted
            X_test_final.append(
                sent2features((tree2conlltags(ne_chunk(pos_tag(splitted))))))
            y_test_final.append(labels)
            sent = ""
            labels = []
        else:
            # if len(word[0].split(" ")) > 1:
            # print word[0].split(" ")
            sent = sent + " " + str.strip(word[0])
            labels.append(word[1])
    # except:
    #     print

# with open(CONST_WIKI_ALL,'rb') as tsvin, open('new.csv', 'wb') as csvout:
#     tsvin = csv.reader(tsvin, delimiter='\t')
#     for word in tsvin:
#         print word

示例#10

0

显示文件

def _strip_leading_determiner(phrase):
    """Drop a leading 'the ', 'this ', 'no ', 'an ' or 'a ' (tried in that order)."""
    for prefix in ("the ", "this ", "no ", "an ", "a "):
        if phrase.startswith(prefix):
            phrase = phrase[len(prefix):]
    return phrase


def find_elements(text,
                  full=False,
                  trim=True,
                  low_trim_limit=2,
                  high_trim_limit=2000):
    """Collect candidate elements (nouns, noun phrases, entities) from ``text``.

    Parameters
    ----------
    text: str, the text to analyse
    full: bool, if True collect every token tagged NN, NNS, NNP, NNPS or
        PRP; otherwise collect PRP tokens, noun-phrase chunks and the
        PERSON / ORG / PRODUCT / LOC / FAC entities reported by ``nlp``
    trim: bool, if True count every element in the lower-cased text with
        ``my_count`` and keep only those whose count is greater than
        ``low_trim_limit`` and at most ``high_trim_limit``
    low_trim_limit: int, exclusive lower bound on the count
    high_trim_limit: int, inclusive upper bound on the count

    Returns
    -------
    dict mapping element -> count (every count is 0 when ``trim`` is False).
    The dict is also pretty-printed before it is returned.
    """
    sent = nltk.pos_tag(nltk.word_tokenize(text))
    elements = dict()
    if full:  # do all nouns
        noun_like_tags = {"NN", "NNS", "NNP", "NNPS", "PRP"}
        for word, tag in sent:
            if tag in noun_like_tags:
                elements[word.lower()] = 0
    else:  # do only NE + extra
        for word, tag in sent:
            if tag == "PRP":
                elements[word.lower()] = 0

        cp = nltk.RegexpParser('NP: {<DT>?<JJ>*<NN>}')
        iob_tagged = tree2conlltags(cp.parse(sent))

        def record_phrase(words):
            # Store the collected noun phrase without its leading determiner.
            phrase = _strip_leading_determiner(" ".join(words))
            elements[phrase.rstrip().lower()] = 0

        phrase_words = []
        for token in iob_tagged:
            if token[2] == 'B-NP' or token[2] == 'I-NP':
                phrase_words.append(token[0].lower())
            if token[2] == 'O' and phrase_words:
                record_phrase(phrase_words)
                phrase_words = []
        # A noun phrase that ends the text has no following 'O' token and
        # would otherwise be lost.
        if phrase_words:
            record_phrase(phrase_words)

        entity_labels = {'PERSON', 'ORG', 'PRODUCT', 'LOC', 'FAC'}
        for ent in nlp(text).ents:
            if ent.label_ in entity_labels:
                # Strip only a leading determiner: removing 'a ', 'an ', ...
                # anywhere in the name corrupts names that merely contain
                # those character sequences.
                name = _strip_leading_determiner(ent.text.lower())
                elements[name.replace('\n', '')] = 0

    if trim:
        text = text.lower()
        for x in elements:
            elements[x] = my_count(text, x)
        elements = {
            k: v
            for k, v in elements.items()
            if low_trim_limit < v <= high_trim_limit
        }

    pprint(elements)
    return elements

示例#11

0

显示文件

文件： read_emails.py 项目： siva600/chatbot-1

 def __init__(self, train_sents, *args, **kwargs):
     """Train ``self.tagger`` from chunked training sentences.

     Each sentence is flattened with ``tree2conlltags`` and reshaped into
     ``((word, pos), chunk_tag)`` pairs before being handed, together with
     any extra arguments, to ``ConsecutiveNPChunkTagger.train``.
     """
     flattened = [tree2conlltags(sent) for sent in train_sents]
     train_chunks = []
     for triples in flattened:
         train_chunks.append([((word, pos), chunk_tag)
                              for (word, pos, chunk_tag) in triples])
     self.tagger = ConsecutiveNPChunkTagger.train(train_chunks, *args, **kwargs)

示例#12

0

显示文件

文件： NLP_DNER.py 项目： exchhattu/BiomedicaLorHealthCare-NLP

 def tag_bio(self):
   """Print the CoNLL-style tags of the NE-chunked ``self._ts_abs_word_tokens``."""
   pos_tagged = pos_tag(self._ts_abs_word_tokens)
   ne_tree = ne_chunk(pos_tagged)
   print(tree2conlltags(ne_tree))

示例#13

0

显示文件

文件： nltk_ner.py 项目： sambit9238/name_entity_recognizer

from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags
# Sample text; the literal keeps its newlines and leading indentation.
sentence = '''
        Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very
        close to the Manhattan Bridge which is visible from the window.
        '''
# Tokenize, POS-tag and NE-chunk the text, then print the flattened CoNLL-style tags.
print(tree2conlltags(ne_chunk(pos_tag(word_tokenize(sentence)))))

示例#14

0

显示文件

文件： CRFChunkParser.py 项目： VitalyRomanov/LanguageTools

        }

        return list(features.values())
 
 
if __name__ == "__main__":
    # Script entry point: read a CoNLL chunk file given as the first
    # command-line argument and evaluate the CRF chunker on a held-out part.

    # transformed = [list(map(lambda x: ((x[0], x[1]), x[2]), s)) for s in chunked_sents]
    # random.shuffle(transformed)
    # train_sents = transformed[:int(len(transformed) * 0.9)]
    # test_sents = transformed[int(len(transformed) * 0.9 + 1):]

    # from nltk.stem.snowball import SnowballStemmer

    file_path = sys.argv[1]
    # Sentences are separated by blank lines; close the file as soon as it is read.
    with open(file_path) as conll_file:
        raw_sentences = conll_file.read().strip().split("\n\n")
    chunked_sents = [tree2conlltags(chunk.conllstr2tree(s)) for s in raw_sentences]
    random.shuffle(chunked_sents)
    # No training data is passed; the 70% training split is commented out.
    train_sents = []#chunked_sents[:int(len(chunked_sents) * 0.7)]
    test_sents = chunked_sents[int(len(chunked_sents) * 0.7 + 1):]

    ### CRF Chunker

    chunker = CRFChunkParser(chunked_sents=train_sents, model_file="russian_chunker.crf")
    print(chunker.evaluate([conlltags2tree(s) for s in test_sents]))


    # from nltk.tag.crf import CRFTagger
    # chunker = CRFTagger(feature_func=feature_detector)

    # chunker.set_model_file("russian_chunker.crf")
    # chunker.train(train_sents, "russian_chunker.crf")

示例#15

0

显示文件

    for x in list1:
        # check if exists in unique_list or not
        if x not in unique_list:
            unique_list.append(x)
    # print list
    for x in unique_list:
        print(x)


##### Processing on EBS Input Data file to extract dump of unique keywords

# Tag every description in the column; one list of (word, pos, tag)
# triples per row, in row order.
iob_tagged = []
for description in df['PMHD_TA004_SYS_T_DES']:
    ne_tree = ne_chunk(pos_tag(word_tokenize(description)))
    iob_tagged.append(tree2conlltags(ne_tree))

# Split the triples into three parallel lists of lists:
# s1 holds the words, s2 the POS tags, s3 the chunk tags.
s1 = [[token[0] for token in sentence] for sentence in iob_tagged]
s2 = [[token[1] for token in sentence] for sentence in iob_tagged]
s3 = [[token[2] for token in sentence] for sentence in iob_tagged]

s01 = []
for i in range(0, len(s1)):

示例#16

0

显示文件

文件： mynltk.py 项目： samurainote/NLP_Preprocessing_Toolkit

 def chunking(sentence):
     """Round-trip a chunk tree through CoNLL tags.

     ``sentence`` is flattened with ``tree2conlltags`` and the tags are
     turned back into a tree with ``conlltags2tree``, which is returned.
     """
     from nltk.chunk import conlltags2tree, tree2conlltags
     return conlltags2tree(tree2conlltags(sentence))

示例#17

0

显示文件

文件： Doc-Vocab.py 项目： AdityaPrithvinath/MSTR

def namedEntityRecognition(pos):
    """NE-chunk the POS-tagged tokens ``pos`` and return the flattened CoNLL-style tags."""
    return tree2conlltags(ne_chunk(pos))

示例#18

0

显示文件

文件： Clean_hypothesis_extraction.py 项目： felipemontano10/NLP-Causality-Extraction

def rm_breaks(text, beta):
    """Clean ``text`` and keep only the sentences that pass a chunk-based filter.

    The text is lower-cased and stripped of commas, DOI-like patterns,
    numbers, links, line breaks and hyphenated word breaks. It is then
    POS-tagged with ``preprocess`` and noun-phrase chunked. Tokens are
    collected until a '.' token; the collected sentence is kept when it
    contains the token 'hypothesis', dropped when it contains 'jstor', and
    otherwise kept when ``Owords / wordCount >= beta``.

    Parameters
    ----------
    text: str, raw input text
    beta: number, minimum share of 'O'-tagged tokens for a sentence that
        mentions neither 'hypothesis' nor 'jstor' to be kept

    Returns
    -------
    str with the kept tokens joined by single spaces, or the empty list
    ``[]`` when no sentence was kept
    """

    #Convert to lower
    text = text.lower()
    #Remove commas
    text = text.replace(',', '')
    #Remove DOIs
    text = re.sub(r'\d+\.\d+/\w+', '', text)
    text = re.sub(r'doi:*', '', text)
    #Replace 'hypothesis 1' with 'h1'
    text = text.replace('hypotheses', 'hypothesis')
    text = re.sub(r'hypothesis (?=\d+)', 'h', text)
    #Remove numbers that dont have a character immediately before them (since H0 indicates hypothesis)
    text = re.sub(r'\W+\d+', '', text)
    text = re.sub(r'\d{2,4}', '', text)
    #Replace jstor link with 'jstor' placeholder, then delete
    text = re.sub(r'https?://.+', 'jstor.', text)
    # text = re.sub(r'\S+\.jstor\.\S+', 'jstor.', text))
    text = re.sub(r'\.{2,}|:', '', text)
    text = re.sub(r'this\scontent.+', '', text)
    #Delete jstor placeholder lines, unless that would leave nothing at all
    check = re.sub(r'.*jstor.*', '', text)
    if check != '':
        text = check
    #Remove word interruptions
    text = re.sub(r'-\s*\n\s*', '', text)
    #Remove line breaks
    text = re.sub(r'\n', '', text)
    #Do NER and remove sentences with too many named entities
    sent = preprocess(text)
    pattern = 'NP: {<DT>?<JJ>*<NN>}'
    cp = nltk.RegexpParser(pattern)
    cs = cp.parse(sent)
    iob_tagged = tree2conlltags(cs)
    Owords = 0      # 'O'-tagged tokens counted since the last reset
    wordCount = 0   # tokens seen since the last reset (current one included)
    maintext = []   # tokens of all kept sentences
    holder = []     # tokens of the sentence being collected
    for i in iob_tagged:
        holder.append(i[0])
        wordCount += 1
        if i[0] == '.':
            # NOTE(review): the 'O' tag of the current token is only counted
            # after this block, so it is not part of this score.
            score = Owords / wordCount
            if 'hypothesis' in holder:
                maintext += holder
                holder.clear()
            elif 'jstor' in holder:
                Owords = 0
                wordCount = 0
                holder.clear()
                continue
            elif score >= beta:
                maintext += holder
            Owords = 0
            wordCount = 0
            holder.clear()
        if i[2] == 'O':
            Owords += 1
    if maintext:
        # One join instead of repeated pairwise string concatenation.
        maintext = ' '.join(maintext)
    return maintext

示例#19

0

显示文件

文件： sen_pro.py 项目： Lightenit/styleTransfer

def senten_tag(sentence):
    """Tokenize, POS-tag and NE-chunk ``sentence``; return the flattened CoNLL-style tags."""
    tagged_tokens = pos_tag(word_tokenize(sentence))
    return tree2conlltags(ne_chunk(tagged_tokens))

示例#20

0

显示文件

        if(data is None or data['message'] is None or data['message'] == ""):
            continue
        msg = str(data['message'])
        msg = msg.strip()
        if(len(msg) == 0 or  english_ch.search(msg) == None):
            continue
        tokens = nltk.word_tokenize(msg)
        tokens = remove_blanc(tokens)
        tokens = remove_special(tokens)

        flag = "FALSE"
        for t in tokens:
            if(d.check(t)):
                flag = "TRUE"
                break
        x = tree2conlltags(ne_chunk(pos_tag(word_tokenize(msg))))
        nerf  = "N"
        for i in x:
            if(len(i) > 2 and not ("B-" in i[2] or "I-" in i[2] )):
                nerf  = "S"
                break
        if(flag == "TRUE" or nerf == "N"):
            if(date[0] not in transactions_date_wise):
                transactions_date_wise[date[0]] = 0
            transactions_date_wise[date[0]] = transactions_date_wise[date[0]] + 1
            textual_transactions = textual_transactions + 1
    except:
        continue        
f.close()

outputfile.write("DATE #TEXTUAL_TRANSACTIONS \n")

示例#21

0

显示文件

文件： parser.py 项目： zemlni/GAT

def IOB_Tagging(t):
    """Return the chunk tree ``t`` flattened by ``tree2conlltags``."""
    return tree2conlltags(t)

示例#22

0

显示文件

文件： resume_parser.py 项目： sekarkrishnan89/Resume-Parser

from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.corpus import stopwords
import os
import re

filepath = "Enter file path"
fin = open(filepath, 'r')
fout =  open('out.txt', 'w' )
text = fin.read()
text = re.sub(r'[^\w\s]',' ',text)        
sentence=sent_tokenize(text)
for x in sentence:
    words=word_tokenize(x)
    tagged_pos=pos_tag(words)
    namedEnt = nltk.ne_chunk(tagged_pos, binary=False)
    ne_tagged=(tree2conlltags(namedEnt))
    for ne in ne_tagged:
        ner=(ne[-1])
        ner1=str(ner)
    for tag in range(3):
        if tag == 0:
            gram = ("Nametag: {(<VBP>).*?(<JJ>?<NNP>+|<NNP>+)}")
        if tag == 1:
            gram = ("Datetag: {<CD><CD|JJ><CD>}")
        if tag == 2:
            gram = ("Qualificationtag: {<NNP>+<IN.*><NNP>} ")
            
        chunkParser = nltk.RegexpParser(gram)
        tree = chunkParser.parse(tagged_pos)
        iob_tagged=(tree2conlltags(tree))
        for iob in iob_tagged:

示例#23

0

显示文件

文件： NERCRF.py 项目： badri-thinker/textextract

# Sample utterances; only `ts` is used below.
ts = " Agent Name david member number 45678"
ts3 = "123467 is  davids member no"
ts2 = " mark and john are working at Google"

# Pair the POS tags of `ts` with a string built from the sign of
# `model.encode(...)`: 'p' where the value is >= 0, 'n' elsewhere.
# NOTE(review): `ts` is a str, so " ".join(ts) puts a space between every
# character - confirm this is the intended input to model.encode.
test = (nltk.pos_tag(word_tokenize(ts)),
        np.where(model.encode([" ".join(ts)]) >= 0, 'p',
                 'n').astype('|S1').tostring().decode('utf-8'))
print("POS tags output by nltk")
print(test[0])
#test = [('member',) ,('number', ),('is',), ('9860300',)]
# Predict tags with the CRF model `fcrf` (defined outside this excerpt).
X_test = extract_features(test)
ans = fcrf.predict_single(X_test)
print(ts)
print("NER tags recognized by CRF")
print(ans)
# compare ner tags output by stanford ner, nltk and spaCy
tokenized_text = word_tokenize(ts)
ner_st = nerst.tag(tokenized_text)
print("stanford ner tags")
print(ner_st)
pos_nltk = nltk.pos_tag(tokenized_text)
print("nltk tags")
print(pos_nltk)
print(tree2conlltags(ne_chunk(pos_nltk)))
nlp = spacy.load('en_core_web_sm')
doc = nlp(ts)
print("spacy ner tags")
print([(X.text, X.label_) for X in doc.ents])

# Stop the script here.
exit()

示例#24

0

显示文件

# Sample inputs. `text` is assigned twice; only the last value is used below.
text1 = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
text2="Please advise on the options the deceased clients wife has in relation to this pension" \
     "   She wishes to exercise ARF option if available "
text="Hi I was trying to register online but I was n t recognised " \
     "  My  France number is 4824461      " \
     "Looking to register on Pension Planet Robert Manning" \
     "   but Irish Ronnie Gardner website ca n t find my details        " \
     "Richard Wade "
text = 'How can I pay my car renewal'
tokenized_text = word_tokenize(text)
# `st` and `post` are taggers defined outside this excerpt.
ner_st = st.tag(tokenized_text)
print(ner_st)

pos_st = post.tag(tokenized_text)
print(pos_st)
# NOTE(review): this exit() stops the script, so nothing below it runs.
exit()
pos_nltk = nltk.pos_tag(tokenized_text)
print(pos_nltk)

blob = TextBlob(text)
print(blob.tags)
print("tree stanford\n")
print("type of chunk", type(ne_chunk(pos_st)))

# The value printed here is the number of tagged tokens, not a type.
print("type of tree", len(tree2conlltags(ne_chunk(pos_st))))
print("tree nltk\n")
print(tree2conlltags(ne_chunk(pos_nltk)))
print("tree blob\n")
print(ne_chunk(pos_nltk))
print(tree2conlltags(ne_chunk(blob.tags)))
exit()

示例#25

0

显示文件

文件： Putting Classifier and NER Together.py 项目： jesse-michael-han/arxivDownload

# Prepare and print metrics for the normal metrics
# First call: sample index 119 with printed output; second call: every test sample.
OO = prepare_for_metrics(119, chunker, data_set=test_samples, print_output=True)
y_true, predicted = prepare_for_metrics(range(len(test_samples)), chunker)
print(metrics.classification_report(y_true, predicted))
# -

# An example of a user fed definition
chunked = chunker.parse(pos_tag(word_tokenize(Def[0])))
# Take the first subtree of the parse and join the first field of its leaves.
D =list(filter(lambda x: isinstance(x, nltk.tree.Tree), chunked))[0]
' '.join([d[0] for d in D])

# Extract the text of every 'para' tag of one article and classify each
# paragraph with `clf` after vectorizing it with `count_vect`.
art = px.DefinitionsXML('tests/latexmled_files/1501.06563.xml')
p_lst = [px.recutext_xml(p) for p in art.tag_list(tag='para')] 
p_vec = count_vect.transform(p_lst)
preds = clf.predict(p_vec)

# Print index, prediction and the first 100 characters of each paragraph.
for k,p in enumerate(p_lst):
    print(k,preds[k],p[:100])
    print('------')

# Chunk paragraph 63 and print each token next to its chunk tag.
chunk = tree2conlltags(chunker.parse(pos_tag(word_tokenize(p_lst[63]))))
for tok in chunk:
    print('{:15} {:>10} '.format(tok[0], tok[2]))

# Persist the trained chunker.
with open('../PickleJar/chunker.pickle', 'wb') as chunker_f:
    pickle.dump(chunker, chunker_f)

# Persist the vectorizer. The original call had no object argument (a
# syntax error).
# NOTE(review): `count_vect` is assumed to be the intended object because it
# is the vectorizer used above and the target file is vectorizer.pickle - confirm.
with open('data/vectorizer.pickle', 'wb') as token_f:
    pickle.dump(count_vect, token_f)

示例#26

0

显示文件

文件： assign_features.py 项目： xiaoyue10131748/data_policy_analyzer

def write(filename, predictor):
    """Append per-token feature rows for every sentence of ``filename`` to a TSV file.

    For each sentence returned by ``read_sentence(filename)`` the tokens are
    POS-tagged, lemmatized (as verbs), stemmed and enriched with the values
    returned by ``parse_consituency_tree``. One row per token plus one
    blank separator row is appended, without header, to
    ``<filename up to '.tsv'>_feature_v1.tsv``.

    Parameters
    ----------
    filename: str, path of the input file
    predictor: object passed through to ``parse_consituency_tree``
    """
    # Built once and reused for every sentence.
    wordnet_lemmatizer = WordNetLemmatizer()
    lancaster = LancasterStemmer()

    to_file = filename.split(".tsv")[0]
    to_file1 = to_file + "_feature_v1" + ".tsv"

    for s in read_sentence(filename):
        sentence_list, label_list = process_sentence(s)
        # The result is unused here; the call is kept in case mergeWords
        # has side effects.
        sen = mergeWords(sentence_list)

        ##### assign pos ##############################################
        # each triple contains word, pos, ner-label; only the pos is kept
        triples = tree2conlltags(ne_chunk(pos_tag(sentence_list)))
        pos_list = [item[1] for item in triples]

        ################ get words lemma and stem #####################
        lemma_list = [
            wordnet_lemmatizer.lemmatize(word, pos="v")
            for word in sentence_list
        ]
        stem_list = [lancaster.stem(word) for word in sentence_list]

        ##### assign consituency parent pos ###########################
        pos_parent_list, right_sublings_list, chunk_position, left_sublings_list = parse_consituency_tree(
            sentence_list, predictor)

        # Append one blank row that separates this sentence from the next.
        sentence_list.append(" ")
        label_list.append(" ")
        pos_list.append(" ")
        pos_parent_list.append(" ")
        right_sublings_list.append(" ")
        chunk_position.append(" ")
        lemma_list.append(" ")
        stem_list.append(" ")
        left_sublings_list.append(" ")

        # Insertion order defines the column order of the header-less output.
        data = {}
        data["word"] = sentence_list
        data["label"] = label_list
        data["pos"] = pos_list
        # NOTE(review): the "chunk" column repeats the POS tags - confirm
        # whether the chunk label of each triple was intended instead.
        data["chunk"] = pos_list
        data["pos_parent"] = pos_parent_list
        data["right_sublings_list"] = right_sublings_list
        data["chunk_position"] = chunk_position
        data["lemma_list"] = lemma_list
        data["stem_list"] = stem_list
        data["left_sublings_list"] = left_sublings_list
        df = pd.DataFrame(data)

        df.to_csv(to_file1,
                  sep='\t',
                  index=False,
                  header=False,
                  encoding="utf8",
                  mode='a')

示例#27

0

显示文件

# Example inputs; each entry is processed as one unit by the loop below.
sentences = [
    "John is a man. He walks", "John and Mary are married. They have two kids",
    "In order for Ravi to be successful, he should follow John",
    "John met Mary in Barista. She asked him to order a Pizza"
]


def gender(word):
    """Return ``classifier``'s label for the features that ``feature`` builds from ``word``."""
    word_features = feature(word)
    return classifier.classify(word_features)


# For every example, collect in order of appearance: proper nouns (with a
# predicted gender), conjunctions and pronouns.
for sent in sentences:
    chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)),
                           binary=False)
    stack = []
    print(sent)
    items = tree2conlltags(chunks)  #iob tagging
    for item in items:
        # Proper nouns that start a PERSON chunk or are outside any chunk.
        if item[1] == 'NNP' and (item[2] == 'B-PERSON' or item[2] == 'O'):
            stack.append((item[0], gender(item[0])))
        elif item[1] == 'CC':
            stack.append(item[0])
        elif item[1] == 'PRP':
            stack.append(item[0])
    print("\t {}".format(stack))

# Bare expression: has no effect when run as a script.
items

# Prints the chunk tree of the last sentence only.
print(chunks)

示例#28

0

显示文件

文件： dialogue_manager.py 项目： galaxy24/Simple_chat_bot

    def generate_weather_answer(self, question):
        """
        Generate weather forecast for a selected city.

        At first try to extract city from user request. If not possible, then generate usual answer.
        Connects to openweathermap and gets forecast, then generates plot of temperature
        in Celsius and Fahrenheit; also shows unique weather conditions.

        Side effects: deletes every ``*.png`` file in the current working
        directory and writes ``plot.png`` there.

        Returns the result of ``generate_usual_answer(question)`` when no
        city is recognised, otherwise a string of the form
        ``'Possible weather in the next few days: <conditions>.;plot.png'``.
        """
        forecast_url = 'http://api.openweathermap.org/data/2.5/forecast'
        # NOTE(review): the API key is hard-coded and sent over plain HTTP;
        # consider moving it to configuration.
        api_key = 'f00cf7123615727d162770891d4fd225'

        # remove previous image, as it isn't needed anymore
        for i in glob.glob(os.path.join(os.getcwd(), '*.png')):
            os.remove(i)

        # Extract entities: every proper noun of the title-cased question
        # is tried as a city name.
        tagged = tree2conlltags(ne_chunk(pos_tag(word_tokenize(question.title()))))
        cities = [i[0] for i in tagged if i[1] == 'NNP']
        city = ''
        forecast = None
        for c in cities:
            # `params` lets requests URL-encode the user-derived city name.
            data = requests.get(forecast_url,
                                params={'q': c, 'appid': api_key}).json()
            if data['cod'] == '200':
                # Keep the successful response instead of requesting the
                # same forecast a second time.
                city = c
                forecast = data
                break
        if city == '':
            return self.generate_usual_answer(question)

        if forecast.get('message') == 'city not found':
            return "I don't know this city!"

        # Generate temperature and date lists for plotting
        date_list = []
        temp_list_c = []
        temp_list_f = []

        for reading in forecast['list']:
            # These formulas convert a Kelvin value to Celsius and Fahrenheit.
            date = datetime.fromtimestamp(int(reading['dt']))
            temperature_c = reading['main']['temp'] - 273.15
            temperature_f = reading['main']['temp'] * 9 / 5 - 459.67
            date_list.append(date)
            temp_list_c.append(temperature_c)
            temp_list_f.append(temperature_f)

        # make chart
        fig, ax = plt.subplots()
        ax.plot_date(date_list, temp_list_c, '-', label='Celsius')
        ax.plot_date(date_list, temp_list_f, '-', label='Fahrenheit')
        ax.grid(True)

        plt.xticks(rotation=30)
        # One shared axis: ticks run from the lowest Celsius value to the
        # highest Fahrenheit value.
        plt.yticks(range(int(min(temp_list_c)) - 1, int(max(temp_list_f) + 1), 5))
        dtFmt = mdates.DateFormatter('%m/%d')
        ax.xaxis.set_major_formatter(dtFmt)
        plt.title('Temperature in {0}'.format(city))
        plt.legend()
        # save image, so it can be sent to user
        plt.savefig('plot.png')

        # List of possible unique weather conditions
        weather = ', '.join(list(set([i['weather'][0]['description'] for i in forecast['list']])))

        return 'Possible weather in the next few days: {0}.;{1}'.format(weather, 'plot.png')

示例#29

0

显示文件

def getSyntaxInfo(sentence):
    """Build one feature row for every whitespace-separated token of ``sentence``.

    Each row is a list with, in order:
    the index of the POS tag in ``tagset``; the positive, negative and
    objective SentiWordNet scores of the token's first synset (0, 0, 0 when
    it has none); the first character of its chunk tag; the chunk label
    ('' for 'O'); the names of the last and second-to-last synsets on its
    first-hypernym chain (None when missing); two 0/1 flags telling whether
    each of those equals the token's own synset; and 0/1 flags for
    "starts with A-Z", "consists only of A-Z" and "parses as a float",
    followed by the token length.

    Returns
    -------
    list of such rows, one per token
    """
    tags = pos_tag(sentence.split())
    ne_tagged = tree2conlltags(ne_chunk(tags))
    syntax_info = []
    for (word, pos), ne_triple in zip(tags, ne_tagged):
        ne_tag = ne_triple[2]
        tag_no = tagset.index(pos)

        # Look the word up in WordNet, restricted to the mapped part of
        # speech when there is one.
        wordnetTag = getWordnetTag(pos)
        if wordnetTag is None:
            synsets = wn.synsets(word)
        else:
            synsets = wn.synsets(word, pos=wordnetTag)

        synset = None
        sentiment_score = [0, 0, 0]
        if len(synsets) > 0:
            synset = synsets[0]
            sentiSynset = swn.senti_synset(synset.name())
            sentiment_score = [
                sentiSynset.pos_score(),
                sentiSynset.neg_score(),
                sentiSynset.obj_score()
            ]

        # Only the ASCII letters A-Z count as capitals here.
        start_caps = int('A' <= word[0] <= 'Z')
        allcaps = int(all('A' <= c <= 'Z' for c in word))

        try:
            float(word)
            is_number = 1
        except ValueError:
            is_number = 0

        iob_tag = ne_tag[0]
        if ne_tag == 'O':
            ne_tag = ''
        else:
            ne_tag = ne_tag[2:]  # drop the 'B-' / 'I-' prefix

        # Follow the first hypernym of each synset up to the top of the chain.
        hypernyms = [synset]
        last_two_synsets = [None, None]
        same_synset = [0, 0]
        if synset is not None:
            while len(hypernyms[-1].hypernyms()) > 0:
                hypernyms.append(hypernyms[-1].hypernyms()[0])
            last_two_synsets = [hypernyms[-1].name(), None]
            same_synset[0] = int(last_two_synsets[0] == synset.name())
            if len(hypernyms) > 1:
                last_two_synsets[1] = hypernyms[-2].name()
                same_synset[1] = int(last_two_synsets[1] == synset.name())

        syntax_info.append([tag_no] + sentiment_score + [iob_tag, ne_tag] +
                           last_two_synsets + same_synset +
                           [start_caps, allcaps, is_number,
                            len(word)])
    return syntax_info

示例#30

0

显示文件

文件： iob_to_tree.py 项目： eshanmherath/natural-language-processing

import nltk
from nltk.chunk import conlltags2tree, tree2conlltags

# Example sentence for the tree -> IOB tags -> tree round trip.
sentence = 'Elon and Hawking met at SpaceX last Tuesday to discuss Artificial Intelligence'

# Any failure in the pipeline is caught and only its message is printed.
try:
    tokenized_sentence = nltk.word_tokenize(sentence)
    tagged_sentence = nltk.pos_tag(tokenized_sentence)
    named_entity_tree = nltk.ne_chunk(tagged_sentence)
    # Flatten the tree to CoNLL-style tags, then rebuild a tree from them.
    iob_tagged = tree2conlltags(named_entity_tree)
    ne_tree = conlltags2tree(iob_tagged)
    # Print each top-level child of the rebuilt tree.
    for i in ne_tree:
        print(i)
except Exception as e:
    print(e)

示例#31

0

显示文件

文件： chunking.py 项目： ceena/stackoverflow-media-mining

 def __init__(self, train_sentences):
     """Train a ``BigramTagger`` on (pos_tag, chunk_tag) pairs.

     Every training sentence is flattened with ``tree2conlltags``; the
     word of each triple is dropped and the remaining pairs are the
     training data for ``self.tagger``.
     """
     train_data = []
     for sent in train_sentences:
         pairs = [(pos, chunk_tag) for _word, pos, chunk_tag in tree2conlltags(sent)]
         train_data.append(pairs)
     self.tagger = BigramTagger(train_data)

示例#32

0

显示文件

文件： main.py 项目： juancargm/PLN_Practices

            text = file.read()

        sentences = sentence_tokenizer.tokenize(text)
        persons = {}
        organizations = {}
        locations = {}
        geopolitical_entities = {}
        groups = {}
        facilities = {}
        multi_word = ''

        for sentence in sentences:

            tags = tagger.tag(tokenizer.tokenize(sentence))
            ne_tree_multiclass = multiclass_ner.parse(tags)
            iob_tagged_multiclass = tree2conlltags(ne_tree_multiclass)

            for current, next_value in zip(iob_tagged_multiclass, iob_tagged_multiclass[1:]):

                entity, category, next_entity, next_category = current[0], current[2], next_value[0], next_value[2]

                if 'B-' in category and next_category != 'O':
                    multi_word = entity
                    continue

                if 'I-' in category and next_category != 'O':
                    multi_word = multi_word + ' ' + entity
                    continue

                if 'I-' in category:
                    multi_word = multi_word + ' ' + entity

示例#33

0

显示文件

def _conll(tokens):
    """Return ``(word, chunk_tag)`` pairs for ``tokens`` after POS tagging and NE chunking."""
    entity_tree = ne_chunk(nltk.pos_tag(tokens))
    pairs = []
    for triple in tree2conlltags(entity_tree):
        pairs.append((triple[0], triple[2]))
    return pairs

示例#34

0

显示文件

文件： relation_extraction.py 项目： wrrrrn/delve-machine

 def __init__(self):
     """Train a bigram chunk tagger with a unigram backoff.

     The training data are the (pos_tag, chunk_tag) pairs of the NP-chunked
     sentences in the conll2000 'train.txt' corpus file.
     """
     train_sents = nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=['NP'])
     train_data = []
     for sent in train_sents:
         train_data.append([(pos, chunk_tag) for _, pos, chunk_tag in tree2conlltags(sent)])
     backoff_tagger = nltk.UnigramTagger(train_data)
     self.tagger = nltk.BigramTagger(train_data, backoff=backoff_tagger)

示例#35

0

显示文件

from proper_nouns.funcs.utilities import parse_ner_counts
from proper_nouns.funcs.utilities import parse_census_counts
from proper_nouns.funcs.utilities import tokenize_string

# Fetch NLTK resources and the census name list (helpers are defined
# outside this excerpt).
download_required_nltk_packages()

all_census_names = get_all_census_names()

# Tags that mark a person, and the running totals updated in the loop below.
tags = ['B-PERSON', 'I-PERSON']
census = {'truth_names': 0, 'difference': 0, 'no_names': 0}
tagged = {'truth_names': 0, 'test_names': 0, 'test_minus_tagged': 0, 'tagged_minus_test': 0, 'no_names': 0}

n = 0  # number of corpus sentences processed so far
corpus = read_gmb_corpus('tags')

for tagged_tokens in corpus:
    # Rebuild the sentence text from the first field of every corpus token.
    sentence = ' '.join([iob[0] for iob in tagged_tokens])

    # Re-tag the sentence with NLTK and compare the result with the corpus tags.
    test_the_tokens = tokenize_string(sentence)
    ne_tree = ne_chunk(test_the_tokens)
    test_tagged_tokens = tree2conlltags(ne_tree)
    ner_counts = is_person_tagged(tagged_tokens, test_tagged_tokens, tags)
    parse_ner_counts(ner_counts, tagged)

    # Compare the corpus tags with the census name list.
    census_counts = people_in_census(tagged_tokens, all_census_names, tags)
    parse_census_counts(census_counts, census)

    # Report progress every 2000 sentences.
    n += 1
    if n % 2000 == 0:
        print_intermediate_results(n, tagged, census)

示例#36

0

显示文件

文件： 4_explore_conll2000.py 项目： lswh/pytorchstudy

#   (NP September/NNP)
#   ,/,
#   due/JJ
#   (PP for/IN)
#   (NP release/NN)
#   (NP tomorrow/NN)
#   ,/,
#   (VP fail/VB to/TO show/VB)
#   (NP a/DT substantial/JJ improvement/NN)
#   (PP from/IN)
#   (NP July/NNP and/CC August/NNP)
#   (NP 's/POS near-record/JJ deficits/NNS)
#   ./.)

from nltk.chunk import tree2conlltags
# Flatten the chunked sentence (defined outside this excerpt) into
# (word, pos, chunk_tag) triples; sample output is shown in the comment below.
iob_tagged = tree2conlltags(chunked_sentence)
print(iob_tagged)

# [
#   ('Confidence', 'NN', 'B-NP'),
#   ('in', 'IN', 'B-PP'),
#   ('the', 'DT', 'B-NP'),
#   ('pound', 'NN', 'I-NP'),
#   ('is', 'VBZ', 'B-VP'),
#   ('widely', 'RB', 'I-VP'),
#   ('expected', 'VBN', 'I-VP'),
#   ('to', 'TO', 'I-VP'),
#   ('take', 'VB', 'I-VP'),
#   ('another', 'DT', 'B-NP'),
#   ('sharp', 'JJ', 'I-NP'),
#   ('dive', 'NN', 'I-NP'),

示例#37

0

显示文件

文件： train_chunker.py 项目： Web5design/LexiconLinking

	def __init__(self, train_sents, *args, **kwargs):
		"""Train ``self.tagger`` from chunked training sentences.

		Each sentence is flattened with ``tree2conlltags`` and reshaped into
		``((word, pos), chunk_tag)`` pairs, which are passed together with any
		extra arguments to ``ClassifierTagger.train``.
		"""
		flattened = [tree2conlltags(sent) for sent in train_sents]
		train_chunks = []
		for triples in flattened:
			train_chunks.append([((word, pos), chunk_tag) for (word, pos, chunk_tag) in triples])
		self.tagger = ClassifierTagger.train(train_chunks, *args, **kwargs)

示例#38

0

显示文件

文件： simple_chunk_parser.py 项目： azeem/Project544

 def __init__(self, trainingChunkedSents):
     """Train a ``TrigramTagger`` on (pos_tag, chunk_tag) pairs.

     Each chunked training sentence is flattened with ``tree2conlltags``
     and the word of every triple is dropped.
     """
     trainingData = []
     for chunkedSent in trainingChunkedSents:
         tagPairs = []
         for _word, posTag, bioTag in tree2conlltags(chunkedSent):
             tagPairs.append((posTag, bioTag))
         trainingData.append(tagPairs)
     self.tagger = TrigramTagger(trainingData)

示例#39

0

显示文件

文件： conll2002evaluation.py 项目： bfurlan/IE4MAS

__author__ = 'User'

""" conll2002 is in Duch and Spanish so its not woriking well with that """


from nltk.corpus import conll2002
from nltk.chunk import tree2conlltags
import pandas as pd
from evaluate import evaluate
from mit_ie.mitie_series_ner_extractror import mitie_extract_ner_series
from stanford_ner.stanford_series_ner_extractor import stanford_extract_ner_series


# Text type that works on Python 2 and 3: `unicode` only exists on Python 2.
try:
    text_type = unicode
except NameError:
    text_type = str

# Load the corpus as (word, tmp, real_tag) rows.
chunked_words = tree2conlltags(conll2002.chunked_words())
df = pd.DataFrame(chunked_words, columns=['word', 'tmp', 'real_tag'])

# remove tmp col
df = df.loc[:, ["word", "real_tag"]]

# strip first two chars - "B-..." and "I-..."
# A list is built because on Python 3 `map` returns a lazy iterator, not
# a sequence of column values.
df['real_tag'] = [x[2:] if len(x) > 2 else x for x in df['real_tag']]

# testing
df = df[:5000]

df.real_tag = list(df.real_tag)
df.word = [text_type(w) for w in df.word]

# df = add_dataframe_ner_tags(corpus_df=df, ner_extractor=mitie_extract_ner_series)
#
# print('###### MIT IE NER evaluation #####')