Example #1
def frame_chose():
    fs = fn.frames()
    fs_dic = {}
    fs_ID = []
    for f in fs:
        fs_ID.append(f.ID)
        fs_dic[f.name] = []
        lexes = f.lexUnit
        for lex in lexes:
            fs_dic[f.name].append(lexes[lex].name)

    fs_ID_copy = fs_ID[:]  # copy the list; plain assignment would alias it, and removals below would skip frames while iterating fs_ID
    result = []
    for f1 in fs_ID:
        fs_ID_copy.remove(f1)
        f1_name = fn.frame(f1).name
        set1 = set(fs_dic[f1_name])
        for f2 in fs_ID_copy:
            f2_name = fn.frame(f2).name
            set2 = set(fs_dic[f2_name])
            r = list(set1 & set2)
            result.append((f1_name, f2_name, r, len(r)))

    result = sorted(result, key=lambda x: (x[3]), reverse=True)

    frame_chose = []
    for r in result:
        if r[3] >= 10:
            frame_chose.append(r)

    return frame_chose
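A usage sketch (assuming from nltk.corpus import framenet as fn at module level, as in the other examples; note the pairwise scan over all frames is slow):

pairs = frame_chose()
for f1_name, f2_name, shared_lus, n in pairs[:5]:
    print(f1_name, f2_name, n)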
Example #2
File: util.py Project: JessieSalas/Fil
def getFrames(phrase):
    frames = []
    content = [s.translate(str.maketrans('', '', string.punctuation)) for s in phrase.split()]
    for lemma in content:
        frame = fn.frames(lemma)
        if frame:
            frames.append([f.name for f in frame])
    return frames
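Usage sketch (assumes import string and the framenet import from the source file; note that fn.frames() matches each word against frame names, not against lexical units):

print(getFrames('The doctor cured the patient.'))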
Example #3
def set_FN_embeddings():
    embeddings = {}
    frames = fn.frames()
    vector = np.zeros(len(frames))
    for i in range(len(frames)):
        name = frames[i].name
        vector_i = np.copy(vector)
        vector_i[i] = 1.0
        embeddings[name] = vector_i
    return embeddings, len(frames)
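A quick check of the one-hot embeddings (assumes numpy and the framenet import as in the source file; Medical_conditions is a real FrameNet frame):

embeddings, dim = set_FN_embeddings()
vec = embeddings['Medical_conditions']
assert vec.shape == (dim,) and vec.sum() == 1.0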
Example #4
def print_frame(name_re):
    for m_frame in fn.frames(name_re):
        #m_frame = fn.frame(299)
        print('Unincorporated', [x.name for x in m_frame.lexUnit.values() if 'incorporatedFE' not in x])
        for relation in m_frame['frameRelations']:
            print('  ', relation)
        for fe in m_frame['FE']:
            ailment_lus = [x for x in m_frame.lexUnit.values() if 'incorporatedFE' in x and x.incorporatedFE == fe]
            print('  ', fe)
            print('  ', [x.name for x in ailment_lus])
        print('\n')
Example #5
def expandByGraph(mappinglist):
    expandlist = list()
    for item in mappinglist:
        expandlist.append(item)
        for frame in fn.frames():
            if frame.name == item:
                for fr in frame.frameRelations:
                    if fr.type.name == 'Inheritance':
                        if 'Child' in fr and fr.Child.name != item:
                            expandlist.append(fr.Child.name)
                        elif 'Parent' in fr and fr.Parent.name != item:
                            expandlist.append(fr.Parent.name)
                    elif fr.type.name == 'See_also':
                        if 'ReferringEntry' in fr and fr.ReferringEntry.name != item:
                            expandlist.append(fr.ReferringEntry.name)
    return list(set(expandlist))
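Scanning fn.frames() once per item is costly; a variant (a sketch, not the project's code) can fetch each frame directly with fn.frame_by_name and skip names that are not frames:

from nltk.corpus import framenet as fn
from nltk.corpus.reader.framenet import FramenetError

def expandByGraphDirect(mappinglist):
    expandlist = set(mappinglist)
    for item in mappinglist:
        try:
            frame = fn.frame_by_name(item)
        except FramenetError:
            continue  # not a frame name
        for fr in frame.frameRelations:
            if fr.type.name == 'Inheritance':
                if 'Child' in fr and fr.Child.name != item:
                    expandlist.add(fr.Child.name)
                elif 'Parent' in fr and fr.Parent.name != item:
                    expandlist.add(fr.Parent.name)
            elif fr.type.name == 'See_also':
                if 'ReferringEntry' in fr and fr.ReferringEntry.name != item:
                    expandlist.add(fr.ReferringEntry.name)
    return list(expandlist)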
Example #6
def get_frame_to_root_information(di_g, fn, roots, verbose=0):
    """get all the relations from frames to their roots"""
    frame_to_root_information = {}

    for frame_obj in fn.frames():
        frame = frame_obj.name
        if not di_g.has_node(frame):
            root_information = [{
                'subframe': frame,
                'root': frame,
                'the_path': [frame],
                'len_path': 1
            }]
        else:
            root_information = []
            for root in roots:
                if nx.has_path(di_g, root, frame):
                    the_path = nx.shortest_path(di_g, root, frame)
                    len_path = len(the_path)
                    root_info = {
                        'subframe': frame,
                        'root': root,
                        'the_path': the_path,
                        'len_path': len_path
                    }
                    root_information.append(root_info)
        # check for >= 2 root paths
        #chosen_root_info = {}
        #min_path_length = 100000

        #for root_info in root_information:
        #    if root_info['len_path'] < min_path_length:
        #        min_path_length = root_info['len_path']
        #        chosen_root_info = root_info
        #assert chosen_root_info != {}

        frame_to_root_information[frame] = root_information

    assert len(frame_to_root_information) == 1221
    #path_lengths = [root_info['len_path']
    #                for root_info in frame_to_root_information.values()]

    #if verbose >= 1:
    #    print()
    #    print(f'distribution of path lengths: {Counter(path_lengths)}')
    return frame_to_root_information
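di_g and roots come from the caller; one plausible construction (an assumption about the project, not taken from it) is a directed graph over Inheritance relations, with roots being frames that inherit from nothing:

import networkx as nx
from nltk.corpus import framenet as fn

di_g = nx.DiGraph()
for frame in fn.frames():
    for rel in frame.frameRelations:
        if rel.type.name == 'Inheritance' and 'Parent' in rel and 'Child' in rel:
            di_g.add_edge(rel.Parent.name, rel.Child.name)  # edges point parent -> child
roots = [n for n in di_g.nodes() if di_g.in_degree(n) == 0]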
Example #7
def extract_framenet():
    results = []
    frames = fn.frames()
    for f in frames:
        temp = {}
        f_name = f.name
        f_definition = f.definition
        f_lexunit = f.lexUnit
        f_fes = f.FE

        temp['name'] = f_name
        temp['definition'] = f_definition
        temp['lexunit'] = list(f_lexunit.keys())
        temp['fes'] = [[fe, f_fes[fe].coreType, f_fes[fe].definition] for fe in f_fes]

        results.append(temp)

    with open('../data/frame.json', 'w') as file_object:
        json.dump(results, file_object)
Example #8
def getFrameSetForStudent(surname, list_len=5):
    frameList = []
    nof_frames = len(fn.frames())
    base_idx = (
        abs(int(hashlib.sha512(surname.encode('utf-8')).hexdigest(), 16)) %
        nof_frames)
    print('\nstudent: ' + surname)
    framenet_IDs = get_frams_IDs()
    i = 0
    offset = 0
    seed(1)
    while i < list_len:
        fID = framenet_IDs[(base_idx + offset) % nof_frames]
        f = fn.frame(fID)
        fNAME = f.name
        print('\tID: {a:4d}\tframe: {framename}'.format(a=fID,
                                                        framename=fNAME))
        offset = randint(0, nof_frames)
        frameList.append(fID)
        i += 1
    return frameList
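Because seed(1) is called on every invocation, the offset sequence, and hence the frame list, is reproducible per surname; for example (hypothetical surname):

ids_a = getFrameSetForStudent('Rossi')
ids_b = getFrameSetForStudent('Rossi')
assert ids_a == ids_b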
Example #9
def extract_examples():
    results = []
    frames = fn.frames()
    for f in frames:
        for lu in f.lexUnit:
            examples = f.lexUnit[lu].exemplars
            for example in examples:
                temp = {}
                temp['name'] = f.name
                temp['lexunit'] = lu
                temp['text'] = example.text
                if 'Target' in example:
                    temp['target'] = example.Target
                else:
                    print(example.text)
                temp['fe'] = example.FE
                results.append(temp)
                if len(results) % 100 == 0:
                    print('Processing...', len(results))

    with open('../data/frame_examples.json', 'w') as file_object:
        json.dump(results, file_object)
Example #10
def print_frames_with_IDs():
    for x in fn.frames():
        print('{}\t{}'.format(x.ID, x.name))
Example #11
                pickle.dump(lcs, f)
        lcs_feats = ['lcs_eventive', 'lcs_stative']
        type_embedder['lcs'] = lcs

        # Wordnet supersenses(lexicographer names)
        synsets = wordnet.all_synsets()
        supersenses = \
              sorted(list(set(['supersense=' + x.lexname() for x in synsets])))

        # Framenet
        lem2frame = {}
        for lm in framenet.lus():
            for lemma in lm['lexemes']:
                (lem2frame[lemma['name'] + '.' + \
                        framenet_posdict[lemma['POS']]]) = lm['frame']['name']
        frame_names = sorted(['frame=' + x.name for x in framenet.frames()])
        type_embedder['lem2frame'] = lem2frame

        # Verbnet classids
        verbnet_classids = \
                     sorted(['classid=' + vcid for vcid in verbnet.classids()])

        type_hand_features = (verbnet_classids + supersenses + frame_names +
                              lcs_feats + conc_cols)
        input_size += len(type_hand_features)
        for f in type_hand_features:
            type_embedder['embedder'][f] = 0

    # Write all the feature names to a text file
    if args.type and args.token:
        with open('../../data/list_of_all_hand_eng_features.txt', 'w') as f:
Example #12
#! /usr/bin/env python
# Author: Kapil Thadani ([email protected])

from __future__ import division, with_statement
from nltk.corpus import framenet

###############################################################################

# Names of all frames in Framenet (1019 total)
frames = sorted(frame.name for frame in framenet.frames())

# Names of all possible FEs (1170 total)
fes = sorted(set(fe for frame in framenet.frames() for fe in frame.FE.keys()))

# Names of all possible frames and FEs (9633 total)
frame_fes = sorted([(frame.name, fe) for frame in framenet.frames()
                    for fe in frame.FE.keys()],
                   key=lambda x: x[0] + x[1])

###############################################################################

# Core types of FEs
coretypes = ['Core', 'Peripheral', 'Extra-Thematic']

# Names of all possible FEs and coretypes (1491 total)
fe_coretypes = sorted(set((fe, frame_element.coreType)
                          for frame in framenet.frames()
                          for fe, frame_element in frame.FE.items()),
                      key=lambda x: x[0] + x[1])

# Names of all possible frames and FEs and coretypes (9633 total)
Example #13
print_common_synsets(documents)

tps = corpus_probability(documents)

frames = extract_frames(documents)
counter = Counter(frames)
counter.most_common(25)

frames = fn.frames(r'Mental_stimulus_stimulus_focus')
for frame in frames:
    print(set(frame.lexUnit.keys()))
    lus = [x for x in frame.lexUnit.values() if 'incorporatedFE' in x]
    print('   ', [x.name for x in lus])

print_frame(r'Emotions_of_mental_activity')

frames = []
frames += fn.frames(r'.*(?i)mental.*')
frames += fn.frames(r'.*(?i)medical.*')
Example #14
        f.close()

        # LCS eventivity
        from lcsreader import LexicalConceptualStructureLexicon
        lcs = LexicalConceptualStructureLexicon(home + '/Desktop/protocols/data/verbs-English.lcs')
        lcs_feats = ['lcs_eventive', 'lcs_stative']

        # Wordnet supersenses(lexicographer names)
        supersenses = list(set(['supersense=' + x.lexname() for x in wordnet.all_synsets()]))

        # Framenet
        lem2frame = {}
        for lm in framenet.lus():
            for lemma in lm['lexemes']:
                lem2frame[lemma['name'] + '.' + framnet_posdict[lemma['POS']]] = lm['frame']['name']
        frame_names = ['frame=' + x.name for x in framenet.frames()]

        # Verbnet classids
        verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

        dict_feats = {}
        for f in verbnet_classids + lexical_feats + supersenses + frame_names + lcs_feats + all_ud_feature_cols + conc_cols:
            dict_feats[f] = 0

        x_pd = pd.DataFrame([features_func(sent_feat=sent, token=token, lemma=lemma, dict_feats=dict_feats.copy(), prot=args.prot, concreteness=concreteness, lcs=lcs, l2f=lem2frame) for sent, token, lemma in zip(raw_x, tokens, lemmas)])

        dev_x_pd = pd.DataFrame([features_func(sent_feat=sent, token=token, lemma=lemma, dict_feats=dict_feats.copy(), prot=args.prot, concreteness=concreteness, lcs=lcs, l2f=lem2frame) for sent, token, lemma in zip(raw_dev_x, dev_tokens, dev_lemmas)])

        test_x_pd = pd.DataFrame([features_func(sent_feat=sent, token=token, lemma=lemma, dict_feats=dict_feats.copy(), prot=args.prot, concreteness=concreteness, lcs=lcs, l2f=lem2frame) for sent, token, lemma in zip(raw_test_x, test_tokens, test_lemmas)])

        feature_names = (verbnet_classids, supersenses, frame_names, lcs_feats, conc_cols, lexical_feats, all_ud_feature_cols)
Example #15
framenetRoot=generalThing.find("framenet")
frameElement=framenetRoot.find("frame element")
lexicalUnit=framenetRoot.find("lexical unit")
semType=framenetRoot.find("semantic type")
id_=framenetRoot.find("id")
frames=framenetRoot.find("frame")

for fE in fn.fes():
	if fE.semType!=None:
		semanticTypeKatum=exactSemType(fE.semType)
		frameElementkatum=exactFE(fE)
		if(semanticTypeKatum!=None and frameElementkatum!=None):
			frameElementkatum._is(semanticTypeKatum,False)

for lU in fn.lus():
	if len(lU.semTypes)!=0:
		for semTypeInstance in lU.semTypes:
			semanticTypeKatum=exactSemType(semTypeInstance)
			lUkatum=exactlU(lU)
			if(semanticTypeKatum!=None and lUkatum!=None):
				lUkatum._is(semanticTypeKatum,False)

for frame in fn.frames():
	if len(frame.semTypes)!=0:
		for semTypeInstance in frame.semTypes:
			semanticTypeKatum=exactSemType(semTypeInstance)
			frameKatum=exactFrame(frame)
			if(semanticTypeKatum!=None and frameKatum!=None):
				frameKatum._is(semanticTypeKatum,False)

generalThing.save('wordnet-verbnet-framenet.datum')
Example #16
bert_embedding = BertEmbeddings('bert-base-cased')

from flair.embeddings import StackedEmbeddings

# now create the StackedEmbedding object that combines all embeddings
stacked_embeddings = StackedEmbeddings(
    embeddings=[
        #flair_forward_embedding,
        #flair_backward_embedding,
        bert_embedding])

import nltk
nltk.download('framenet_v17')

from nltk.corpus import framenet as fn
len(fn.frames())

txt=preprocess.read_pg(data_root + r'\EN_1818_Shelley,Mary_Frankenstein_Novel.txt')
print(len(txt), 'chars')

from segtok.segmenter import split_single
sentences = [Sentence(s, use_tokenizer=True) for s in split_single(txt)]
print(len(sentences), 'sentences')

import random as rand

t = range(100)  # rand.sample(range(len(sentences)), 100)
sents_sample = [sentences[i] for i in sorted(t)]

t = np.array(t)
_ = bert_embedding.embed(sents_sample)
Example #17
    else:
        reverse_dict[word] = {}
        reverse_dict[word][event] = features_dict


reverse_dict = {}
nominal_dict = {}
for word in word_list:
    event = event_verb_mapping[word]
    nominal_dict[word] = []
    nominals = {}
    #added_nouns = set()
    #frames = fn.frames_by_lemma(word)
    if use_framenet and word in frame_dict:
        for frame_index in frame_dict[word]:
            matched = fn.frames(frame_index)  # look up once instead of twice
            if not matched:
                print("no frame for: ", frame_index)
                continue
            frame = matched[0]
            for potential_noun in frame.lexUnit.keys():
                lemma = potential_noun.split('.')[0]
                pos = potential_noun.split('.')[1]
                if pos == 'n' or add_verbs:
                    features_dict = {}
                    features_dict['event'] = event
                    features_dict['word'] = lemma
                    features_dict['pos'] = pos
                    #features_dict['synset'] = float('nan')
                    features_dict['fn'] = 1
                    features_dict['num_wordnet'] = 0
                    features_dict['synset_percent'] = 0
Example #18
"""Yields a graph for FN instead of the unwieldy labyrinth of nested dicts
We use a philosophy similar to conllreader and put stuff in the nodes, maybe as dicts or maybe as a class"""

from nltk.corpus import framenet as fn
fn.propagate_semtypes()


framekeys = set()
frametypes = set()

for fx in fn.frames():
    for k in fx.keys():
        framekeys.add(k)
    if fx['semTypes']:
        for t in fx['semTypes']:
            frametypes.add(t['name'])
#We could read straight from the
print(frametypes)


#for k in framekeys:
#    print(k,fn.frames()[0][k])
#    print(k,fn.frames()[1][k])
#    print(k,fn.frames()[2][k])

Example #19
def closure_graph(synset, fn):
    seen = set()
    graph = nx.DiGraph()

    def recurse(s):
        if not s in seen:
            seen.add(s)
            graph.add_node(s.name())
            for s1 in fn(s):
                graph.add_node(s1.name())
                graph.add_edge(s.name(), s1.name())
                recurse(s1)

    recurse(synset)
    return graph


# fn.frames() returns a list; take the first matching frame to inspect it
dog = fn.frames(r'(?i)medical')[0]
print(dog.name)

# closure_graph expects a WordNet synset and a relation accessor
# (frames have no hypernyms), so build the example graph from a synset
from nltk.corpus import wordnet as wn
G = closure_graph(wn.synset('dog.n.01'), lambda s: s.hypernyms())
index = nx.betweenness_centrality(G)
plt.rc('figure', figsize=(12, 7))
node_size = [index[n] * 1000 for n in G]
pos = nx.spring_layout(G)
nx.draw_networkx(G,
                 pos,
                 node_size=node_size,
                 edge_color='r',
                 alpha=.3,
                 linewidths=0)
plt.show()
Example #20
    def load_framenet():
        edges = []
        for frm in fn.frames():
            # frame-frame relations
            for fe in frm.frameRelations:
                edges = pretty_frame_edge(edges, frm_id(fe.superFrameName),
                                          frm_id(fe.subFrameName),
                                          ncheck(fe.type.name))

            # lexical units
            for lu in frm.lexUnit.keys():
                edges.append([
                    frm_id(frm.name), 'fn:HasLexicalUnit',
                    lu_format(lu, frm.name)
                ])

            # FE
            for fe in frm.FE.values():
                if isinstance(fe.semType,
                              nltk.corpus.reader.framenet.AttrDict):

                    edges.append([
                        fe_id(fe.name), 'fn:HasSemType',
                        fe_semtype_id(fe.semType.name)
                    ])

                    edges.append([
                        fe_semtype_id(fe.semType.name), 'fn:st:RootType',
                        fe_semtype_id(fe.semType.rootType.name)
                    ])

                    edges.append([
                        fe_semtype_id(fe.semType.name), 'fn:st:SuperType',
                        fe_semtype_id(fe.semType.superType.name)
                    ])

                    for fesub in fe.semType.subTypes:
                        edges.append([
                            fe_semtype_id(fe.semType.name), 'fn:st:SubType',
                            fe_semtype_id(fesub.name)
                        ])

                if isinstance(fe.requiresFE,
                              nltk.corpus.reader.framenet.AttrDict):
                    edges.append([
                        fe_id(fe.name), 'fn:fe:RequiresFE',
                        fe_id(fe.requiresFE.name)
                    ])

                if isinstance(fe.excludesFE,
                              nltk.corpus.reader.framenet.AttrDict):
                    edges.append([
                        fe_id(fe.name), 'fn:fe:ExcludesFE',
                        fe_id(fe.excludesFE.name)
                    ])

                # coreType as edge feature
                edges.append(
                    [frm_id(frm.name), 'fn:HasFrameElement',
                     fe_id(fe.name)])
        return edges
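The helpers frm_id, fe_id, fe_semtype_id, lu_format, ncheck and pretty_frame_edge are project-local; minimal stand-ins (assumptions about their intent, only so the snippet can run) might look like:

def frm_id(name):
    return 'fn:' + name.replace(' ', '_')

def fe_id(name):
    return 'fn:fe:' + name.replace(' ', '_')

def fe_semtype_id(name):
    return 'fn:st:' + name.replace(' ', '_')

def lu_format(lu_name, frame_name):
    return 'fn:lu:' + frame_name + ':' + lu_name.replace(' ', '_')

def ncheck(value):
    return value if value is not None else 'None'

def pretty_frame_edge(edges, src, dst, rel_name):
    edges.append([src, 'fn:' + rel_name, dst])
    return edges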
Example #21
def get_global_frame_dictionary():
    frame_dict = {f["name"]: i for i, f in enumerate(fn.frames())}
    return frame_dict
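For example:

frame_dict = get_global_frame_dictionary()
print(frame_dict['Causation'])  # integer index of the Causation frame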
Example #22
def get_frams_IDs():
    return [f.ID for f in fn.frames()]
Example #23
    def load_framenet():
        edges = []
        for frm in fn.frames():
            # frame-frame relations
            for fe in frm.frameRelations:
                edges = pretty_frame_edge(edges, frm_id(fe.superFrameName),
                                          frm_id(fe.subFrameName),
                                          ncheck(fe.type.name))

            # lexical units
            for lu in frm.lexUnit.keys():
                edges.append([
                    frm_id(frm.name), 'fn:HasLexicalUnit',
                    lu_format(lu, frm.name)
                ])

            # FE
            for fe in frm.FE.values():
                if isinstance(fe.semType,
                              nltk.corpus.reader.framenet.AttrDict):

                    # Sem type
                    semtype_edge = [
                        fe_id(fe.name),
                        '/r/IsA',  #'fn:HasSemType',
                        fe_semtype_id(fe.semType.name)
                    ]
                    if semtype_edge not in edges:
                        edges.append(semtype_edge)

                    # Root type
                    root_edge = [
                        fe_semtype_id(fe.semType.name),
                        '/r/IsA',  # 'fn:st:RootType'
                        fe_semtype_id(fe.semType.rootType.name)
                    ]
                    if root_edge not in edges:
                        edges.append(root_edge)

                    # Super type
                    super_edge = [
                        fe_semtype_id(fe.semType.name),
                        '/r/IsA',  #'fn:st:SuperType',
                        fe_semtype_id(fe.semType.superType.name)
                    ]
                    if super_edge not in edges:
                        edges.append(super_edge)

                    # Sub type
                    for fesub in fe.semType.subTypes:
                        sub_edge = [
                            fe_semtype_id(fesub.name), '/r/IsA',
                            fe_semtype_id(fe.semType.name)
                        ]
                        if sub_edge not in edges:
                            edges.append(sub_edge)
                        #edges.append([fe_semtype_id(fe.semType.name),
                        #            'fn:st:SubType',
                        #            fe_semtype_id(fesub.name)])

                # Requires FE
                if isinstance(fe.requiresFE,
                              nltk.corpus.reader.framenet.AttrDict):
                    req_edge = [
                        fe_id(fe.name), '/r/HasPrerequisite',
                        fe_id(fe.requiresFE.name)
                    ]
                    if req_edge not in edges:
                        edges.append(req_edge)
                    #edges.append([fe_id(fe.name), 'fn:fe:RequiresFE', fe_id(fe.requiresFE.name)])

                # Excludes FE
                if isinstance(fe.excludesFE,
                              nltk.corpus.reader.framenet.AttrDict):
                    excl_edge = [
                        fe_id(fe.name), '/r/RelatedTo',
                        fe_id(fe.excludesFE.name)
                    ]
                    if excl_edge not in edges:
                        edges.append(excl_edge)
                    #edges.append([fe_id(fe.name), 'fn:fe:ExcludesFE', fe_id(fe.excludesFE.name)])

                # HasFrameElement - coreType as edge feature
                hasfe_edge = [
                    frm_id(frm.name),
                    '/r/HasA',  #'fn:HasFrameElement',
                    fe_id(fe.name)
                ]
                if hasfe_edge not in edges:
                    edges.append(hasfe_edge)
        return edges
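The "not in edges" membership tests above are linear scans over a growing list; a set of tuples (a small sketch of the alternative) keeps deduplication constant-time while preserving insertion order:

def add_unique(edges, seen, edge):
    key = tuple(edge)
    if key not in seen:
        seen.add(key)
        edges.append(edge)
    return edges

# inside load_framenet: initialize seen = set() next to edges = [],
# then call add_unique(edges, seen, semtype_edge) instead of the if-checks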
Example #25
def demo():
    from pprint import pprint
    from nltk.corpus import framenet as fn

    #
    # It is not necessary to explicitly build the indexes by calling
    # buildindexes(). We do this here just for demo purposes. If the
    # indexes are not built explicitly, they will be built as needed.
    #
    print('Building the indexes...')
    fn.buildindexes()

    #
    # Get some statistics about the corpus
    #
    print('Number of Frames:', len(fn.frames()))
    print('Number of Lexical Units:', len(fn.lexical_units()))
    print('Number of annotated documents:', len(fn.documents()))
    print()

    #
    # Frames
    #
    print(
        'getting frames whose name matches the (case insensitive) regex: "(?i)medical"'
    )
    medframes = fn.frames(r'(?i)medical')
    print('Found {0} Frames whose name matches "(?i)medical":'.format(
        len(medframes)))
    print([(f.name, f.ID) for f in medframes])

    #
    # store the first frame in the list of frames
    #
    tmp_id = medframes[0].ID
    m_frame = fn.frame(tmp_id)  # reads all info for the frame

    #
    # get the frame relations
    #
    print(
        '\nNumber of frame relations for the "{0}" ({1}) frame:'.format(
            m_frame.name, m_frame.ID), len(m_frame.frameRelation))
    for fr in m_frame.frameRelation:
        print('   ', fr.type + ":", fr.relatedFrame)

    #
    # get the names of the Frame Elements
    #
    print(
        '\nNumber of Frame Elements in the "{0}" frame:'.format(m_frame.name),
        len(m_frame.FE))
    print('   ', [x.name for x in m_frame.FE])

    #
    # get the names of the "Core" Frame Elements
    #
    print('\nThe "core" Frame Elements in the "{0}" frame:'.format(
        m_frame.name))
    print('   ', [x.name for x in m_frame.FE if x.coreType == "Core"])

    #
    # get all of the Lexical Units that are incorporated in the
    # 'Ailment' FE of the 'Medical_conditions' frame (id=239)
    #
    print('\nAll Lexical Units that are incorporated in the "Ailment" FE:')
    m_frame = fn.frame(239)
    ailment_lus = [x for x in m_frame.lexUnit if x.incorporatedFE == 'Ailment']
    print([x.name for x in ailment_lus])

    #
    # get all of the Lexical Units for the frame
    #
    print('\nNumber of Lexical Units in the "{0}" frame:'.format(m_frame.name),
          len(m_frame.lexUnit))
    print('  ', [x.name for x in m_frame.lexUnit[:5]], '...')

    #
    # get basic info on the second LU in the frame
    #
    tmp_id = m_frame.lexUnit[1].ID  # grab the id of the second LU
    luinfo = fn.lu_basic(tmp_id)  # get basic info on the LU
    print('\nInformation on the LU: {0}'.format(luinfo.name))
    pprint(luinfo)

    #
    # Get a list of all of the corpora used for fulltext annotation
    #
    print('\nNames of all of the corpora used for fulltext annotation:')
    allcorpora = set([x.corpname for x in fn.documents()])
    pprint(list(allcorpora))

    #
    # Get the names of the annotated documents in the first corpus
    #
    firstcorp = list(allcorpora)[0]
    firstcorp_docs = fn.documents(firstcorp)
    print('\nNames of the annotated documents in the "{0}" corpus:'.format(
        firstcorp))
    pprint([x.filename for x in firstcorp_docs])

    #
    # Search for frames containing LUs whose name attribute matches a
    # regexp pattern.
    #
    # Note: if you were going to be doing a lot of this type of
    #       searching, you'd want to build an index that maps from
    #       lemmas to frames because each time frames_by_lemma() is
    #       called, it has to search through ALL of the frame XML files
    #       in the db.
    print(
        '\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":'
    )
    pprint(fn.frames_by_lemma(r'^run.v$'))
Example #26
File: frames.py Project: rsteckel/EDA
fn.lu(3238).frame.lexUnit['glint.v'] is fn.lu(3238)

fn.frame_by_name('Replacing') is fn.lus('replace.v')[0].frame

fn.lus('prejudice.n')[0].frame.frameRelations == fn.frame_relations('Partiality')


fn.lus('look.n')[0].frame
fn.lus('look.n')[1].frame


for f in fn.lus('look.n'):
    print(f.frame.name)


result = fn.frames(r'(?i)perception')

print(result)
f = fn.frame(1301)

f.ID
f.definition
for u in f.lexUnit:
    print(u)

fn.lexical_units(r'(?i)look')


from pattern.en import wordnet

Example #28
from nltk.corpus import framenet as fn

import pattern.search as PS
from pattern.search import Pattern, Classifier, search
from pattern.en import parse, parsetree
from pattern.en import wordnet as pwn
from nltk.corpus import wordnet as wn
import pandas as pd
import numpy as np

from datasets.customers.tufamilia_dataset import TuFamilia



frames = fn.frames('Medical_conditions')
frames = fn.frames('Causation')
frame = frames[0]  #Take first match

lus = frame['lexUnit'].values()
for lu in lus:
    if 'incorporatedFE' in lu:
        print('%20s %10s' % (lu.name, lu['incorporatedFE']))
    else:
        print('%20s %10s' % (lu.name, 'No IFE'))

for relation in frame['frameRelations']:
    print('  ', relation)

for fe in frame['FE']:
Example #29
def hand_engineering(prot, batch_size, data, data_dev):
    '''
        Hand engineered feature extraction. Supports the following - UD,
        Verbnet classids, Wordnet supersenses, concreteness ratings, LCS
        eventivity scores
    '''
    home = expanduser("~")
    framnet_posdict = {
        'V': 'VERB',
        'N': 'NOUN',
        'A': 'ADJ',
        'ADV': 'ADV',
        'PREP': 'ADP',
        'NUM': 'NUM',
        'INTJ': 'INTJ',
        'ART': 'DET',
        'C': 'CCONJ',
        'SCON': 'SCONJ',
        'PRON': 'PRON',
        'IDIO': 'X',
        'AVP': 'ADV'
    }
    # Load the features
    features = {}
    with open(home + '/Desktop/protocols/data/features-2.tsv', 'r') as f:
        for line in f.readlines():
            feats = line.split('\t')
            features[feats[0]] = (feats[1].split(), feats[2].split())

    # Load the predpatt objects for creating features
    files = [
        '/Downloads/UD_English-r1.2/en-ud-train.conllu',
        '/Downloads/UD_English-r1.2/en-ud-dev.conllu',
        '/Downloads/UD_English-r1.2/en-ud-test.conllu'
    ]
    home = expanduser("~")
    options = PredPattOpts(resolve_relcl=True,
                           borrow_arg_for_relcl=True,
                           resolve_conj=False,
                           cut=True)  # Resolve relative clause
    patt = {}

    for file in files:
        path = home + file
        with open(path, 'r') as infile:
            for sent_id, ud_parse in load_conllu(infile.read()):
                patt[file[33:][:-7] + " " + sent_id] = PredPatt(ud_parse,
                                                                opts=options)

    data['Structure'] = data['Split.Sentence.ID'].map(lambda x:
                                                      (patt[x], features[x]))
    data_dev['Structure'] = data_dev['Split.Sentence.ID'].map(
        lambda x: (patt[x], features[x]))

    raw_x = data['Structure'].tolist()
    raw_dev_x = data_dev['Structure'].tolist()

    all_x = raw_x + raw_dev_x
    all_feats = '|'.join(['|'.join(all_x[i][1][0]) for i in range(len(all_x))])
    feature_cols = Counter(all_feats.split('|'))

    # All UD dataset features
    all_ud_feature_cols = list(
        feature_cols.keys()) + [(a + "_dep") for a in feature_cols.keys()]

    # Concreteness
    f = open(home + '/Desktop/protocols/data/concrete.pkl', 'rb')
    concreteness = pickle.load(f)
    if prot == 'arg':
        conc_cols = ['concreteness']
    else:
        conc_cols = ['concreteness', 'max_conc', 'min_conc']
    f.close()

    # LCS eventivity
    from lcsreader import LexicalConceptualStructureLexicon
    lcs = LexicalConceptualStructureLexicon(
        home + '/Desktop/protocols/data/verbs-English.lcs')
    lcs_feats = ['lcs_eventive', 'lcs_stative']

    # Wordnet supersenses(lexicographer names)
    supersenses = list(
        set(['supersense=' + x.lexname() for x in wordnet.all_synsets()]))

    # Framenet
    lem2frame = {}
    for lm in framenet.lus():
        for lemma in lm['lexemes']:
            lem2frame[lemma['name'] + '.' +
                      framnet_posdict[lemma['POS']]] = lm['frame']['name']
    frame_names = ['frame=' + x.name for x in framenet.frames()]

    # Verbnet classids
    verbnet_classids = ['classid=' + vcid for vcid in verbnet.classids()]

    # Lexical features
    lexical_feats = [
        'can', 'could', 'should', 'would', 'will', 'may', 'might', 'must',
        'ought', 'dare', 'need'
    ] + [
        'the', 'an', 'a', 'few', 'another', 'some', 'many', 'each', 'every',
        'this', 'that', 'any', 'most', 'all', 'both', 'these'
    ]

    dict_feats = {}
    for f in verbnet_classids + lexical_feats + supersenses + frame_names + lcs_feats + all_ud_feature_cols + conc_cols:
        dict_feats[f] = 0

    x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame) for sent, token, lemma in
        zip(raw_x, data['Root.Token'].tolist(), data['Lemma'].tolist())
    ])

    dev_x_pd = pd.DataFrame([
        features_func(sent_feat=sent,
                      token=token,
                      lemma=lemma,
                      dict_feats=dict_feats.copy(),
                      prot=prot,
                      concreteness=concreteness,
                      lcs=lcs,
                      l2f=lem2frame)
        for sent, token, lemma in zip(raw_dev_x, data_dev['Root.Token'].tolist(
        ), data_dev['Lemma'].tolist())
    ])

    # Figure out which columns to drop(they're always zero)
    todrop1 = dev_x_pd.columns[(dev_x_pd == 0).all()].values.tolist()
    todrop = x_pd.columns[(x_pd == 0).all()].values.tolist()
    intdrop = [a for a in todrop if a not in todrop1]
    cols_to_drop = list(set(todrop) - set(intdrop))

    x = x_pd.drop(cols_to_drop, axis=1).values.tolist()
    dev_x = dev_x_pd.drop(cols_to_drop, axis=1).values.tolist()

    x = [[a[:] for a in x[i:i + batch_size]]
         for i in range(0, len(data), batch_size)]
    dev_x = [[a[:] for a in dev_x[i:i + batch_size]]
             for i in range(0, len(data_dev), batch_size)]
    return x, dev_x
Example #30
File: info_ext.py Project: rsteckel/EDA


doccollections = ['NYT_19980407','NYT_19980403','NYT_19980315','APW_19980429','APW_19980424','APW_19980314']

IN = re.compile(r'.*\bin\b(?!\b.+ing)')

for doccol in doccollections:
    for doc in nltk.corpus.ieer.parsed_docs(doccol):
        relations = nltk.sem.extract_rels('PER', 'LOC', doc, corpus='ieer', pattern = IN)
        for relation in relations:
            print(nltk.sem.relextract.rtuple(relation))
            


f = fn.frames(r'(?i)perception')
len(fn.frames())
f = fn.frame(66)

f.ID
f.definition
set(f.lexUnit.keys())

[x.name for x in f.FE]

f.frameRelations


fn.frames_by_lemma(r'(?i)a little')

Example #31
                     '\t_\t' + \
                     tag + \
                     '\t_\t' + \
                     str(head_idx) + '\t' + \
                     dep + \
                     '\t_\t_\n'
        if len(s) > 0:
            conll += '\n'

    return conll


# def compile_framenet_starters():
print('loading dub frames')
dub_frames = [
    full_frame.name for full_frame in fn.frames()
    if len(full_frame.name.split('_')) > 1
]
FDD = defaultdict(list)
for dub_frame in dub_frames:
    FDD[dub_frame.split('_')[0]].append(dub_frame)
    # return fdd


# @clock
def get_frame_from_name(frame_name):
    try:
        frame = fn.frame_by_name(frame_name)
    except:
        if len(FDD[frame_name]) == 1:
            frame = fn.frame_by_name(FDD[frame_name][0])
Example #32
from nltk.corpus import framenet as fn

fs = fn.frames()

for i in fs:
    print(i.FE)
    break
Example #33
__author__ = 'juliewe'

from nltk.corpus import framenet as fn

if __name__=='__main__':
    print(len(fn.frames()))