Example #1
    # relies on the project's config module and read_srl helper
    def test_write_srl_labels(self):
        infilenames = [
            config.corpus_path + 'srl_iob.train',
            config.corpus_path + 'srl_iob.dev',
            config.corpus_path + 'srl_iob.test'
        ]
        outfilenames = [
            config.corpus_path + 'srl_labels.train',
            config.corpus_path + 'srl_labels.dev',
            config.corpus_path + 'srl_labels.test'
        ]

        # create categories file, remove infrequent categories
        categories = set()
        cat_count = dict()
        for filename in infilenames:
            sentences_tags_verbs = read_srl(filename)
            for stv in sentences_tags_verbs:
                _, tagslist, _ = stv
                for tags in tagslist:
                    for tag in tags:
                        tag = tag.strip()
                        categories.add(tag)
                        # dict.has_key() was removed in Python 3; use get()
                        cat_count[tag] = cat_count.get(tag, 0) + 1
        # category names as a list, sorted by descending frequency
        sorted_cats = sorted(cat_count, key=cat_count.get, reverse=True)

        with open(config.corpus_path + "srl_Freqwithcount.categories",
                  'w') as wf:
            for w in sorted_cats:
                print(w, cat_count.get(w))
                wf.write(w + " " + str(cat_count.get(w)) + '\n')
Example #2
File: rae.py Project: 5idaidai/MVRNN
# relies on scipy.io (sio), plus the project's config, read_srl, and
# get_sent_phrases_srl helpers (rae.py in the MVRNN repo)
def get_features(srl_iob_file, chunks_tags_file):

    outfile = open(config.corpus_path + "srl_vec_features.train", 'w')
    phnlabels = open(config.corpus_path + "srl_phrases_labels.train", 'w')

    mats = sio.loadmat(config.corpus_path + 'vars.normalized.100.mat')
    We_orig = mats.get('We')
    words = mats.get('words')

    words = words.flatten()
    keys = [str(words[i][0]).strip() for i in range(len(words))]
    values = range(len(words))
    word_dict = dict(zip(keys, values))

    params = sio.loadmat(config.corpus_path + 'params_rae.mat')
    W1 = params.get('W1')
    W2 = params.get('W2')
    b1 = params.get('b1')
    new_sents, _ = get_sent_phrases_srl(srl_iob_file, chunks_tags_file)
    sentences_tags_verbs = read_srl(srl_iob_file)
    for new_sent, sentence_tags_verbs in zip(new_sents, sentences_tags_verbs):
        sent, taglists, verbIds = sentence_tags_verbs
        for i, verbId in enumerate(verbIds):
            tags = taglists[i]
            offset = 0
            for wordOrPhrase in new_sent:
                try:
                    # wpvec = get_phrase_vector(wordOrPhrase, W1, W2, b1, We_orig, word_dict)
                    # verbVec = get_phrase_vector(sent[verbId], W1, W2, b1, We_orig, word_dict)
                    label = tags[offset]
                    offset += len(wordOrPhrase)
                    # row = " ".join([str(x) for x in wpvec]) + "\t" + " ".join([str(x) for x in verbVec]) \
                    #         + "\t" + label + '\n'
                    # outfile.write(row)
                    phnlabels.write(" ".join(wordOrPhrase) + "\t" + label + "\n")
                except IndexError:
                    # tags may run out before the phrase list does; skip the rest
                    print("tag/phrase mismatch, skipping")
    # close both output handles
    outfile.close()
    phnlabels.close()
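To make the label alignment concrete, here is a self-contained toy run of the same offset bookkeeping (the sentence, phrases, and tags are invented): each phrase takes the IOB tag of its first token, and the offset then advances by the phrase length so the tags stay aligned with the original word positions.

new_sent = [['the', 'big', 'dog'], ['chased'], ['a', 'cat']]
tags = ['B-A0', 'I-A0', 'I-A0', 'B-V', 'B-A1', 'I-A1']

offset = 0
for word_or_phrase in new_sent:
    label = tags[offset]           # tag of the phrase's first token
    offset += len(word_or_phrase)  # skip past the rest of the phrase
    print(" ".join(word_or_phrase), "\t", label)
# the big dog     B-A0
# chased          B-V
# a cat           B-A1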
Example #3
File: rae.py Project: 5idaidai/MVRNN
# relies on the project's read_sent_chunktags and read_srl helpers (rae.py)
def get_sent_phrases_srl(srl_iob_file, chunk_tags_file):
    '''Returns, for each sentence, a (new_sent, tags, verb ids) triple, plus
    the POS and chunk tags realigned to the new sentence. Each element of a
    new sentence is either a single word or a multi-word phrase (token list).
    '''

    postags, chktags = read_sent_chunktags(chunk_tags_file)
    sentences_tags_verbs = read_srl(srl_iob_file)
    new_sent_tags_verbs = []
    new_sent_pos_chk = []
    for s, stv in enumerate(sentences_tags_verbs):
        sent, tagsList, verbIds = stv
        chktag = chktags[s]
        postag = postags[s]
        new_sent = []

        # construct the new sentence: fold each non-VP B-/I- chunk into a phrase
        i = 0
        while i < len(chktag) - 1:
            thistag = chktag[i]
            nexttag = chktag[i + 1]
            if (thistag.startswith('B-') and nexttag.startswith('I-')
                    and thistag != 'B-VP'):
                phrase = [sent[i]]
                j = i + 1
                while j < len(chktag) and chktag[j].startswith('I-'):
                    phrase.append(sent[j])
                    j += 1
                new_sent.append(phrase)
                i = j
            else:
                new_sent.append([sent[i]])
                i += 1

            if i == len(chktag) - 1:
                # the final token can never start a phrase; append it alone
                new_sent.append([sent[i]])

        # realign POS and chunk tags: each phrase keeps its last token's tags
        offset = 0
        new_posTags = []
        new_chkTags = []
        for phrase in new_sent:
            offset += len(phrase)
            new_posTags.append(postag[offset - 1])
            new_chkTags.append(chktag[offset - 1])
        # remap verb indices from word positions to phrase positions
        new_verbIds = []
        for verbId in verbIds:
            offset = 0
            for i, phrase in enumerate(new_sent):
                if offset == verbId:
                    new_verbIds.append(i)
                    break
                offset += len(phrase)
        # realign the SRL tags: each phrase keeps its first token's tag
        new_tagsList = []
        for tags in tagsList:
            new_tags = []
            offset = 0
            for phrase in new_sent:
                new_tags.append(tags[offset])
                offset += len(phrase)
            new_tagsList.append(new_tags)

        new_sent_tags_verbs.append((new_sent, new_tagsList, new_verbIds))
        new_sent_pos_chk.append((new_posTags, new_chkTags))
    return new_sent_tags_verbs, new_sent_pos_chk
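As a sanity check on the grouping rule, the same B-/I- walk can be run standalone on an invented sentence: a 'B-' chunk tag followed by 'I-' tags is folded into one phrase (except for 'B-VP'), and every other token becomes a one-word phrase.

sent = ['the', 'big', 'dog', 'chased', 'a', 'cat']
chktag = ['B-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'I-NP']

new_sent = []
i = 0
while i < len(chktag) - 1:
    if (chktag[i].startswith('B-') and chktag[i + 1].startswith('I-')
            and chktag[i] != 'B-VP'):
        j = i + 1
        while j < len(chktag) and chktag[j].startswith('I-'):
            j += 1
        new_sent.append(sent[i:j])  # whole chunk becomes one phrase
        i = j
    else:
        new_sent.append([sent[i]])  # single-word phrase
        i += 1
    if i == len(chktag) - 1:
        new_sent.append([sent[i]])  # trailing token

print(new_sent)
# [['the', 'big', 'dog'], ['chased'], ['a', 'cat']]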
Example #4
    def test_read_srl(self):
        filename = '/home/bhanu/workspace/MVRNN/data/corpus/srl_iob.dev'
        sentences_tags_verbs_ori = read_srl(filename)
        # with no argument, read_srl presumably falls back to a default
        # (predicted) file
        sentences_tags_verbs_pred = read_srl()