Example #2
    def _apply(self, s, idxs=None):
        candidates = [
            c for c in super(PrefixDictionaryMatcher, self)._apply(s, idxs)
        ]
        for idxs, label in candidates:
            # Undo Penn Treebank escaping so tokens match dictionary entries
            words = unescape_penn_treebank([s.words[i] for i in idxs])
            matched = any(t in self.dictionary for t in words)
            # Join into a new variable: rebinding s here would clobber the
            # sentence object and break s.words on the next iteration
            text = " ".join(words)
            if matched and (" in the " in text or " of the " in text):
                yield idxs, label
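
All of the examples on this page rely on unescape_penn_treebank, which is not shown here; it maps Penn Treebank token escapes back to their original characters before dictionary lookup. Below is a minimal sketch of such a helper, assuming the standard PTB escape table rather than the project's actual implementation:

# Sketch (assumption): reverse the standard Penn Treebank token escapes so
# tokens compare equal to ordinary dictionary entries.
PTB_UNESCAPE = {
    "-LRB-": "(", "-RRB-": ")",
    "-LSB-": "[", "-RSB-": "]",
    "-LCB-": "{", "-RCB-": "}",
    "``": '"', "''": '"',
}

def unescape_penn_treebank(words):
    # Map each PTB-escaped token back to its surface form; pass others through
    return [PTB_UNESCAPE.get(w, w) for w in words]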
Example #5
    def _apply(self, s, idxs=None):
        candidates = [
            c for c in super(PrefixDictionaryMatcher, self)._apply(s, idxs)
        ]
        for idxs, label in candidates:
            # Match if any unescaped token of the candidate is in the dictionary
            words = unescape_penn_treebank([s.words[i] for i in idxs])
            matched = any(t in self.dictionary for t in words)
            # Alternative (disabled): also match multi-token sub-spans
            # for i in range(len(idxs)):
            #     for j in range(i + 1, len(idxs)):
            #         phrase = " ".join(unescape_penn_treebank(s.words[idxs[i]:idxs[j] + 1]))
            #         phrase = phrase.lower() if self.ignore_case else phrase
            #         if phrase in self.dictionary:
            #             matched = True
            #             break
            if matched:
                yield idxs, label
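
The disabled block above extends matching from single tokens to multi-token sub-spans of the candidate. Pulled out as a standalone helper, the same idea might look like the sketch below; phrase_in_dictionary is a hypothetical name, and it assumes idxs indexes a contiguous token span in sentence s:

def phrase_in_dictionary(s, idxs, dictionary, ignore_case=False):
    # Look up every contiguous sub-span (including single tokens) of the
    # candidate in the dictionary
    words = unescape_penn_treebank([s.words[i] for i in idxs])
    for i in range(len(words)):
        for j in range(i + 1, len(words) + 1):
            phrase = " ".join(words[i:j])
            if ignore_case:
                phrase = phrase.lower()
            if phrase in dictionary:
                return True
    return False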
Example #6
def create_corpus_dict(corpus, setdef="training"):
    '''Create a dictionary of gold mentions from the annotated corpus data.'''
    # Document ids in the requested cross-validation fold
    dev_set = list(corpus.cv[setdef].keys())
    documents = [(doc_id, corpus[doc_id]["sentences"], corpus[doc_id]["tags"])
                 for doc_id in dev_set]

    print(len(dev_set), len(corpus.documents))

    d = {}
    for pmid, doc, labels in documents:
        for i in range(len(doc)):
            for tag in labels[i]:
                # tag[-1] is the (start, end) token span; tag[0] the gold text
                mention = doc[i].words[tag[-1][0]:tag[-1][1]]
                v1 = "".join(unescape_penn_treebank(mention))
                v2 = tag[0].replace(" ", "")
                # Keep only mentions whose tokenization matches the gold text
                if v1 == v2:
                    d[" ".join(mention)] = 1
    return d
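
A hypothetical usage sketch: build the mention dictionary from the training fold and use plain membership tests against it (the corpus object and its cv/tags layout are assumptions carried over from the example above):

# Assumes a corpus object with the interface used in Example #6
train_dict = create_corpus_dict(corpus, setdef="training")
print(len(train_dict), "unique gold mentions")
print("example mention" in train_dict)  # values are just a presence flag (1)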