def _apply(self, s, idxs=None):
    """Yield (idxs, label) candidates whose tokens hit the dictionary and
    whose surface form contains a prepositional-prefix pattern.

    Delegates candidate generation to the parent matcher, then keeps only
    candidates where (a) at least one unescaped token is in
    self.dictionary and (b) the space-joined phrase contains " in the "
    or " of the ".
    """
    candidates = list(super(PrefixDictionaryMatcher, self)._apply(s, idxs))
    for cand_idxs, label in candidates:
        words = unescape_penn_treebank([s.words[i] for i in cand_idxs])
        # any() short-circuits; reduce(lambda x,y: x or y, ...) always
        # scanned the whole list
        matched = any(t in self.dictionary for t in words)
        # Bind the joined phrase to a fresh name: the original rebound
        # `s`, so `s.words` crashed on the second loop iteration.
        phrase = " ".join(words)
        if matched and (" in the " in phrase or " of the " in phrase):
            yield cand_idxs, label
def _apply(self, s, idxs=None):
    """Yield (idxs, label) candidates whose tokens hit the dictionary and
    whose surface form contains a prepositional-prefix pattern.

    Delegates candidate generation to the parent matcher, then keeps only
    candidates where (a) at least one unescaped token is in
    self.dictionary and (b) the space-joined phrase contains " in the "
    or " of the ".
    """
    candidates = list(super(PrefixDictionaryMatcher, self)._apply(s, idxs))
    for cand_idxs, label in candidates:
        words = unescape_penn_treebank([s.words[i] for i in cand_idxs])
        # any() short-circuits; reduce(lambda x, y: x or y, ...) always
        # scanned the whole list
        matched = any(t in self.dictionary for t in words)
        # Bind the joined phrase to a fresh name: the original rebound
        # `s`, so `s.words` crashed on the second loop iteration.
        phrase = " ".join(words)
        if matched and (" in the " in phrase or " of the " in phrase):
            yield cand_idxs, label
def _apply(self, s, idxs=None):
    """Yield (idxs, label) candidates where at least one candidate token
    appears in self.dictionary.

    Candidate spans come from the parent matcher; this subclass only
    filters them by dictionary membership of the unescaped tokens.
    """
    candidates = list(super(PrefixDictionaryMatcher, self)._apply(s, idxs))
    for cand_idxs, label in candidates:
        words = unescape_penn_treebank([s.words[i] for i in cand_idxs])
        # any() replaces reduce(lambda x,y: x or y, ...) and short-circuits.
        # (A commented-out multi-word-phrase matching experiment was
        # removed here; it was dead code.)
        if any(t in self.dictionary for t in words):
            yield cand_idxs, label
def create_corpus_dict(corpus, setdef="training"): '''Create dictionary using annotated corpus data''' dev_set = list(itertools.chain.from_iterable([corpus.cv[setdef].keys() for setdef in [setdef]])) documents = [(doc_id,corpus[doc_id]["sentences"],corpus[doc_id]["tags"]) for doc_id in dev_set] print len(dev_set),len(corpus.documents) d = {} for pmid,doc,labels in documents: for i in range(0,len(doc)): for tag in labels[i]: mention = doc[i].words[tag[-1][0]:tag[-1][1]] v1 = "".join(unescape_penn_treebank(mention)) v2 = tag[0].replace(" ","") if v1 != v2: # problem with tokenization #print " ".join(unescape_penn_treebank(mention)), tag pass else: d[" ".join(mention)] = 1 return d
def _apply(self, s, idxs=None):
    """Yield (idxs, label) candidates where at least one candidate token
    appears in self.dictionary.

    Candidate spans come from the parent matcher; this subclass only
    filters them by dictionary membership of the unescaped tokens.
    """
    candidates = list(super(PrefixDictionaryMatcher, self)._apply(s, idxs))
    for cand_idxs, label in candidates:
        words = unescape_penn_treebank([s.words[i] for i in cand_idxs])
        # any() replaces reduce(lambda x, y: x or y, ...) and short-circuits.
        # (A commented-out multi-word-phrase matching experiment was
        # removed here; it was dead code.)
        if any(t in self.dictionary for t in words):
            yield cand_idxs, label
def create_corpus_dict(corpus, setdef="training"): '''Create dictionary using annotated corpus data''' dev_set = list( itertools.chain.from_iterable( [corpus.cv[setdef].keys() for setdef in [setdef]])) documents = [(doc_id, corpus[doc_id]["sentences"], corpus[doc_id]["tags"]) for doc_id in dev_set] print len(dev_set), len(corpus.documents) d = {} for pmid, doc, labels in documents: for i in range(0, len(doc)): for tag in labels[i]: mention = doc[i].words[tag[-1][0]:tag[-1][1]] v1 = "".join(unescape_penn_treebank(mention)) v2 = tag[0].replace(" ", "") if v1 != v2: # problem with tokenization #print " ".join(unescape_penn_treebank(mention)), tag pass else: d[" ".join(mention)] = 1 return d