Пример #1
0
    def _read_entry(self, entry):
        '''Normalize one lexicon entry in place and index it by its lemmas.

        Steps, in order:
        1. For the 'wikimwe' data source, the "lemmas" field is moved to
           "words".
        2. If self.POS_2_PENN has a table for the data source, every tag in
           entry["poses"] is rewritten through that table.
        3. If the entry has no "lemmas", they are derived with morph.stem.
        4. The lowercased lemmas (minus items that start and end with '_')
           form the signature. Signatures with more than one element are
           stored in self._entries and registered in self._bylast under
           their last element; entries judged to be garbage are skipped.

        On any exception in step 4 the entry is printed to stderr and the
        exception is re-raised.
        '''
        source = entry["datasource"].lower()

        if source == 'wikimwe':
            # not actually always lemmatized, so file the field under "words"
            entry["words"] = entry.pop("lemmas")

        if source in self.POS_2_PENN:  # map POSes to Penn Treebank tagset
            form_key = "lemmas" if source == 'baldwin vpc' else "words"
            for idx, raw_pos in enumerate(entry["poses"]):
                mapped = self.POS_2_PENN[source].get(raw_pos, raw_pos)
                if not isinstance(mapped, basestring):
                    # Per-word table: look up this position's word form and
                    # fall back to the value stored under the None key.
                    mapped = mapped.get(entry[form_key][idx]) or mapped[None]
                entry["poses"][idx] = mapped

        if "lemmas" not in entry:
            if 'lvc' in source:
                entry["lemmas"] = [
                    entry["verblemma"],
                    morph.stem(entry["noun"], 'NN'),
                ]
            else:
                assert "words" in entry, entry
                tokens = entry["words"]
                tags = [None] * len(tokens)
                if "poses" in entry and entry["poses"]:
                    assert source in {'said', 'semcor', 'wikimwe'}, entry
                    tags = entry["poses"]
                elif source not in {'phrases.net', "oyz's idioms"}:
                    if entry["label"].startswith(('NNP', 'NE:')):
                        tags = ['NNP'] * len(tokens)
                entry["lemmas"] = [
                    morph.stem(tok, tag) for tok, tag in zip(tokens, tags)
                ]

        try:
            sig = tuple(
                lemma.lower() for lemma in entry["lemmas"]
                if not (lemma[0] == '_' and lemma[-1] == '_'))
            # Probably a garbage entry if nothing is left, if it ends in
            # 'the', or if no element is longer than two characters.
            plausible = (sig and sig[-1] != 'the'
                         and any(len(part) > 2 for part in sig))
            if plausible and len(sig) > 1:
                self._entries[sig] = entry
                self._bylast[sig[-1]].add(sig)
        except:
            print(entry, file=sys.stderr)
            raise
Пример #2
0
    def _read_nonblank_line(self, ln, sent):
        '''Parse one token line and append the resulting token to `sent`.

        Tab-separated format:
        word   pos   tag   sentId
        tag and sentId are optional.

        A blank tag in the three-column form is treated as missing. When
        self._labels is not None, a tag outside that collection is replaced
        by 'O'; a literal '0' is mapped to 'O' as well when self._legacy0
        is set. When a sentId column is present it is stored on
        sent.sentId.

        Raises ValueError if the line has fewer than two fields.
        '''
        # Strip only a real line terminator. The last line of a file may
        # lack one, and unconditionally dropping the final character would
        # corrupt that line's last field.
        if ln.endswith('\n'):
            ln = ln[:-1]
        parts = ln.split('\t')[:4]  # columns past the fourth are ignored

        tag = None
        if len(parts) == 4:
            token, pos, tag, sentId = parts
            sent.sentId = sentId
        elif len(parts) == 3:
            token, pos, tag = parts
            if not tag.strip():
                tag = None
        elif len(parts) == 2:
            token, pos = parts
        else:
            raise ValueError(
                'expected 2 to 4 tab-separated fields, got %d: %r'
                % (len(parts), ln))

        if tag is not None:
            if self._labels is None:
                pass  # no label inventory given: keep the tag as read
            elif tag == '0' and self._legacy0:
                assert 'O' in self._labels, self._labels
                tag = 'O'
            elif tag not in self._labels:
                tag = 'O'
            tag = uintern(unicode(tag))

        pos = uintern(unicode(pos))
        stemS = uintern(unicode(morph.stem(token, pos)))
        sent.addToken(token=token, stem=stemS, pos=pos, goldTag=tag)
 def _read_nonblank_line(self, ln, sent):
     '''Parse one token line and append the resulting token to `sent`.

     Tab-separated format:
     word   pos   tag   sentId
     tag and sentId are optional.

     A blank tag in the three-column form is treated as missing. When
     self._labels is not None, a tag outside that collection is replaced
     by 'O'; a literal '0' is mapped to 'O' as well when self._legacy0
     is set. When a sentId column is present it is stored on sent.sentId.

     Raises ValueError if the line has fewer than two fields.
     '''
     # Strip only a real line terminator. The last line of a file may lack
     # one, and unconditionally dropping the final character would corrupt
     # that line's last field.
     if ln.endswith('\n'):
         ln = ln[:-1]
     parts = ln.split('\t')[:4]  # columns past the fourth are ignored

     tag = None
     if len(parts) == 4:
         token, pos, tag, sentId = parts
         sent.sentId = sentId
     elif len(parts) == 3:
         token, pos, tag = parts
         if not tag.strip():
             tag = None
     elif len(parts) == 2:
         token, pos = parts
     else:
         raise ValueError(
             'expected 2 to 4 tab-separated fields, got %d: %r'
             % (len(parts), ln))

     if tag is not None:
         if self._labels is None:
             pass  # no label inventory given: keep the tag as read
         elif tag == '0' and self._legacy0:
             assert 'O' in self._labels, self._labels
             tag = 'O'
         elif tag not in self._labels:
             tag = 'O'
         tag = uintern(unicode(tag))

     pos = uintern(unicode(pos))
     stemS = uintern(unicode(morph.stem(token, pos)))
     sent.addToken(token=token, stem=stemS, pos=pos, goldTag=tag)
 def _read_nonblank_line(self, ln, sent):
     '''Parse one annotated token line and append the token to `sent`.

     Tab-separated format:
     offset   word   lemma   POS   tag   parent   strength   label   sentId
     lemma will (for now) be ignored in favor of the automatic stemmer.
     label may be the empty string; sentId is optional.

     offset must equal len(sent) + 1 and parent must be smaller than
     offset; both are checked with assertions. When self._labels is not
     None, a tag outside that collection is replaced by 'O'; a literal
     '0' is mapped to 'O' as well when self._legacy0 is set.

     Raises ValueError if the line does not have 8 or 9 fields, or if
     offset or parent is not an integer.
     '''
     # Strip only a real line terminator. The last line of a file may lack
     # one, and unconditionally dropping the final character would corrupt
     # that line's last field.
     if ln.endswith('\n'):
         ln = ln[:-1]
     parts = ln.split('\t')
     if len(parts) == 9:
         sent.sentId = parts[8]
     elif len(parts) != 8:
         raise ValueError(
             'expected 8 or 9 tab-separated fields, got %d: %r'
             % (len(parts), ln))
     offset, token, _, pos, tag, parent, strength, label = parts[:8]

     offset = int(offset)
     parent = int(parent)
     assert len(sent) + 1 == offset, (len(sent), offset)
     assert parent < offset, (parent, offset)

     # The tag column is always present in this format, so it is always a
     # string here and needs no None check.
     if self._labels is None:
         pass  # no label inventory given: keep the tag as read
     elif tag == '0' and self._legacy0:
         assert 'O' in self._labels, self._labels
         tag = 'O'
     elif tag not in self._labels:
         tag = 'O'
     tag = uintern(unicode(tag))

     pos = uintern(unicode(pos))
     stemS = uintern(unicode(morph.stem(token, pos)))
     sent.addToken(token=token, stem=stemS, pos=pos, goldTag=tag,
                   goldparent=parent,
                   goldstrength=uintern(unicode(strength)),
                   goldlabel=uintern(unicode(label)))
 def _read_entry(self, entry):
     '''Normalize one lexicon entry in place and index it by its lemmas.

     Steps, in order:
     1. For the 'wikimwe' data source, the "lemmas" field is moved to
        "words".
     2. If self.POS_2_PENN has a table for the data source, every tag in
        entry["poses"] is rewritten through that table.
     3. If the entry has no "lemmas", they are derived with morph.stem.
     4. The lowercased lemmas (minus items that start and end with '_')
        form the signature. Signatures with more than one element are
        stored in self._entries and registered in self._bylast under
        their last element; entries judged to be garbage are skipped.

     On any exception in step 4 the entry is printed to stderr and the
     exception is re-raised.
     '''
     source = entry["datasource"].lower()

     if source == 'wikimwe':
         # not actually always lemmatized, so file the field under "words"
         entry["words"] = entry.pop("lemmas")

     if source in self.POS_2_PENN:  # map POSes to Penn Treebank tagset
         form_key = "lemmas" if source == 'baldwin vpc' else "words"
         for idx, raw_pos in enumerate(entry["poses"]):
             mapped = self.POS_2_PENN[source].get(raw_pos, raw_pos)
             if not isinstance(mapped, basestring):
                 # Per-word table: look up this position's word form and
                 # fall back to the value stored under the None key.
                 mapped = mapped.get(entry[form_key][idx]) or mapped[None]
             entry["poses"][idx] = mapped

     if "lemmas" not in entry:
         if 'lvc' in source:
             entry["lemmas"] = [
                 entry["verblemma"],
                 morph.stem(entry["noun"], 'NN'),
             ]
         else:
             assert "words" in entry, entry
             tokens = entry["words"]
             tags = [None] * len(tokens)
             if "poses" in entry and entry["poses"]:
                 assert source in {'said', 'semcor', 'wikimwe'}, entry
                 tags = entry["poses"]
             elif source not in {'phrases.net', "oyz's idioms"}:
                 if entry["label"].startswith(('NNP', 'NE:')):
                     tags = ['NNP'] * len(tokens)
             entry["lemmas"] = [
                 morph.stem(tok, tag) for tok, tag in zip(tokens, tags)
             ]

     try:
         sig = tuple(
             lemma.lower() for lemma in entry["lemmas"]
             if not (lemma[0] == '_' and lemma[-1] == '_'))
         # Probably a garbage entry if nothing is left, if it ends in
         # 'the', or if no element is longer than two characters.
         plausible = (sig and sig[-1] != 'the'
                      and any(len(part) > 2 for part in sig))
         if plausible and len(sig) > 1:
             self._entries[sig] = entry
             self._bylast[sig[-1]].add(sig)
     except:
         print(entry, file=sys.stderr)
         raise
Пример #6
0
    def _read_nonblank_line(self, ln, sent):
        '''Parse one annotated token line and append the token to `sent`.

        Tab-separated format:
        offset   word   lemma   POS   tag   parent   strength   label   sentId
        lemma will (for now) be ignored in favor of the automatic stemmer.
        label may be the empty string; sentId is optional.

        offset must equal len(sent) + 1 and parent must be smaller than
        offset; both are checked with assertions. When self._labels is not
        None, a tag outside that collection is replaced by 'O'; a literal
        '0' is mapped to 'O' as well when self._legacy0 is set.

        Raises ValueError if the line does not have 8 or 9 fields, or if
        offset or parent is not an integer.
        '''
        # Strip only a real line terminator. The last line of a file may
        # lack one, and unconditionally dropping the final character would
        # corrupt that line's last field.
        if ln.endswith('\n'):
            ln = ln[:-1]
        parts = ln.split('\t')
        if len(parts) == 9:
            sent.sentId = parts[8]
        elif len(parts) != 8:
            raise ValueError(
                'expected 8 or 9 tab-separated fields, got %d: %r'
                % (len(parts), ln))
        offset, token, _, pos, tag, parent, strength, label = parts[:8]

        offset = int(offset)
        parent = int(parent)
        assert len(sent) + 1 == offset, (len(sent), offset)
        assert parent < offset, (parent, offset)

        # The tag column is always present in this format, so it is always
        # a string here and needs no None check.
        if self._labels is None:
            pass  # no label inventory given: keep the tag as read
        elif tag == '0' and self._legacy0:
            assert 'O' in self._labels, self._labels
            tag = 'O'
        elif tag not in self._labels:
            tag = 'O'
        tag = uintern(unicode(tag))

        pos = uintern(unicode(pos))
        stemS = uintern(unicode(morph.stem(token, pos)))
        sent.addToken(token=token,
                      stem=stemS,
                      pos=pos,
                      goldTag=tag,
                      goldparent=parent,
                      goldstrength=uintern(unicode(strength)),
                      goldlabel=uintern(unicode(label)))