예제 #1
0
    def test_single2(self):
        """Parse fixture sentences with Senna and run feature extraction.

        Reads every non-empty line of the fixture file, parses each line
        with SennaParser, extracts 5-gram and chunk features, and
        vectorizes the feature dict of each sentence.  Ends with
        ``raise Exception`` so the test runner dumps the captured debug
        logging (deliberate debugging idiom — kept as-is).
        """
        # NOTE(review): the original read SENNAPATH and then unconditionally
        # overwrote it with a hard-coded path (dead store on the env value).
        # Keep the environment variable authoritative and use the hard-coded
        # path only as a fallback.
        sp = unicode(os.environ.get(
            "SENNAPATH", u"/Users/tuxedocat/Research/tools/senna/"))
        parser = SennaParser(sp)
        # Close the fixture file deterministically instead of leaking the
        # handle returned by a bare open(...).read().
        with open("/Users/tuxedocat/Documents/workspace/Nyanco/sandbox/recognize100.txt") as f:
            txt = f.read().split("\n")
        txt = [s for s in txt if s != ""]
        testdata = [parser.parseSentence(s) for s in txt]
        for t in testdata:
            fe = SentenceFeatures(t, "recognize")
            fe.ngrams(n=5)
            fe.chunk()
            logging.debug(pformat((fe.SUF, fe.CHK, fe.NER)))
            logging.debug(" ".join(fe.SUF))
            logging.debug(fe.v_idx)
            logging.debug(pformat(fe.features))

            vec = DictVectorizer(sparse=True)
            array_f = vec.fit_transform(fe.features).toarray()

        raise Exception
예제 #2
0
 def test_with_offset(self):
     """Extract length/bow features from offset-annotated test data.

     Splits the fixture file (``self.testpath_off``) into blank-line
     separated documents, builds SentenceFeatures for each, collects the
     feature dicts, and vectorizes them together.  Ends with
     ``raise Exception`` so the runner shows the debug logging
     (deliberate debugging idiom — kept as-is).
     """
     # Close the fixture file deterministically instead of leaking the
     # handle returned by a bare open(...).read().
     with open(self.testpath_off) as f:
         self.testdata = [doc.split("\n")
                          for doc in f.read().split("\n\n") if doc]
     fv = []
     for t in self.testdata:
         fe = SentenceFeatures(t)
         fe.length()
         fe.bow()
         logging.debug(pformat(zip(fe.SUF, fe.POS)))
         logging.debug(pformat(fe.OFFSET))
         logging.debug(pformat(fe.features))
         fv.append(fe.features)
     vec = DictVectorizer(sparse=True)
     array_f = vec.fit_transform(fv).toarray()
     logging.debug(pformat(array_f))
     raise Exception
예제 #3
0
 def test_with_offset(self):
     """Extract length/bow features from offset-annotated test data.

     Splits the fixture file (``self.testpath_off``) into blank-line
     separated documents, builds SentenceFeatures per document, then
     vectorizes the collected feature dicts.  The trailing
     ``raise Exception`` forces the runner to display captured debug
     logging (deliberate debugging idiom — kept as-is).
     """
     # Use a context manager so the fixture file handle is closed instead
     # of being leaked by a bare open(...).read().
     with open(self.testpath_off) as fixture:
         self.testdata = [
             doc.split("\n")
             for doc in fixture.read().split("\n\n") if doc
         ]
     fv = []
     for t in self.testdata:
         fe = SentenceFeatures(t)
         fe.length()
         fe.bow()
         logging.debug(pformat(zip(fe.SUF, fe.POS)))
         logging.debug(pformat(fe.OFFSET))
         logging.debug(pformat(fe.features))
         fv.append(fe.features)
     vec = DictVectorizer(sparse=True)
     array_f = vec.fit_transform(fv).toarray()
     logging.debug(pformat(array_f))
     raise Exception
예제 #4
0
 def _get_features_tgt(self, v_corpus=None, cls2id=None, domain="tgt"):
     """Extract easy-adapted feature dicts and labels from a corpus.

     :param v_corpus: iterable of sentence dicts; each is expected to
         carry "label_corr", "parsed_corr", and "vidx_corr" keys
         (presumably verb label, parse, and verb index — confirm with
         the corpus builder)
     :param cls2id: mapping from verb label to integer class id
     :param domain: domain tag forwarded to proc_easyadapt
     :return: tuple (feature dicts, string labels, integer label ids)
     """
     _flist = []
     _labellist_int = []
     _labellist_str = []
     for sid, sdic in enumerate(v_corpus):
         v = sdic["label_corr"]
         _labelid = cls2id[v]
         try:
             fe = SentenceFeatures(sdic["parsed_corr"], verb=v, v_idx=sdic["vidx_corr"])
             # Run only the extractors selected in self.featuretypes.
             if "chunk" in self.featuretypes:
                 fe.chunk()
             if "3gram" in self.featuretypes:
                 fe.ngrams(n=3)
             if "5gram" in self.featuretypes:
                 fe.ngrams(n=5)
             if "7gram" in self.featuretypes:
                 fe.ngrams(n=7)
             if "dep" in self.featuretypes:
                 fe.dependency()
             if "srl" in self.featuretypes:
                 fe.srl()
             if "ne" in self.featuretypes:
                 fe.ne()
             if "errorprob" in self.featuretypes:
                 pass
             if "topic" in self.featuretypes:
                 pass
             augf = proc_easyadapt(fe.features, domain=domain)
             # `_labelid is not None` rather than plain truthiness: a class
             # id of 0 is a legitimate label and must not trip the assert.
             assert augf and _labelid is not None and v
             _flist.append(augf)
             _labellist_int.append(_labelid)
             _labellist_str.append(v)
         except ValueError:
             logging.debug(pformat("CaseMaker feature extraction: couldn't find the verb"))
         except Exception:
             # The original bare `except: print v` swallowed every error
             # (including KeyboardInterrupt) and hid the traceback; log it.
             logging.exception("feature extraction failed for verb %s", v)
     return _flist, _labellist_str, _labellist_int
예제 #5
0
def get_features(tags=None, v="", v_idx=None, features=None):
    """Build an easy-adapted feature dict for one sentence.

    :param tags: parsed token tags handed to SentenceFeatures
    :param v: target verb string
    :param v_idx: index of the verb within the sentence
    :param features: names of the feature extractors to run
        ("chunk", "3gram", "5gram", "7gram", "dependency", "ne",
        "srl", "topic", "errorprob")
    :return: feature dict augmented by proc_easyadapt for the "tgt" domain
    """
    # Replace the original mutable default arguments (`tags=[]`,
    # `features=[]`) with None sentinels; behavior for callers that pass
    # values, or pass nothing, is unchanged.
    tags = [] if tags is None else tags
    features = () if features is None else features
    fe = SentenceFeatures(tags=tags, verb=v, v_idx=v_idx)
    if "chunk" in features:
        fe.chunk()
    if "3gram" in features:
        fe.ngrams(n=3)
    if "5gram" in features:
        fe.ngrams(n=5)
    if "7gram" in features:
        fe.ngrams(n=7)
    if "dependency" in features:
        fe.dependency()
    if "ne" in features:
        fe.ne()
    if "srl" in features:
        fe.srl()
    if "topic" in features:
        fe.topic()
    if "errorprob" in features:
        fe.ep()
    return proc_easyadapt(fe.features, domain="tgt")