import random
import xml.etree.ElementTree as ET

# EMOTION_CLASS and the flatten helper are provided elsewhere in the project.


def append_data(src, dest, classes):
    """
    read class c from src, append result to dest
    :param src: url
    :param dest: url
    :param classes: classes a list
    :return:
    """
    for c in classes:
        if c not in EMOTION_CLASS.keys():
            raise ValueError("%s is not a supported class" % c)

    src_tree = None
    dest_tree = None
    try:
        src_tree = ET.parse(src)
        dest_tree = ET.parse(dest)
    except IOError:
        print "cannot parse file"
        exit(-1)

    if src_tree and dest_tree:
        src_root = src_tree.getroot()
        dest_root = dest_tree.getroot()

        matches = [src_root.findall("weibo[@emotion-type='%s']" % c) for c in classes]
        matches = flatten(matches)
        random.shuffle(matches)

        for element in matches:
            dest_root.append(element)

        # write to file
        dest_tree.write(dest, encoding="utf-8")

        print "append data is done."
Example #2
    def __load(url, ratio, direction=True, subjective=True, balance=False):
        """
        Loading Training Data Except Objective Sentence
        :param url:
        :param direction: 默认从上往下取
        :param subjective: 加载主观句还是客观句,默认加载主观数据
                            True: 加载多类别,即情绪标签
                            False: 加载二类别,即主客观
        :param balance: 是否需要平衡加载数据集,默认以非平衡的方式加载
        :return:
        """
        # When loading objective data, balanced loading does not apply.
#        if not subjective and balance:
#            raise AttributeError("can not load data which is objective and use balanced way!")

        tree = None
        try:
            tree = ET.parse(url)
        except IOError:
            print "cannot parse file"
            exit(-1)
        if tree is not None:
            # get the root
            root = tree.getroot()

            # get the direct children
            # For unbalanced loading, treat all weibo elements as one class so the code below can be reused.
            # TODO: ElementTree XPath does not seem to support not/!= operators, so filter in Python for now.
            each_class = [[sentence for sentence in root.findall("weibo") if sentence.get("emotion-type") != "none"]]
            if not subjective:
                each_class = [root.findall("weibo[@emotion-type]")]

            if balance:
                each_class = [root.findall("weibo[@emotion-type='%s']" % c) for c in EMOTION_CLASS.keys()]
                if not subjective:
                    each_class = Load.partition(root.findall("weibo[@emotion-type]"),
                                                lambda x: x.get("emotion-type") == "none")

            each_class_size = [len(c) for c in each_class]
            each_class_range = [slice(int(n * ratio)) for n in each_class_size]
            if not direction:
                _reverse_ratio = 1 - ratio
                each_class_range = [slice(int(n * _reverse_ratio), n) for n in each_class_size]

            sentences = []
            for i, each in enumerate(each_class):
                _range = each_class_range[i]
                sentences.append([Load.integrate(sentence) for sentence in each[_range]])

            # shuffle
            sentences = flatten(sentences)
            # random.shuffle(sentences)

            return [{"sentence": sentence.text.encode("utf_8"),
                     "emotion-tag": sentence.get("emotion_tag"),
                     "emotion-1-type": sentence.get("emotion-type"),
                     "emotion-2-type": sentence.get("emotion-2-type")}
                    for sentence in sentences]
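
When balance is set and objective data is requested, the code calls Load.partition, which is not shown here. A minimal sketch (the helper would live as a static method on Load; the order of the two groups is an assumption):

def partition(items, predicate):
    # Split items into two groups: elements matching the predicate and the rest,
    # returned as a list of two lists so the caller can iterate over them as classes.
    matched, unmatched = [], []
    for item in items:
        (matched if predicate(item) else unmatched).append(item)
    return [matched, unmatched]

For the slicing logic: with ratio=0.8 and a class of 100 sentences, direction=True keeps items [0:80], while direction=False keeps items [20:100].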
Example #3
    def getclasses(self):
        return EMOTION_CLASS.keys()
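
EMOTION_CLASS itself never appears in these snippets. From the way .keys() is used for class names and XPath filters, it is presumably a dict keyed by emotion class name; a hypothetical shape (keys such as "anger", "fear", "surprise" appear elsewhere in these snippets, the values are placeholders):

EMOTION_CLASS = {
    "anger": 0,
    "fear": 1,
    "surprise": 2,
    # ... the remaining emotion classes used by the project
}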
Example #4
        if not FileUtil.isexist(out) or FileUtil.isempty(out):
            clf0 = Classification()
            clf0.cross_validation(train, class_label, score="recall")
        test_index = np.loadtxt(out, dtype=int)
        test = train[test_index]
        test_label = np.asanyarray(class_label)[test_index].tolist()

    method_options = ("second", "four", "five")
    method_options_0 = ("B", "C", "D")
    linestyle = (':', '--', '-')
    plot.get_instance()
    for i in range(len(method_options)):
        bayes = IncrBayes()
        clf = Classification(bayes=bayes)
        clf.get_classificator(train, class_label, iscrossvalidate=crossvalidate,
                              isbalance=False, minority_target=EMOTION_CLASS.keys())
#        clf.get_classificator(train, class_label, isbalance=True, minority_target=["anger", "fear", "surprise"])
        if i == 0:
            pred = clf.predict(test)
            pred_unknow = clf.predict_unknow(test)

            print "origin precision:", clf.metrics_precision(test_label, pred_unknow)
            print "origin recall:", clf.metrics_recall(test_label, pred_unknow)
            print "origin f1:", clf.metrics_f1(test_label, pred_unknow)
            print "origin accuracy:", clf.metrics_accuracy(test_label, pred_unknow)
            print "origin zero_one_loss:", clf.metrics_zero_one_loss(test_label, pred_unknow)
            test_proba = clf.predict_max_proba(test)
            print "origin my_zero_one_loss:", clf.metrics_my_zero_one_loss(test_proba)
            print
            clf.metrics_correct(test_label, pred_unknow)
#            plot.plot_roc(test_label, clf.predict_proba(test), classes=clf.bayes.classes_.tolist(), text='origin')
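
The cross-validation branch above checks an output file with FileUtil.isexist and FileUtil.isempty, which are not shown. A minimal sketch of what such helpers might look like, assuming they are thin wrappers around os.path:

import os

class FileUtil(object):
    @staticmethod
    def isexist(path):
        # True if the path exists on disk.
        return os.path.exists(path)

    @staticmethod
    def isempty(path):
        # True if the file has no content (assumes the path exists).
        return os.path.getsize(path) == 0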
Example #5
    def getclasses(self):
        if self.subjective:
            classes = EMOTION_CLASS.keys()
        else:
            classes = OBJECTIVE_CLASS.keys()
        return classes
Example #6
    clf = Classification()
    crossvalidate = False
    # If not cross-validating, remember to adjust the ratio used to load train in load_sample.py
    if crossvalidate:
        out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt")
        if not FileUtil.isexist(out) or FileUtil.isempty(out):
            clf.cross_validation(train, class_label, score="recall")
        test_index = np.loadtxt(out, dtype=int)
        test = train[test_index]
        test_label = np.asanyarray(class_label)[test_index].tolist()
    clf.get_classificator(train,
                          class_label,
                          iscrossvalidate=crossvalidate,
                          isbalance=False,
                          minority_target=EMOTION_CLASS.keys())

    pred = clf.predict(test)
    pred_unknow = clf.predict_unknow(test)
    #    print pred
    print "precision:", clf.metrics_precision(test_label, pred_unknow)
    print "recall:", clf.metrics_recall(test_label, pred_unknow)
    print "f1:", clf.metrics_f1(test_label, pred_unknow)
    print "accuracy:", clf.metrics_accuracy(test_label, pred_unknow)
    print "zero_one_loss:", clf.metrics_zero_one_loss(test_label, pred_unknow)
    test_proba = clf.predict_max_proba(test)
    print "my_zero_one_loss:", clf.metrics_my_zero_one_loss(test_proba)
    print
    clf.metrics_correct(test_label, pred_unknow)
    plot.get_instance()
    plot.plot_roc(test_label,
                  clf.predict_proba(test),
                  classes=clf.bayes.classes_.tolist())  # remaining arguments assumed from the plot_roc call shown in Example #4
Example #7
    def __each_class_text(datas, c):
        # Return the sentence texts in datas that belong to class c.
        if c not in EMOTION_CLASS.keys():
            raise ValueError("%s is not a supported emotion class" % c)
        return [data.get("sentence") for data in datas if data.get("emotion-1-type") == c]
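
A sketch of how __each_class_text consumes the dicts produced by __load in Example #2 (the sample data below is made up for illustration):

datas = [
    {"sentence": "...", "emotion-tag": "Y", "emotion-1-type": "anger", "emotion-2-type": "none"},
    {"sentence": "...", "emotion-tag": "Y", "emotion-1-type": "fear", "emotion-2-type": "none"},
]
anger_texts = __each_class_text(datas, "anger")  # -> the sentence texts labelled "anger"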