def cal_weight_improve(self, key_words, class_label):
    """
    Compute the weight matrix for the extracted feature words.
    :param key_words: [{'sentence': {}}, ...] or [{}, ...]; may be training-set or test-set data
    :return: the tf-idf weight matrix
    """
    print "Cal Improve Weight: ", time.strftime('%Y-%m-%d %H:%M:%S')

    if not self.istrain:
        # For test data, read the training key words back from file to fit the idf.
        dir_ = os.path.join(TEXT_OUT, "key_words")
        filename = self.__class__.__name__ + ".txt" if self.subjective else self.__class__.__name__ + "_objective.txt"
        url = os.path.join(dir_, filename)
        train_key_words = FileUtil.read(url)
        train_class_label = [d.get("emotion-1-type") for d in train_key_words]
    else:
        train_key_words = key_words
        train_class_label = class_label

    train_key_words = [d.get("sentence") if "sentence" in d else d for d in train_key_words]
    key_words = [d.get("sentence") if "sentence" in d else d for d in key_words]
    # Term frequency: divide each count by the sentence total.
    # float() guards against integer division under Python 2.
    key_words = [{k: v / float(sum(d.values())) for k, v in d.items()} for d in key_words]

    fit_train_key_words = Feature_Hasher.transform(train_key_words)
    fit_key_words = Feature_Hasher.transform(key_words)

    tfidf = TfidfImprove()
    # Fit the idf on the training set, then weight the current data.
    tfidf.fit(fit_train_key_words, train_class_label)
    weight_matrix = tfidf.transform(fit_key_words, class_label)

    print "Cal Weight Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    print
    return weight_matrix
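# A minimal sketch of the weighting pipeline in cal_weight_improve, using
# standard scikit-learn parts as stand-ins: FeatureHasher for the project's
# Feature_Hasher, and the stock TfidfTransformer for TfidfImprove (whose
# class-conditional fit/transform signature is not shown in this module,
# so the class labels are omitted here). The toy dicts are hypothetical
# data, not taken from the corpus.
def _demo_weight_pipeline():
    from sklearn.feature_extraction import FeatureHasher
    from sklearn.feature_extraction.text import TfidfTransformer

    train_key_words = [{"good": 2, "happy": 1}, {"bad": 3}]
    test_key_words = [{"good": 1, "sad": 1}]

    # Term frequency: each count divided by the sentence total
    # (float() avoids integer division under Python 2).
    test_tf = [{k: v / float(sum(d.values())) for k, v in d.items()}
               for d in test_key_words]

    hasher = FeatureHasher(n_features=2 ** 10)
    tfidf = TfidfTransformer()
    tfidf.fit(hasher.transform(train_key_words))   # learn idf on the training set
    return tfidf.transform(hasher.transform(test_tf))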
def _check_feature_size(url):
    """Count the number of distinct feature words stored in the file at `url`."""
    l = []
    for line in FileUtil.read(url):
        # line["sentence"] is a {word: value} dict; joining its keys with ","
        # and splitting again collects them as a flat list of words.
        line = ",".join(line.get("sentence"))
        line = line.split(",")
        l.append(line)
    feature_size = set(flatten(l))
    return len(feature_size)
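# What _check_feature_size computes, shown on in-memory data: the count of
# distinct feature words across all records. The records below are
# hypothetical but mirror the {"sentence": {word: value}} shape written by
# _collect; taking the dict keys directly is equivalent to the join/split
# round-trip above.
def _demo_check_feature_size():
    records = [{"sentence": {"good": 2, "happy": 1}},
               {"sentence": {"good": 1, "bad": 3}}]
    vocab = set()
    for record in records:
        vocab.update(record["sentence"].keys())
    return len(vocab)  # 3 distinct feature words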
def _get_splited_train(self):
    """
    Prefer reading the segmented training set from file; recompute only when
    forced or when the cached file is missing or empty.
    :return:
    """
    dir_ = os.path.join(TEXT_OUT, "split")
    if self.subjective:
        split_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
        training_datas = Load.load_training_balance()
    else:
        split_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")
        training_datas = Load.load_training_objective_balance()

    if self.f or not FileUtil.isexist(split_txt) or FileUtil.isempty(split_txt):
        # Load and segment the training set; each sentence keeps its class label.
        splited_words_list = Feature.__split(flatten(training_datas))
        # splited_words_list = Feature.__del_low_frequency_word(splited_words_list)
        FileUtil.write(split_txt, splited_words_list)
    else:
        splited_words_list = FileUtil.read(split_txt)
    return splited_words_list
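# The cache-or-recompute pattern behind _get_splited_train, reduced to a
# skeleton. json stands in for FileUtil's serialization (not shown in this
# module), and load_or_build/build are hypothetical names for illustration.
def load_or_build(path, build, force=False):
    """Return cached results from path; rebuild when forced, missing, or empty."""
    import json
    if force or not os.path.exists(path) or os.path.getsize(path) == 0:
        data = build()              # the expensive step (word segmentation here)
        with open(path, "w") as f:
            json.dump(data, f)
        return data
    with open(path) as f:
        return json.load(f)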
def _collect(self, splited_words_list, sentence_size):
    dir_ = os.path.join(TEXT_OUT, "key_words")
    if self.subjective:
        key_words_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
    else:
        key_words_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")

    # Two in-method helpers used to live here as commented-out code:
    # norm(), which divided every score in a sample by the sample's p-norm
    # ||X||_p = (|x1|^p + ... + |xn|^p)^(1/p) so the normalized p-norm is 1,
    # and reduce_dim(), which kept the highest-scoring words until their
    # cumulative score exceeded sqrt(n) * 0.85. The self.norm() and
    # self.reduce_dim() calls below replace them (see the sketch after this
    # method).

    if not self.istrain or self.f or not FileUtil.isexist(key_words_txt) or FileUtil.isempty(key_words_txt):
        print "Cal Scores: ", time.strftime('%Y-%m-%d %H:%M:%S')

        if len(splited_words_list) == sentence_size:
            train_range = slice(sentence_size)
        else:
            train_range = slice(sentence_size, len(splited_words_list))

        # Texts grouped by class.
        all_class_datas = Feature.all_class_text(splited_words_list[train_range], self.getclasses())
        # Class labels.
        class_label = [d.get("emotion-1-type") for d in splited_words_list[: sentence_size]]

        res = []
        for splited_words_dict in splited_words_list[0: sentence_size]:
            splited_words = splited_words_dict.get("sentence")
            label = splited_words_dict.get("emotion-1-type")
            # Score every word: {word: [score, frequency], ...}
            scores = {splited_word: [self.cal_score(splited_word, splited_words, label,
                                                    all_class_datas,
                                                    [d.get("sentence") for d in splited_words_list[train_range]]),
                                     frequency]
                      for splited_word, frequency in splited_words.items()}
            res.append({"sentence": scores,
                        "emotion-1-type": splited_words_dict.get("emotion-1-type")})
        print "Cal Scores Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # FileUtil.write(TEST_BASE_URL + "scores.txt", res)

        print "Begin Normalization: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Normalize each sample.
        self.norm(res)
        # FileUtil.write(TEST_BASE_URL + "norm.txt", res)
        print "Normalization Done: ", time.strftime('%Y-%m-%d %H:%M:%S')

        print "Begin Reduce: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Dimensionality reduction.
        self.reduce_dim(res)
        print "Reduce Done: ", time.strftime('%Y-%m-%d %H:%M:%S')

        # Collapse each [score, frequency] pair to a single value:
        # v[1] keeps the term frequency; use v[0] instead for the term score.
        for d in res:
            ws = d.get("sentence")
            for k, v in ws.items():
                ws[k] = v[1]

        # Segmentation or dimensionality reduction can leave a sample with no
        # key words at all; drop those samples and record their indices.
        danger_index = []
        res = filter(lambda x: danger_index.append(x[0]) if not x[1].get("sentence") else x,
                     enumerate(res))
        res = list(zip(*res)[1])
        class_label = [c for i, c in enumerate(class_label) if i not in danger_index]

        # Persist the training result.
        if self.istrain:
            FileUtil.write(key_words_txt, res)
    else:
        res = FileUtil.read(key_words_txt)
        class_label = [r["emotion-1-type"] for r in res]
        danger_index = []

    # Print top key-word statistics (disabled debug output).
    if False:
        self.__print_top_key_word(res)
    return res, class_label, danger_index
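# The two helpers that _collect's commented-out code described, reconstructed
# as standalone functions. This is a sketch of what self.norm and
# self.reduce_dim plausibly do per sample; the actual methods live elsewhere
# in the class (and operate on [score, frequency] pairs rather than bare
# scores), so treat this as an illustration of the math only.
import math

def _norm_sample(word_scores, p=2):
    """Divide every score by ||X||_p = (|x1|^p + ... + |xn|^p)^(1/p),
    so the sample's p-norm becomes 1."""
    norm_p = math.pow(sum(math.pow(abs(v), p) for v in word_scores.values()), 1.0 / p)
    for k, v in word_scores.items():
        word_scores[k] = v / norm_p

def _reduce_dim_sample(word_scores):
    """Keep the highest-scoring words until their cumulative score exceeds
    sqrt(n) * 0.85, discarding the long tail."""
    cap = math.sqrt(len(word_scores)) * 0.85
    kept, total = {}, 0.0
    for k, v in sorted(word_scores.items(), key=lambda x: x[1], reverse=True):
        if total > cap:
            break
        kept[k] = v
        total += v
    return kept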