Example #1
    def load__tag_to_words_count__dict(self, model_cache, documents_with_features):
        print "load item_id_to_tags__dict ..."
        item_id_to_tags__dict = {item_id1: self.filter_valid_tags(model_cache.tags_model__extract_tags(item1))
                                 for item_id1, item1 in process_notifier(model_cache)}

        print "load tag_to_words_count__dict ..."
        tag_to_words_count__dict = defaultdict(lambda: defaultdict(lambda: 0))
        test_item_ids = model_cache.test_item_ids()
        for item_id1, words_count1 in process_notifier(documents_with_features):
            if item_id1 in test_item_ids:
                continue  # keep held-out test items out of the training counts
            for tag1 in item_id_to_tags__dict[item_id1]:
                for word1, count1 in words_count1.iteritems():
                    tag_to_words_count__dict[tag1][word1] += count1
        return tag_to_words_count__dict
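
Every example on this page goes through `process_notifier`, a project helper whose source is not shown. From the call sites it evidently wraps an iterable, reports progress, and yields elements unchanged: dict-like arguments come back as `(key, value)` pairs, plain sequences as items, and one call site (Example #11) passes a custom progress label. A minimal Python 2 stand-in under those assumptions, not the project's actual implementation:

    # stand-in sketch only: behavior inferred from call sites, not the real helper
    def process_notifier(iterable, label=u"[process]"):
        items = iterable.iteritems() if hasattr(iterable, 'iteritems') else iterable
        total = len(iterable) if hasattr(iterable, '__len__') else None
        for idx1, item1 in enumerate(items, 1):
            if total and (idx1 % 1000 == 0 or idx1 == total):
                print "%s %i/%i" % (label, idx1, total)
            yield item1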
Example #2
    def divided_into_two_parts(self):
        """ Duplicates are only a small fraction, so the _set variables store just the IDs that may be duplicated. """
        candidate_set = set([])
        uniq_set = set([])

        feature1_dict = dict(self.default_features.items() +
                             self.custom_features.items())

        # GROUP BY cannot express ranges (e.g. integer range queries),
        # so only string-typed features become grouping columns.
        group_by_columns = [
            f1 for f1 in feature1_dict if feature1_dict[f1] == str
        ]

        if group_by_columns:
            table = self.features_tree
            group_by_query = [getattr(table, f1) for f1 in group_by_columns]
            group_concat = [
                peewee_fn.group_concat(table.item_id).alias('item_ids')
            ]

            group_by_sql = table.select(*group_concat).group_by(
                *group_by_query)
            for i1 in process_notifier(group_by_sql):
                items_len = len(i1.item_ids)
                if items_len > 24:
                    candidate_set = candidate_set | set(i1.item_ids.split(","))
                elif items_len == 24:  # exactly one object_id
                    uniq_set.add(i1.item_ids)
                else:
                    raise Exception("item_ids is invalid")
        else:
            print feature1_dict, "has no string-typed features"

        return (list(candidate_set), list(uniq_set))
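
The magic number 24 relies on a MongoDB ObjectId serializing to exactly 24 hex characters, so a `group_concat` string longer than that must contain several comma-separated IDs. A quick illustration with sample IDs:

    one_id = "507f1f77bcf86cd799439011"  # a single ObjectId is 24 hex chars
    many_ids = one_id + "," + "507f191e810c19729de860ea"
    assert len(one_id) == 24
    assert len(many_ids) > 24 and len(many_ids.split(",")) == 2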
Example #3
    def mr_run(self):
        """ Overwrite BaseHadoopJobTask#run function. """
        # TODO maybe model cache
        map_kv_dict = defaultdict(list)

        # map phase: feed every input line through self.mapper
        inputs = self.input()
        if not isinstance(inputs, list):
            inputs = [inputs]
        for input_hdfs_1 in inputs:
            for line2 in TargetUtils.line_read(input_hdfs_1):
                for map_key_3, map_val_3 in self.mapper(line2):
                    map_kv_dict[map_key_3].append(map_val_3)

        # reduce phase: write reducer output in chunks of self.chunk_size
        with self.output().open("w") as output1:
            fixed_chunk = list()
            for reduce_key_2 in process_notifier(map_kv_dict.keys()):
                reduce_vals_2 = map_kv_dict[reduce_key_2]
                for _, reduce_val_2 in self.reducer(reduce_key_2,
                                                    reduce_vals_2):
                    fixed_chunk.append(reduce_val_2)

                    if len(fixed_chunk) == self.chunk_size:
                        output1.write("\n".join(fixed_chunk) + "\n")
                        fixed_chunk = list()
                del map_kv_dict[reduce_key_2]  # free memory as each key finishes
            if fixed_chunk:  # flush the final partial chunk
                output1.write("\n".join(fixed_chunk) + "\n")
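
`mr_run` expects `self.mapper(line)` to yield `(key, value)` pairs, and `self.reducer(key, values)` to yield pairs of which only the value is written out, so the value should already be a formatted string. A hypothetical word-count pair that fits this protocol (names made up):

    # hypothetical mapper/reducer matching the protocol mr_run expects
    def mapper(self, line):
        for word1 in line.split():
            yield word1, 1

    def reducer(self, reduce_key, reduce_vals):
        yield reduce_key, "%s\t%i" % (reduce_key, sum(reduce_vals))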
Example #4
    def write(tmp_items):
        if self.output_lambda:
            self.output_lambda([i1[1] for i1 in tmp_items])
        else:
            for item_id, item1 in process_notifier(tmp_items):
                self.result[item_id] = item1
            self.result.sync()
        return []
Example #5
    def filtered_item_ids_by_has_tags():
        print "[load from original data]\n"
        cls.pull_data()
        # filter out items that have no tags
        return [
            item_id1 for item_id1, item1 in process_notifier(cls)
            if cls.tags_model__extract_tags(item1)
        ]
Example #6
    def load_features_with_weight(self, tag_to_features_count__dict):
        for tag1, features_counter1 in process_notifier(tag_to_features_count__dict):
            for node1 in self.name_to_nodes[tag1]:
                node1.features_weight = self.classify.calculate_features_weight(features_counter1)

                for f1 in node1.features_weight.keys():  # temporary fix: strip stop words
                    if f1 in self.classify.features_weight_machine.stop_words_set:
                        del node1.features_weight[f1]

                for feature1 in features_counter1:
                    self.feature_to_nodes[feature1].add(node1)
        return self
Example #7
        def func():
            feature_in_doc_to_count = defaultdict(int)
            for item_id1, features in process_notifier(self.documents):
                for feature1 in features.iterkeys():
                    feature_in_doc_to_count[feature1] += 1

            idf_result = IdfResult()
            all_num = float(len(self.documents))

            for feature1, count1 in feature_in_doc_to_count.iteritems():
                idf_result[feature1] = math.log(all_num / count1)

            return idf_result
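
`func` computes the classic inverse document frequency: a feature occurring in `count1` of `all_num` documents gets weight `log(all_num / count1)`, so rare features score high and ubiquitous ones approach zero. A quick sanity check in plain Python:

    import math
    print math.log(1000 / 10.0)   # feature in 10 of 1000 docs: 4.60517018599
    print math.log(1000 / 900.0)  # feature in 900 of 1000 docs: 0.105360515658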
Example #8
    def evaluate_effect(self, default_guess_count=10):
        if not self.is_run_test:
            print "is_run_test=True is not set!"
            exit(0)

        test_items = self.model.test_items()
        print "evaluating", len(test_items), "items"

        self.tags_tree.inspect('features_weight')

        result = [[self.model.tags_model__extract_tags(item1), self.recommend_tags(item1)]
                  for item1 in process_notifier(test_items)]

        def inspect_result(result, filter_item_ids=set([])):
            for idx1, two_parts in enumerate(result):
                print "item No.", idx1 + 1
                original_tags, recommend_data = two_parts
                # an empty filter means "inspect every item"
                if filter_item_ids and recommend_data['item_id'] not in filter_item_ids:
                    continue

                print "question ID", recommend_data['item_id']
                print "question content", recommend_data['item_content']
                uprint(u"keywords => entropy", recommend_data['features_weight'])
                uprint(u"original tags:", original_tags)
                uprint(u"recommended tags:", recommend_data['recommend_tags'])
                uprint(u"recommendation details:", recommend_data['recommend_tags_detail'])
                print "\n" * 3
        inspect_result(result)

        evaluate_items = [{"object_id": recommend_data['item_id'],
                           "original_tags": original_tags,
                           "recommend_tags": [i1['name'] for i1 in recommend_data['recommend_tags']]}
                          for original_tags, recommend_data in result]

        # manual evaluation helper, intended to be called from the pdb session below
        def ee(num=1):
            self.evaluate([{"object_id": recommend_data['item_id'],
                            "original_tags": original_tags,
                            "recommend_tags": [i1['name'] for i1 in recommend_data['recommend_tags'][0:num]]}
                           for original_tags, recommend_data in result])
        # --------------------------------------------

        # e.g. junior-high physics: how often each path of the tag tree is
        # referenced, as a distribution like [(0, 357), (1, 217), (2, 24), (3, 1)]
        self.tags_tree.distribution([i1['recommend_tags'] for i1 in evaluate_items])
        self.evaluate(evaluate_items)

        if ('eval_result' in evaluate_items[0]) and False:  # disabled via `and False`
            eval_items2 = filter(lambda i1: i1['eval_result'], evaluate_items)
            inspect_result(result, [i1['object_id'] for i1 in eval_items2])
            self.tags_tree.distribution([i1['original_tags'] for i1 in eval_items2])
        import pdb
        pdb.set_trace()
Example #9
        def load__nested_region_dict():
            # data format
            # [ {"name":"浙江", "code":31, "parent_code":1}, ... ]
            data = list(self.nested_region_data__func())
            assert len(data) > 0
            assert isinstance(data[0], dict)
            assert "name"        in data[0]
            assert "code"        in data[0]
            assert "parent_code" in data[0]

            print "load name_to_codes__dict ..."
            name_to_codes__dict = defaultdict(list)
            for d1 in process_notifier(data):
                name_to_codes__dict[ru_regexp.strip_regexp.sub("", d1['name'])].append(d1['code'])
            name_to_codes__dict = dict(name_to_codes__dict)

            print "load code_to_name__dict ..."
            code_to_name__dict = { d1['code'] : d1['name'] for d1 in process_notifier(data) }

            print "load codes_relations ..."
            codes_relations = { d1['code'] : d1['parent_code'] for d1 in process_notifier(data) }

            return [name_to_codes__dict, code_to_name__dict, codes_relations]
Example #10
        def func():
            # 1. fetch all features
            uniq_keys = set([])
            for item_id1, item1 in process_notifier(d1):
                uniq_keys.update(item1.iterkeys())
            uniq_keys = list(uniq_keys)

            # 2. feature1 => {doc1: count1, doc2: count2, ...}
            value_cache = defaultdict(dict)
            for item_id1, item1 in process_notifier(d1):
                for k1, c1 in item1.iteritems():
                    value_cache[k1][item_id1] = c1

            # 3. calculate each feature's entropy
            entropy_cache = dict()
            total_len = len(d1)
            for k1 in process_notifier(uniq_keys):
                exist_values = value_cache[k1].values()
                # pad with zeros for the documents that lack this feature
                total_values = exist_values + [0] * (total_len - len(exist_values))

                entropy_cache[k1] = scipy_entropy(total_values)

            return entropy_cache
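
`scipy_entropy` is presumably `scipy.stats.entropy`, which normalizes the raw count vector into a probability distribution before computing Shannon entropy (zero counts contribute nothing). Features spread evenly across documents therefore score high, concentrated ones low:

    from scipy.stats import entropy as scipy_entropy
    print scipy_entropy([1, 1, 1, 1])   # 1.38629436112 -- evenly spread, high
    print scipy_entropy([10, 0, 0, 0])  # 0.0 -- concentrated in one doc, low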
Example #11
    def run(self):
        self.run_before_hook()

        # 1. check is already done.
        if self.is_collection_exists():
            print "[info] %s already exists!" % (self.data_file_collection_model, )
            return False

        # 2. check report status collection is valid
        if self.report_status_collection_model.count() == 0:
            self.report_status_collection_model.insert(
                {self.report_status_namespace: {}})
        assert self.report_status_collection_model.count() == 1, "there must be exactly one status record!"

        # 3. output json with err
        data_file1 = self.source_task_instance.data_file
        source1 = luigi.HDFS(data_file1)
        with open(self.tmp_filepath, 'w') as tmp_file1:
            for line1 in process_notifier(
                    TargetUtils.line_read(source1), u"[read lines] %s" % source1):
                tmp_file1.write(line1 + "\n")

        # 4. upload to mongodb
        CommandUtils.execute(self.mongo_ensure_index)
        CommandUtils.execute(self.mongoimport_command)

        # 5. clean tmp
        CommandUtils.execute("rm -f %s" % self.tmp_filepath)

        # 6. update report status
        item1 = self.report_status_collection_model.find()[0]
        del item1['_id']
        item1[self.report_status_namespace][self.report_name] = {
            'collection_name': self.collection_name,
            'updated_at': arrow.now().datetime,
        }
        self.report_status_collection_model.find_and_modify(
            query={},
            update={"$set": item1},
            full_response=True
        )

        self.run_after_hook()

        return True
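
Step 4 shells out to two prebuilt command strings. `self.mongoimport_command` is not shown, but for the newline-delimited JSON tmp file written in step 3 it plausibly looks like the following (database and collection names are made up):

    # plausible shape of self.mongoimport_command; names are illustrative only
    mongoimport_command = ("mongoimport --db report_db --collection report_data"
                           " --file /tmp/report_data.json")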
Example #12
        def load__region_unit_data():
            data = list(self.region_unit_data__func())
            assert len(data) > 0
            assert isinstance(data[0], dict)
            assert "id"          in data[0]
            assert "name"        in data[0]

            feature_to_unit_ids__dict = defaultdict(list)
            id_to_name__dict = dict()
            for line1 in process_notifier(data):
                id_to_name__dict[line1['id']] = line1['name']
                features = jieba_parse(line1['name'])
                # TODO strip special characters such as "-"
                source1_region = ru_regexp.separate_regiones(line1['name'])[0]

                for kv in itertools.chain(*self.get_region_lines(source1_region)):
                    features.extend(kv.values())
                for feature1 in set(features):
                    feature_to_unit_ids__dict[feature1].append(line1['id'])
            return [id_to_name__dict, dict(feature_to_unit_ids__dict)]
Example #13
    def cache__total_tag_to_features__dict(self):
        total_tag_to_features__dict = defaultdict(lambda: defaultdict(int))

        # calculate model_cache's freq distribution
        test_item_ids = self.model.test_item_ids()
        for item_id1, item1 in process_notifier(self.model):
            if item_id1 in test_item_ids:
                continue

            item1_features_dict = self.extract_features_weight(item1)

            for tag2 in self.model.tags_model__extract_tags(item1):
                dict3 = total_tag_to_features__dict[tag2]
                for feature4, count4 in item1_features_dict.iteritems():
                    dict3[feature4] += count4

        # convert the inner defaultdicts to plain dicts (drop the factory functions)
        for k1 in total_tag_to_features__dict.keys():
            total_tag_to_features__dict[k1] = dict(total_tag_to_features__dict[k1])

        return dict(total_tag_to_features__dict)
Example #14
    def connect(cls, original_model, **kwargs):
        # assert original_model's behavior
        process_notifier(original_model)

        # setup args
        # NOTE below code is copied to README.
        default_kwargs = {
            'cache_dir': os.getenv("ModelCacheDir"),  # the default.

            # available storage_types are ['memory', 'sqlite', 'shelve', 'redis']. default
            # is 'shelve', which is faster than sqlite
            'storage_type': 'shelve',

            # Re-sync when len(ModelCache) drops below `percentage` of the
            # original model's size. NOTE currently unused.
            'percentage': 0.9999,

            'filter_lambda': lambda item1: False,
            'read_id_lambda': lambda item1: str(item1['_id']),
            'included_class': object,
        }
        for k1, v1 in kwargs.iteritems():
            if k1 in default_kwargs:
                default_kwargs[k1] = v1

        # validate storage
        assert default_kwargs['storage_type'] in valid_storages
        if (default_kwargs['cache_dir'] is None) and (default_kwargs['storage_type'] != "memory"):
            raise Exception(u"`cache_dir` should not be None when storage_type is not memory.")

        cache_dir = default_kwargs['cache_dir']
        del default_kwargs['cache_dir']

        # decorate class
        def _model_cache_decorator(decorated_class):
            # 1. included_class should not overwrite ModelCacheClass's important methods,
            #    include `__init__`, `init__before`, `init__after`.
            # 2. ensure decorated_class's methods will overwrite ModelCache's.

            for k1 in ['init__before', 'init__after']:
                if k1 in dir(default_kwargs['included_class']):
                    setattr(ModelCacheClass, k1, getattr(default_kwargs['included_class'], k1))

            class _model_cache(decorated_class, ModelCacheClass, default_kwargs['included_class']):

                class OriginalClass():
                    pass  # so we can setattr here.
                original = OriginalClass()
                for k1, v1 in default_kwargs.iteritems():
                    setattr(original, k1, v1)
                    del k1
                    del v1
                original.model = original_model

                # Thx http://stackoverflow.com/questions/4932438/how-to-create-a-custom-string-representation-for-a-class-object/4932473#4932473
                class MetaClass(type):
                    __metaclass__ = Forwardable
                    _ = def_delegators('datadict', dict_attrs)

                    def __repr__(self):
                        is_total_len_enough = len(self) > 5
                        while is_total_len_enough and (len(self.first_five_items) < 5):
                            for item_id1, item1 in self.iteritems():
                                self.first_five_items.append(item1)
                                if len(self.first_five_items) == 5:
                                    break

                        dots = ", ......" if is_total_len_enough else ""
                        return (u"<%s has %i items:[%s%s]>" %
                                (self.__name__, len(self),
                                 ", ".join([str(item1.item_id) for item1 in self.first_five_items]),
                                 dots, )).encode("UTF-8")
                __metaclass__ = MetaClass

                @classmethod
                def pickle_path(cls, name):
                    return cls.cache_dir + "/" + name + ".cPickle"

            _model_cache.__name__ = decorated_class.__name__
            _model_cache.__module__ = decorated_class.__module__  # so can pickle :)

            _model_cache.first_five_items = []

            _model_cache.cache_dir = os.path.join(cache_dir or u"", _model_cache.__name__)
            if default_kwargs['storage_type'] != 'memory':
                if not os.path.isdir(_model_cache.cache_dir):
                    os.makedirs(_model_cache.cache_dir)
            _model_cache.dbpath = None
            if _model_cache.cache_dir:
                _model_cache.dbpath = os.path.join(_model_cache.cache_dir, _model_cache.__name__ + ".db")

            _model_cache.connect()

            return _model_cache
        return _model_cache_decorator
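
`connect` is a decorator factory: applied to a plain class, it returns a subclass that mixes in `ModelCacheClass` plus the optional `included_class`, carries its settings on `original`, keeps the decorated class's name and module (so pickling still works), and connects to its storage immediately. A hypothetical usage, assuming the classmethod lives on a `ModelCache` class; `questions` (a pymongo collection) and `QuestionCache` are made up:

    # hypothetical usage; ModelCache, questions and QuestionCache are assumed names
    @ModelCache.connect(questions, storage_type='memory')
    class QuestionCache(object):
        pass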