Example #1
    def lookup_merge_func(self, site, topic_id, path, ttype, plugin_merge_funcs):
        # 1. Lookup with specific site info
        # 2. Lookup with topic only
        # 3. Lookup with attribute type
        # 4. Use the plugin default merge rule
        # 5. Use default merge rule: the new one overrides the old one

        str_path = PathUtil.get_match_path(path)

        rule, exist = lookup_dict_path(self.topic_site_rule_dict,
                                       [topic_id, site, str_path])
        # exist distinguishes an explicit None stored in the dict
        # from a missing entry
        if rule is None:
            rule, exist = lookup_dict_path(self.topic_site_rule_dict,
                                           [topic_id, '*', str_path])
        if rule is None:
            rule, exist = lookup_dict_path(self.topic_site_rule_dict, [ttype])
        if rule is None:
            ttype = SMMergeType.DEFAULT
            spec = None
        else:
            ttype = rule.type
            spec = rule.spec

        merge_func = plugin_merge_funcs.get(ttype)
        if merge_func is None:
            merge_func = self.merge_func_dict.get(ttype)
        if merge_func is None:
            raise Exception("single src merge func not registed")

        return lambda old, old_trace, input, input_ts, input_url, k, path, existing_meta : \
            merge_func(old, old_trace, input, input_ts, input_url, k, path, existing_meta, spec=spec)
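
Both `lookup_merge_func` above and `validate` below depend on a `lookup_dict_path` helper that is not shown in this file. A minimal sketch of the contract the call sites imply, assuming the helper walks a nested dict along a list of keys and returns a `(value, exists)` pair so that an explicitly stored `None` can be told apart from a missing entry (the real implementation may differ):

def lookup_dict_path(d, path):
    # Sketch of the assumed helper: walk `d` along the keys in `path` and
    # return (value, exists). `exists` goes False as soon as a key is
    # missing, which is what lets callers distinguish an explicit None
    # stored in the dict from an absent entry.
    keys = path if isinstance(path, (list, tuple)) else [path]  # assumed
    node = d
    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return None, False
        node = node[key]
    return node, True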
Example #2
    def validate(self, doc, topic_id, site):
        pks = self.topic_manager.get_primary_keys_by_id(topic_id)
        # pks is the list of primary-key columns; if it is empty, url is
        # used as the pk, so there is nothing to check here (every doc
        # fragment MUST have a url)
        lack_pks = []
        for col in pks:
            value, exists = lookup_dict_path(doc, col)
            if not exists or value is None or str(value).strip() == '':
                lack_pks.append(col)
        if lack_pks:
            raise ValidateError('Primary key validate error: lack of %s' %
                                str(lack_pks),
                                lack_pks,
                                is_lacking_required=True)
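
The call above fixes the shape of `ValidateError`: a message, the list of missing columns, and an `is_lacking_required` flag. A sketch of an exception class consistent with that call site, followed by a hypothetical caller (`merger` and `doc_fragment` are illustrative names, not from this codebase):

class ValidateError(Exception):
    # Carries the missing primary-key columns so callers can react to the
    # failure without parsing the message string.
    def __init__(self, message, lack_pks, is_lacking_required=False):
        super().__init__(message)
        self.lack_pks = lack_pks
        self.is_lacking_required = is_lacking_required

# Hypothetical usage: reject a fragment that lacks required pk columns.
try:
    merger.validate(doc_fragment, topic_id, site)
except ValidateError as e:
    log.warning("fragment rejected, missing pk columns: %s", e.lack_pks)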
Example #3
    def process(self, download_time, site, url, doc_fragment, topic_id,
                plugin_merge_funcs):
        from i_util.global_defs import MetaFields
        site_record_id = doc_fragment['_site_record_id']
        rk = make_hbase_single_src_rk(site, doc_fragment, topic_id)
        table_name = self.topic_manager.get_table_name_by_id(
            topic_id) + "_single_src"
        latest_merged_doc, meta = self.fetch_existing_fragment(table_name, rk)
        latest_merged_doc_copy = copy.deepcopy(latest_merged_doc)

        def give_up():
            return None, latest_merged_doc_copy, rk, meta, False, table_name

        # if MetaFields.TOPO_TOKEN_RAW in doc_fragment:
        #     if doc_fragment[MetaFields.TOPO_TOKEN_RAW] in meta[HBaseDefs.SINGLE_SRC_SEEN_TOPO_TOKENS]:
        #         log.warning("This topo token has been seen before : " + doc_fragment[MetaFields.TOPO_TOKEN_RAW])
        #         return give_up()
        #     else:
        #         meta[HBaseDefs.SINGLE_SRC_SEEN_TOPO_TOKENS].append(doc_fragment[MetaFields.TOPO_TOKEN_RAW])

        do_merge = False
        token_based = False
        clear_existing = False
        new_merge = False
        merge_base = None
        merge_base_path = None
        incoming_uuid = ""
        incoming_topo_token = doc_fragment.get(MetaFields.TOPO_TOKEN, None)
        if incoming_topo_token is not None:
            token_based = True

        if meta[HBaseDefs.SINGLE_SRC_IS_TOPO_TOKEN_BASED] not in (None, token_based):
            log.debug("token_based changed, drop old data and restart merging")
            clear_existing = True

        if not clear_existing:

            if token_based:
                topo_info = TopoInfoGenerator(token=incoming_topo_token)
                # Topo-token-based merge: find the base doc to merge the
                # incoming fragment into
                incoming_uuid = topo_info.get_uuid()
                if meta[HBaseDefs.SINGLE_SRC_CURRENT_UUID] == '':
                    log.debug(
                        "Process uuid = %s for the first time, site_record_id = %s"
                        % (incoming_uuid, site_record_id))
                    new_merge = True
                    meta[HBaseDefs.SINGLE_SRC_CURRENT_UUID] = incoming_uuid
                    meta[HBaseDefs.SINGLE_SRC_IS_TOPO_TOKEN_BASED] = True
                elif incoming_uuid != meta[HBaseDefs.SINGLE_SRC_CURRENT_UUID]:
                    # This fragment comes from a different crawl of the doc;
                    # start merging from the beginning
                    log.debug(
                        "Got a fragment from a different crawling process of this doc, clear existing fragments. "
                        "new uuid = %s old uuid = %s, site_record_id = %s" %
                        (incoming_uuid,
                         meta[HBaseDefs.SINGLE_SRC_CURRENT_UUID],
                         site_record_id))
                    clear_existing = True

                if topo_info.is_base_doc():
                    merge_base, merge_base_path = latest_merged_doc, []
                else:
                    merge_base, merge_base_path = find_target_subdoc(
                        latest_merged_doc, MetaFields.TOPO_TOKEN,
                        incoming_topo_token)

                if merge_base is not None:
                    do_merge = True
                else:
                    # A dangling fragment whose parent has not arrived yet;
                    # save it for later
                    log.debug(
                        "Got a fragment that cannot be merged yet, saving it as dangling. uuid = %s, topo token = %s, "
                        "site_record_id = %s" %
                        (incoming_uuid, incoming_topo_token, site_record_id))
                    meta[HBaseDefs.SINGLE_SRC_DANGLING_FRAGMENTS].append(
                        (doc_fragment, download_time, url))
                    do_merge = False

            else:
                # Not a topo-token-based fragment: merge it into the root doc
                log.debug(
                    "Got a fragment without topo token, merge it to the base doc, site_record_id = %s"
                    % (site_record_id))
                do_merge = True
                merge_base = latest_merged_doc
                merge_base_path = []
                meta[HBaseDefs.SINGLE_SRC_IS_TOPO_TOKEN_BASED] = False

        if clear_existing:
            latest_merged_doc = {}
            new_meta = copy.deepcopy(SingleSourceMergerImpl.initial_meta)
            new_meta[HBaseDefs.SINGLE_SRC_SEQ] = meta[HBaseDefs.SINGLE_SRC_SEQ]
            # Set these on new_meta; the old meta is about to be replaced
            new_meta[HBaseDefs.SINGLE_SRC_IS_TOPO_TOKEN_BASED] = token_based
            new_meta[HBaseDefs.SINGLE_SRC_CURRENT_UUID] = incoming_uuid
            meta = new_meta
            merge_base = latest_merged_doc
            merge_base_path = []
            do_merge = True

        log.debug(
            "Operations for incoming doc: do_merge = %s, new_merge = %s, "
            "clear_existing = %s, token_based = %s, merge_base_path = %s" %
            (str(do_merge), str(new_merge), str(clear_existing),
             str(token_based),
             json.dumps(PathUtil.get_human_readable_path(merge_base_path))))

        if do_merge:
            trace_base, exist = lookup_dict_path(
                meta[HBaseDefs.SINGLE_SRC_ATTR_TIME_TRACE], merge_base_path)
            assert exist
            dl_time_key = HBaseDefs.SINGLE_SRC_META_SEEN_LATEST_DL_TIME
            if meta[dl_time_key] < download_time:
                meta[dl_time_key] = download_time
            meta[HBaseDefs.SINGLE_SRC_META_MERGED_TIMES] += 1

            new_merged_fragment, merged_trace, meta, data_changed = \
                self.merge_fragment(site, topic_id, merge_base, trace_base,
                                    meta, doc_fragment, download_time, url,
                                    plugin_merge_funcs, base_path=merge_base_path)

            while True:
                if len(meta[HBaseDefs.SINGLE_SRC_DANGLING_FRAGMENTS]) == 0:
                    log.debug("No dangling fragment to process")
                    break
                else:
                    merge_happened = False
                    for i, (dangling, download_time_d, url_d) in enumerate(
                            meta[HBaseDefs.SINGLE_SRC_DANGLING_FRAGMENTS]):
                        dangling_topo_token = dangling[MetaFields.TOPO_TOKEN]
                        merge_base, merge_base_path = find_target_subdoc(
                            latest_merged_doc, MetaFields.TOPO_TOKEN,
                            dangling_topo_token)
                        if merge_base is not None:
                            merge_happened = True
                            msg = "dangling fragment %s can be merged, path = %s" % (
                                dangling_topo_token,
                                json.dumps(merge_base_path))
                            log.debug(msg)
                            # No need to check the uuid: dangling uuids are
                            # always the same as the current uuid; otherwise
                            # the dangling fragments would have been cleared
                            trace_base, exist = lookup_dict_path(
                                meta[HBaseDefs.SINGLE_SRC_ATTR_TIME_TRACE],
                                merge_base_path)
                            assert exist
                            new_merged_fragment, merged_trace, meta, data_changed = \
                                self.merge_fragment(site, topic_id, merge_base, trace_base,
                                                    meta, dangling, download_time_d, url_d,
                                                    plugin_merge_funcs, merge_base_path)
                            del meta[HBaseDefs.SINGLE_SRC_DANGLING_FRAGMENTS][i]
                            break
                        else:
                            log.debug(
                                "dangling fragment %s cannot be merged" %
                                dangling_topo_token)
                    if not merge_happened:
                        log.debug("No more dangling fragment to process")
                        break

            self.put_merged_fragment(table_name, rk, latest_merged_doc, meta)
            return latest_merged_doc, latest_merged_doc_copy, rk, meta, data_changed, table_name
        else:
            self.save_meta(table_name, rk, meta)
            return give_up()
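
`find_target_subdoc` is the helper Example #3 leans on for topo-token merges: given the merged doc so far, it locates the sub-document carrying the incoming fragment's topo token and returns it together with its path, or `(None, None)` when the parent fragment has not arrived yet, which is what routes a fragment to the dangling list. A minimal depth-first sketch under those assumptions; the real helper's traversal order and path encoding (cf. `PathUtil`) may differ:

def find_target_subdoc(doc, token_field, token, path=None):
    # Depth-first search through nested dicts/lists for the sub-dict whose
    # `token_field` value equals `token`. Returns (subdoc, path) on a hit
    # and (None, None) otherwise.
    if path is None:
        path = []
    if isinstance(doc, dict):
        if doc.get(token_field) == token:
            return doc, path
        children = doc.items()
    elif isinstance(doc, list):
        children = enumerate(doc)
    else:
        return None, None
    for key, child in children:
        found, found_path = find_target_subdoc(child, token_field, token,
                                               path + [key])
        if found is not None:
            return found, found_path
    return None, None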