Пример #1
0
def read_cns_core_jsonld(version, path="data"):
    """Return the ``@graph`` list of a cns-core JSON-LD release file.

    The file ``../<path>/releases/<version>/cns-core.jsonld`` is resolved
    relative to this source file via ``file2abspath`` and parsed with
    ``file2json``.

    Args:
        version: release identifier used as a directory name in the path.
        path: name of the directory that holds ``releases``; defaults to
            ``"data"``.

    Returns:
        The value stored under the ``"@graph"`` key of the parsed document.
    """
    relative_path = "../{}/releases/{}/{}.jsonld".format(path, version, "cns-core")
    absolute_path = file2abspath(relative_path, __file__)
    return file2json(absolute_path)["@graph"]
Пример #2
0
    def load_data(self):
        """Return the node map, reading it from the on-disk cache when present.

        The cache is ``schemaorg.json`` inside ``self.dir_output``.  If that
        file exists its parsed content is returned as-is.  Otherwise the three
        ``_init_*`` steps are run in order, ``self.map_id_node`` is written to
        the cache file, and ``self.map_id_node`` is returned.
        """
        cache_path = os.path.join(self.dir_output, "schemaorg.json")

        if not os.path.exists(cache_path):
            # Order matters: the schema step uses information from the
            # examples and from the 2015 word-count stats.
            self._init_examples()
            self._init_stat2015()
            self._init_schema()

            json2file(self.map_id_node, cache_path)
            return self.map_id_node

        return file2json(cache_path)
Пример #3
0
def task_superclasses(args):
    """Build the class -> superclasses mapping for release 3.2 and save it.

    Loads ``../local/releases/3.2/schema_taxonomy.json`` (relative to this
    source file), lets ``loadmapping`` fill a list of pairs from it, and
    writes the resulting mapping to
    ``../data/releases/3.2/schema.superclass.json``.

    Each pair is read through its ``"to"`` and ``"path"`` keys.  The mapping
    is keyed by ``"to"``; every value is a list that starts with the key
    itself followed by the entries of the matching ``"path"`` values, in
    first-seen order and without duplicates.

    Args:
        args: not used by this function (presumably kept so all task
            functions share one signature -- confirm with the dispatcher).
    """
    filename = "../local/releases/3.2/schema_taxonomy.json"
    filename = file2abspath(filename, __file__)
    data = file2json(filename)
    pairs = []
    loadmapping(data, [], pairs)
    logging.info(json.dumps(pairs, indent=4, ensure_ascii=False))

    mapping = collections.defaultdict(list)
    for pair in pairs:
        key = pair["to"]
        superclasses = mapping[key]

        # Several pairs may share the same "to"; add the key only once so a
        # class is not listed repeatedly in its own superclass list.
        if key not in superclasses:
            superclasses.append(key)

        for parent in pair["path"]:
            if parent not in superclasses:
                superclasses.append(parent)

    logging.info(json.dumps(mapping, indent=4, ensure_ascii=False))
    filename = "../data/releases/3.2/schema.superclass.json"
    filename = file2abspath(filename, __file__)
    json2file(mapping, filename)
Пример #4
0
    def _load_item_data(self, version):
        """Yield one index action per item of a cns-core release.

        Reads ``../data/releases/<version>/cns-core.jsonld`` (resolved
        relative to this source file) and walks its ``@graph`` list.  Each
        item is extended in place with:

        * ``id`` -- ``any2sha1`` of the item's ``@id``;
        * ``index_wildcard`` -- concatenation of the non-empty ``name`` and
          ``nameZh`` values;
        * ``index_search`` -- concatenation of the non-empty ``name``,
          ``nameZh``, ``description``, ``descriptionZh`` and ``wikidataName``
          values;
        * ``index_suggest`` -- ``{"input": [...]}`` holding the same values
          that make up ``index_wildcard``.

        ``unicode`` values have markup tags and http(s) URLs stripped before
        they are collected.

        Args:
            version: release identifier substituted into the file path.

        Yields:
            dict with ``_id``, ``_index``, ``_type`` and ``_source`` keys; the
            index and type names come from ``self.es_config``.  (The shape
            looks like an Elasticsearch bulk action -- confirm with caller.)

        Raises:
            SystemExit: when an item yields an empty ``index_wildcard``; the
                offending item is logged first.
        """
        # load cns-core data
        filename = "../data/releases/{}/cns-core.jsonld".format(version)
        filename = file2abspath(filename, __file__)
        items = file2json(filename)["@graph"]
        logging.info(len(items))

        fields_index_suggest = ("name", "nameZh")
        fields_index_search = ("name", "nameZh", "description", "descriptionZh", "wikidataName")

        # Compiled once here instead of on every property of every item.
        re_markup = re.compile(u"<[^>]+>")
        # "[sS]", not "[s|S]": inside a character class "|" is a literal
        # character, not alternation, so it would also match "http|://".
        re_url = re.compile(u"[hH][tT][tT][pP][sS]?://\\S+")

        es_index = self.es_config["es_index"]
        es_type = self.es_config["es_type"]

        for item in items:
            index_suggest = []
            index_search = []

            for p, v in item.items():
                if not v:
                    continue

                vx = v
                if isinstance(v, unicode):
                    # remove markups, then remove urls (e.g. in description)
                    vx = re_markup.sub(u"", vx)
                    vx = re_url.sub(u"", vx)

                if p in fields_index_suggest:
                    index_suggest.append(vx)

                if p in fields_index_search:
                    index_search.append(vx)

            item["id"] = any2sha1(item["@id"])

            item["index_wildcard"] = u"".join(index_suggest)
            item["index_search"] = u"".join(index_search)

            if not item["index_wildcard"]:
                logging.info(json.dumps(item, indent=4))
                # Same effect as exit(), without relying on the helper that
                # the site module injects into builtins.
                raise SystemExit

            item["index_suggest"] = {
                "input": index_suggest,
            }

            yield {
                "_id": item["id"],
                "_index": es_index,
                "_type": es_type,
                "_source": item
            }