Пример #1
0
def write_cns_core(items, version, formats=("excel", "jsonld")):
    """Write the cns-core release files for ``version``.

    Args:
        items: list of dicts, one per cns-core entry. The first entry must
            contain every exported column listed in ``keys`` below.
        version: release identifier, used as the directory name under
            ``../data/releases/``.
        formats: collection of output formats to produce. Recognised values
            are ``"excel"`` and ``"jsonld"``; anything else is ignored.

    Raises:
        AssertionError: if the first item lacks one of the exported columns.

    Note:
        When ``"jsonld"`` is requested, every item's ``alternateName`` is
        replaced in place by ``split_string_by_comma`` of its current value.
    """
    name = "cns-core"

    # columns exported to the spreadsheet, in output order
    keys = [
        "category", "@id", "name", "description", "supersededBy", "nameZh",
        "descriptionZh", "alternateName", "wikidataName", "wikidataUrl",
        "wikipediaUrl", "schemaorgUrl"
    ]

    # validation: only the first item is inspected
    for key in keys:
        assert key in items[0], "missing column: {}".format(key)

    # write excel (done first, while alternateName is still the raw string)
    if "excel" in formats:
        filename = "../data/releases/{}/{}.xls".format(version, name)
        filename = file2abspath(filename, __file__)
        json2excel(items, keys, filename)

    # write json-ld
    if "jsonld" in formats:
        p = "alternateName"
        for item in items:
            item[p] = split_string_by_comma(item.get(p, ""))

        filename = "../data/releases/{}/{}.jsonld".format(version, name)
        filename = file2abspath(filename, __file__)
        output = {
            "@context": {
                "@vocab": "http://cnschema.org/"
            },
            "@graph": items
        }
        # data first, then filename: the argument order used by the other
        # json2file calls in this code base
        json2file(output, filename)
Пример #2
0
def write_cns_core(items, version):
    """Dump the cns-core items of a release as a JSON-LD file.

    Args:
        items: list of dicts; the first one must carry every required column.
        version: release identifier used in the output directory name.

    Raises:
        AssertionError: if the first item lacks a required column.

    Each item's ``alternateName`` is replaced in place by the result of
    ``split_string_by_comma`` before the document is written.
    """
    name = "cns-core"

    required = (
        "category", "@id", "name", "description", "supersededBy", "nameZh",
        "descriptionZh", "alternateName", "wikidataName", "wikidataUrl",
        "wikipediaUrl", "schemaorgUrl",
    )
    assert all(column in items[0].keys() for column in required)

    # The spreadsheet path is still resolved, but the excel export itself is
    # switched off in this version.
    xls_path = file2abspath(
        "../data/releases/{}/{}.xls".format(version, name), __file__)

    # json-ld: alternateName becomes the split form of its value
    prop = "alternateName"
    for entry in items:
        entry[prop] = split_string_by_comma(entry.get(prop, ""))

    jsonld_path = file2abspath(
        "../data/releases/{}/{}.jsonld".format(version, name), __file__)
    with codecs.open(jsonld_path, "w", encoding="utf-8") as stream:
        document = {
            "@context": {"@vocab": "http://cnschema.org/"},
            "@graph": items,
        }
        json.dump(document, stream, ensure_ascii=False, sort_keys=True,
                  indent=4)
0
    def init_mapping(self):
        """
        Register the suggest index template with Elasticsearch.

        Loads ``estemplate.suggest.json`` (path resolved through
        ``file2abspath`` against ``__file__``), stores it with
        ``indices.put_template`` under the name ``suggest2017`` and logs the
        response. Intended to be run once when Elasticsearch is installed.

        References:
        dynamic templates
        https://www.elastic.co/guide/en/elasticsearch/reference/5.4/dynamic-templates.html
        completion suggester mapping
        https://www.elastic.co/guide/en/elasticsearch/reference/5.5/search-suggesters-completion.html
        python api
        https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/index.html
        sample mapping
        https://gist.github.com/deverton/2970285
        """
        template_path = file2abspath("estemplate.suggest.json", __file__)
        with open(template_path) as template_file:
            body = json.load(template_file)

        # push the template through a connection obtained from self.connect()
        client = self.connect()
        response = client.indices.put_template("suggest2017", body=body)
        logging.info(response)
Пример #4
0
def task_cns_make_html(args=None):
    """Merge cnschema properties into the schema.org data and build the site.

    Steps:
      1. Read the cns-core JSON-LD items (version 3.2) and index them by
         their ``schemaorgUrl``.
      2. Load the schema.org data and update every entry, plus every node
         listed under its ``isDomainOf`` / ``isRangeOf``, with the matching
         cnschema fields selected by ``MAP_CNSCHEMA``.
      3. Pass the result through ``schemaorg2cnschema``, save it as
         ``cns-core.extend.json`` and run ``WebsiteV1`` on it.

    Args:
        args: unused.
    """
    version = "3.2"
    site = "cnschema.org"

    map_id_cnschema = {
        item["schemaorgUrl"]: item
        for item in read_cns_core_jsonld(version, path="data")
    }

    def cns_properties(node):
        # cnschema fields for the node's @id; an unknown @id falls back to {}
        return json_dict_copy(
            map_id_cnschema.get(node["@id"], {}), MAP_CNSCHEMA)

    # update map_id_schemaorg with cnschema properties
    map_id_schemaorg = Schemaorg(version).load_data()
    for entry in map_id_schemaorg.values():
        entry.update(cns_properties(entry))
        for p in ("isDomainOf", "isRangeOf"):
            for target in entry.get(p, []):
                target.update(cns_properties(target))

    # rewrite schema.org => cnschema.org
    map_id_schemaorg = schemaorg2cnschema(map_id_schemaorg)

    extend_path = file2abspath(
        '../local/releases/3.2/cns-core.extend.json', __file__)
    json2file(map_id_schemaorg, extend_path)

    WebsiteV1(version, site, map_id_schemaorg).run()
Пример #5
0
def task_cns_template(args=None):
    """Read the organization template workbook and log its bindings.

    Only the first non-empty sheet is processed. Every row is logged as JSON
    and stored under ``bindings[row["domain"]][row["name"]]``; the collected
    bindings are logged at the end.

    Args:
        args: unused.
    """
    # Field name -> spreadsheet column headers. Not referenced below; kept as
    # a record of the template layout.
    mapping = {
        "version": [u"版本"],
        "domain": [u"直属分类"],
        "name": [u"规范属性名"],
        "nameZh": [u"cnschema属性名"],
        "alternateName": [u"中文属性名"],
        "nameSchemaorg": [u"schema.org属性名"],
        "nameWikidata": [u"wikidata属性名"],
        "descriptionWikipedia": [u"wikipedia定义"],
        "range": [u"预期的属性类型"],
        "value": [u"example value"],
        "jsonld": [u"example json-ld"],
    }

    workbook_path = file2abspath(
        "../local/201707/cns-t-organization.xls", __file__)
    excel_data = excel2json(workbook_path)

    bindings = collections.defaultdict(dict)
    non_empty_sheets = (
        rows for rows in excel_data["data"].values() if len(rows) != 0)
    first_sheet = next(non_empty_sheets, None)
    if first_sheet is not None:
        logging.info(len(first_sheet))
        for row in first_sheet:
            logging.info(json.dumps(row, ensure_ascii=False))
            domain = row["domain"]
            prop_name = row[u"name"]
            # looked up but not used further; a row without "range" still
            # fails here
            range_list = row[u"range"]
            bindings[domain][prop_name] = row

    logging.info(bindings)
Пример #6
0
def read_cns_core(version):
    """Load the cns-core spreadsheet of a release and return cleaned rows.

    Args:
        version: release identifier; the workbook is read from
            ``../local/releases/<version>/cns-core.xls``.

    Returns:
        A list of new dicts, one per row of the first sheet returned by
        ``excel2json``. Fields whose name starts with ``#`` are dropped and
        ``description`` is passed through ``clean_schemaorg_description``.
    """
    name = "cns-core"
    filename = "../local/releases/{}/{}.xls".format(version, name)
    filename = file2abspath(filename, __file__)

    temp = excel2json(filename)
    # list(...) keeps this working on Python 3, where dict.values() is a
    # view that cannot be indexed.
    items = list(temp["data"].values())[0]
    logging.info(len(items))

    # cleanup
    items_new = []
    for item in items:
        item_new = {}
        for p, value in item.items():
            # skip commented fields
            if p.startswith("#"):
                continue

            if p == "description":
                value = clean_schemaorg_description(value)
            item_new[p] = value
        items_new.append(item_new)

    return items_new
Пример #7
0
def read_cns_core_jsonld(version, path="data"):
    """Return the ``@graph`` list of a release's cns-core JSON-LD file.

    Args:
        version: release identifier (directory name under ``releases``).
        path: top-level directory that holds ``releases``.
    """
    relative = "../{}/releases/{}/{}.jsonld".format(path, version, "cns-core")
    document = file2json(file2abspath(relative, __file__))
    return document["@graph"]
Пример #8
0
def read_cns_core_excel(version, path="local"):
    """Return the rows of the first sheet of a release's cns-core workbook.

    Args:
        version: release identifier (directory name under ``releases``).
        path: top-level directory that holds ``releases``.

    Returns:
        The row list of the first sheet in ``excel2json(...)["data"]``.
    """
    name = "cns-core"
    filename = "../{}/releases/{}/{}.xls".format(path, version, name)
    filename = file2abspath(filename, __file__)

    temp = excel2json(filename)
    # list(...) keeps this working on Python 3, where dict.values() is a
    # view that cannot be indexed.
    items = list(temp["data"].values())[0]
    logging.info(len(items))

    return items
Пример #9
0
def task_superclasses(args):
    """Build and save the class -> superclass list mapping for release 3.2.

    The pairs collected by ``loadmapping`` from ``schema_taxonomy.json`` are
    turned into a dict whose key is each pair's ``"to"`` value. The list for
    a key starts with the key itself (appended once per pair that targets
    it), followed by every entry of the pairs' ``"path"`` not already listed.
    The mapping is logged and written to ``schema.superclass.json``.

    Args:
        args: unused.
    """
    taxonomy_path = file2abspath(
        "../local/releases/3.2/schema_taxonomy.json", __file__)
    pairs = []
    loadmapping(file2json(taxonomy_path), [], pairs)
    logging.info(json.dumps(pairs, indent=4, ensure_ascii=False))

    mapping = collections.defaultdict(list)

    # pass 1: each class is listed under its own key
    for pair in pairs:
        target = pair["to"]
        mapping[target].append(target)

    # pass 2: add the parents from the path, skipping duplicates
    for pair in pairs:
        lineage = mapping[pair["to"]]
        for parent in pair["path"]:
            if parent not in lineage:
                lineage.append(parent)

    logging.info(json.dumps(mapping, indent=4, ensure_ascii=False))
    output_path = file2abspath(
        "../data/releases/3.2/schema.superclass.json", __file__)
    json2file(mapping, output_path)
Пример #10
0
    def copy_website_base(self):
        """Copy every file under ``../website`` into ``self.dir_output``.

        The directory tree is walked with ``os.walk``. Each file keeps its
        path relative to the website directory, the relative path is printed,
        and ``create_dir_if_not_exist`` is called on the destination before
        the copy.
        """
        from shutil import copyfile

        filepath = "../website"
        filepath = file2abspath(filepath, __file__)

        for path, _subdirs, files in os.walk(filepath):
            for name in files:
                filename_in = os.path.join(path, name)
                # strip "<filepath>/" to get the path relative to the
                # website directory
                name_x = filename_in[len(filepath) + 1:]
                filename = os.path.join(self.dir_output, name_x)
                # call form of print: same output on Python 2, and valid
                # syntax on Python 3
                print(name_x)
                create_dir_if_not_exist(filename)
                copyfile(filename_in, filename)
Пример #11
0
def read_cns_core_excel(version, path="data"):
    """Read the translation workbook of a release and return its rows.

    Version ``"3.2"`` is delegated to ``read_cns_core_excel_v1``. For any
    other version the sheet named after the version is read from
    ``schemaorg_translate.xlsx`` and every row gets an ``@id`` built from its
    ``name``.

    Args:
        version: release identifier; also the sheet name looked up.
        path: top-level directory that holds ``releases``.

    Returns:
        The list of row dicts (modified in place).
    """
    if version == "3.2":
        return read_cns_core_excel_v1(version, path)

    workbook_path = file2abspath(
        "../{}/releases/{}/{}.xlsx".format(path, version,
                                           "schemaorg_translate"),
        __file__)
    workbook = excel2json(workbook_path)

    # looked up but not used afterwards
    fields = workbook["fields"][version]
    rows = workbook["data"][version]

    # enhance with @id
    for row in rows:
        row["@id"] = "http://cnschema.org/{}".format(row["name"])
    logging.info(len(rows))

    return rows
Пример #12
0
def read_cns_core_excel(version, path="data"):
    """Return the rows of the ``schemaorg_translate`` sheet for ``version``.

    Version ``"3.2"`` is handled by ``read_cns_core_excel_v1``. Otherwise the
    sheet named ``version`` is taken from
    ``../<path>/releases/<version>/schemaorg_translate.xlsx`` and each row
    receives an ``@id`` of the form ``http://cnschema.org/<name>``.

    Source spreadsheet:
    https://docs.google.com/spreadsheets/d/1mpiBxI5rK_qs86IpbXgN1xbhrxS_VYF0XjI_fcRpl00/edit#gid=364353024

    Args:
        version: release identifier; also the sheet name looked up.
        path: top-level directory that holds ``releases``.

    Returns:
        The list of row dicts (modified in place).
    """
    if version == "3.2":
        return read_cns_core_excel_v1(version, path)

    book_name = "schemaorg_translate"
    relative = "../{}/releases/{}/{}.xlsx".format(path, version, book_name)
    book = excel2json(file2abspath(relative, __file__))

    # column list of the sheet; looked up but not used afterwards
    sheet_fields = book["fields"][version]
    sheet_rows = book["data"][version]

    # enhance with @id
    for entry in sheet_rows:
        entry["@id"] = "http://cnschema.org/{}".format(entry["name"])
    logging.info(len(sheet_rows))

    return sheet_rows
Пример #13
0
    def test_excel2json(self):
        """Check that ``excel2json`` reads back what ``json2excel`` wrote.

        The fixture workbook ``ex2.xls`` is created on first use. The result
        must have two top-level entries, a single sheet with two rows, and
        the field list in the order it was written.
        """
        filename = "ex2.xls"
        filename = file2abspath(filename, __file__)

        if not os.path.exists(filename):
            # create the fixture workbook
            input_data = [{
                "name": u"张三",
                u"年龄": 18
            }, {
                "name": u"李四",
                "notes": u"this is li si",
                u"年龄": 18
            }]
            json2excel(input_data, ["name", u"年龄", "notes"], filename)

        output_data = excel2json(filename)
        assert len(output_data) == 2
        assert len(output_data["data"]) == 1
        # list(...) because dict.values() cannot be indexed on Python 3
        first_sheet_rows = list(output_data["data"].values())[0]
        first_sheet_fields = list(output_data["fields"].values())[0]
        assert len(first_sheet_rows) == 2
        assert first_sheet_fields == ["name", u"年龄", "notes"]
Пример #14
0
 def __init__(self, version):
     """Remember the release version, the download base URL and the
     output directory (``../local/releases/<version>``, resolved through
     ``file2abspath`` against ``__file__``)."""
     self.version = version
     self.url_base = "https://github.com/schemaorg/schemaorg/raw/sdo-callisto"
     relative_output = "../local/releases/{}".format(version)
     self.dir_output = file2abspath(relative_output, __file__)
Пример #15
0
 def __init__(self):
     """Load the Elasticsearch settings from ``es.cns.json``.

     The file name is resolved through ``file2abspath`` against ``__file__``
     and its parsed JSON content is stored in ``self.es_config``.
     ``self.conn`` starts out as ``None``.
     """
     filename = "es.cns.json"
     filename = file2abspath(filename, __file__)
     # the with-block closes the config file instead of leaking the handle
     with open(filename) as f:
         self.es_config = json.load(f)
     self.conn = None
Пример #16
0
            if p in node:
                node_id = node[p]["@id"]
                the_node = self.map_id_node[node_id]
                the_node["_supersede"] = self._copy_node(node, PLIST_REF)
                #logging.info(the_node)
                #exit()


def task_init(args=None):
    """Load the schema.org 3.2 data, run ``stat`` over its values for the
    ``_group`` and ``@type`` fields, and log how many entries were loaded.

    Args:
        args: unused.
    """
    loaded = Schemaorg("3.2").load_data()
    stat(loaded.values(), [], ["_group", "@type"])
    logging.info(len(loaded))


if __name__ == "__main__":
    # Script entry point: configure logging, install the HTTP cache, then
    # hand over to main_subtask.
    # Each log record is prefixed with level, timestamp, module, function
    # name and line number.
    logging.basicConfig(
        format=
        '[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s',
        level=logging.INFO)
    # Limit the "requests" logger to warnings and above.
    logging.getLogger("requests").setLevel(logging.WARNING)

    # Install the requests_cache cache at ../local/cache (resolved through
    # file2abspath against __file__).
    filename = '../local/cache'
    filename = file2abspath(filename, __file__)
    requests_cache.install_cache(filename)

    # NOTE(review): main_subtask presumably dispatches to the task_* function
    # named on the command line (see the usage note below) -- confirm.
    main_subtask(__name__)
"""
    python schemaorg.py task_init
"""
Пример #17
0
    def _load_item_data(self, version):
        """Yield one Elasticsearch bulk action per cns-core item.

        Reads the ``@graph`` of ``../data/releases/<version>/cns-core.jsonld``
        and adds these fields to every item in place before yielding it:

        * ``id``: ``any2sha1`` of the item's ``@id``
        * ``index_wildcard``: concatenation of the non-empty ``name`` and
          ``nameZh`` values
        * ``index_search``: concatenation of the non-empty name, description
          and ``wikidataName`` values
        * ``index_suggest``: ``{"input": [...]}`` with the same values as
          ``index_wildcard``, unjoined

        Text values have markup tags and http(s) URLs stripped before they
        are indexed.

        Args:
            version: release identifier used to locate the JSON-LD file.

        Yields:
            dict with ``_id``, ``_index``, ``_type`` (both taken from
            ``self.es_config``) and ``_source`` (the enriched item).
        """
        # load cns-core data
        filename = "../data/releases/{}/cns-core.jsonld".format(version)
        filename = file2abspath(filename, __file__)
        items = file2json(filename)["@graph"]
        logging.info(len(items))

        fields_index_suggest = ("name", "nameZh")
        fields_index_search = ("name", "nameZh", "description",
                               "descriptionZh", "wikidataName")

        # Compiled once instead of once per value. Plain raw strings are used
        # because the ur"" prefix is a syntax error on Python 3; the patterns
        # are ASCII-only, so they still match unicode text on Python 2.
        re_markup = re.compile(r"<[^>]+>")
        # The scheme suffix was "[s|S]?", which also accepted a literal "|"
        # (inside a character class "|" is not alternation).
        re_url = re.compile(r"[hH][tT][tT][pP][sS]?://[\S]+")
        # unicode on Python 2, str on Python 3
        text_type = type(u"")

        es_index = self.es_config["es_index"]
        es_type = self.es_config["es_type"]

        for item in items:
            # collect the values that feed the suggest / search fields
            index_suggest = []
            index_search = []
            for p, v in item.items():
                if not v:
                    continue

                vx = v
                if isinstance(v, text_type):
                    # remove markups
                    vx = re_markup.sub("", vx)
                    # remove url in description
                    vx = re_url.sub("", vx)

                if p in fields_index_suggest:
                    index_suggest.append(vx)

                if p in fields_index_search:
                    index_search.append(vx)

            item["id"] = any2sha1(item["@id"])

            item["index_wildcard"] = u"".join(index_suggest)
            item["index_search"] = u"".join(index_search)

            if len(item["index_wildcard"]) == 0:
                # NOTE(review): an item without any usable name text stops
                # the whole run via exit() (SystemExit) after being logged;
                # kept as in the original.
                logging.info(json.dumps(item, indent=4))
                exit()

            item["index_suggest"] = {
                "input": index_suggest,
            }

            yield {
                "_id": item["id"],
                "_index": es_index,
                "_type": es_type,
                "_source": item
            }