def write_cns_core(items, version, formats=("excel", "jsonld")):
    """Write the cns-core release data as Excel and/or JSON-LD files.

    Args:
        items: list of term dicts; each must carry every key in `keys`.
        version: release version string, used in the output paths.
        formats: which outputs to produce, any of "excel" / "jsonld".
            NOTE: the default is now an immutable tuple instead of the
            original mutable list default (same values, same membership
            semantics, no shared-mutable-default hazard).
    """
    name = "cns-core"

    # column order for the Excel output; doubles as the required-field list
    keys = [
        "category",
        "@id",
        "name",
        "description",
        "supersededBy",
        "nameZh",
        "descriptionZh",
        "alternateName",
        "wikidataName",
        "wikidataUrl",
        "wikipediaUrl",
        "schemaorgUrl",
    ]

    # validation: the first item must expose every expected field
    for key in keys:
        assert key in items[0].keys()

    # write excel
    if "excel" in formats:
        filename = "../data/releases/{}/{}.xls".format(version, name)
        filename = file2abspath(filename, __file__)
        json2excel(items, keys, filename)

    # write json-ld
    if "jsonld" in formats:
        for item in items:
            p = "alternateName"
            # normalize the comma-separated alternateName field into a list
            item[p] = split_string_by_comma(item.get(p, ""))
        filename = "../data/releases/{}/{}.jsonld".format(version, name)
        filename = file2abspath(filename, __file__)
        output = {
            "@context": {
                "@vocab": "http://cnschema.org/"
            },
            "@graph": items
        }
        json2file(filename, output)
def write_cns_core(items, version):
    """Validate cns-core term fields and emit the JSON-LD release file.

    NOTE: the Excel output path is still computed, but the actual
    json2excel() write is currently disabled (commented out).
    """
    name = "cns-core"

    # every item must expose these fields; checked against the first item
    required_fields = [
        "category",
        "@id",
        "name",
        "description",
        "supersededBy",
        "nameZh",
        "descriptionZh",
        "alternateName",
        "wikidataName",
        "wikidataUrl",
        "wikipediaUrl",
        "schemaorgUrl",
    ]
    for field in required_fields:
        assert field in items[0].keys()

    xls_path = file2abspath(
        "../data/releases/{}/{}.xls".format(version, name), __file__)
    #json2excel(items, required_fields, xls_path)

    # write json-ld: normalize comma-separated alternateName into a list
    for item in items:
        item["alternateName"] = split_string_by_comma(
            item.get("alternateName", ""))

    jsonld_path = file2abspath(
        "../data/releases/{}/{}.jsonld".format(version, name), __file__)
    document = {
        "@context": {
            # "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            # "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
            "@vocab": "http://cnschema.org/"
        },
        "@graph": items
    }
    with codecs.open(jsonld_path, "w", encoding="utf-8") as f:
        json.dump(document, f, ensure_ascii=False, sort_keys=True, indent=4)
def init_mapping(self):
    """Install the Elasticsearch index template used for suggestion search.

    Loads the JSON template bundled next to this module and registers it
    via the indices API, so dynamic templates / completion-suggester
    mappings apply to indices created afterwards.

    References:
      dynamic templates
        https://www.elastic.co/guide/en/elasticsearch/reference/5.4/dynamic-templates.html
      completion suggester
        https://www.elastic.co/guide/en/elasticsearch/reference/5.5/search-suggesters-completion.html
      python client api
        https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/index.html
      sample mapping
        https://gist.github.com/deverton/2970285
    """
    tpl_name = "suggest2017"
    tpl_path = file2abspath("estemplate.suggest.json", __file__)
    with open(tpl_path) as fp:
        tpl_body = json.load(fp)

    client = self.connect()
    response = client.indices.put_template(tpl_name, body=tpl_body)
    logging.info(response)
def task_cns_make_html(args=None):
    """Merge cnschema translations into schema.org data and build the site.

    Reads the cns-core JSON-LD release, overlays its fields onto the
    schema.org term map (top level plus isDomainOf/isRangeOf targets),
    rewrites schema.org ids to cnschema.org, dumps the merged map, and
    renders the website.
    """
    name = "cns-core"
    version = "3.2"
    site = "cnschema.org"

    items = read_cns_core_jsonld(version, path="data")
    # index cnschema items by their schema.org URL for overlay lookup
    map_id_cnschema = {}
    for item in items:
        schemaorg_id = item["schemaorgUrl"]
        map_id_cnschema[schemaorg_id] = item

    # update map_id_schemaorg with cnschema properties
    so = Schemaorg(version)
    map_id_schemaorg = so.load_data()
    for entry in map_id_schemaorg.values():
        entry.update(
            json_dict_copy(map_id_cnschema.get(entry["@id"], {}),
                           MAP_CNSCHEMA))
        for p in ["isDomainOf", "isRangeOf"]:
            target_list = entry.get(p, [])
            for target in target_list:
                target.update(
                    json_dict_copy(map_id_cnschema.get(target["@id"], {}),
                                   MAP_CNSCHEMA))

    # rewrite schema.org => cnschema.org
    # BUGFIX: original read `items_new = schemaorg2cnschema(items_new)`,
    # using items_new before assignment (NameError); rewrite `items`.
    items = schemaorg2cnschema(items)
    map_id_schemaorg = schemaorg2cnschema(map_id_schemaorg)

    # use `version` instead of a hard-coded "3.2" (same value today)
    filename = '../local/releases/{}/{}.extend.json'.format(version, name)
    filename = file2abspath(filename, __file__)
    json2file(map_id_schemaorg, filename)

    website = WebsiteV1(version, site, map_id_schemaorg)
    website.run()
def task_cns_template(args=None):
    """Load a cnschema template spreadsheet and group rows by domain/property.

    Reads cns-t-organization.xls and builds `bindings[domain][propName] = row`.
    NOTE(review): `mapping` (English field name -> Chinese column header) is
    built but never used here — presumably intended for header translation;
    confirm before removing.
    """
    # English field name -> candidate Chinese column headers in the sheet
    mapping = {
        "version": [u"版本"],
        "domain": [u"直属分类"],
        "name": [u"规范属性名"],
        "nameZh": [u"cnschema属性名"],
        "alternateName": [u"中文属性名"],
        "nameSchemaorg": [u"schema.org属性名"],
        "nameWikidata": [u"wikidata属性名"],
        "descriptionWikipedia": [u"wikipedia定义"],
        "range": [u"预期的属性类型"],
        "value": [u"example value"],
        "jsonld": [u"example json-ld"],
    }

    filename = "../local/201707/cns-t-organization.xls"
    filename = file2abspath(filename, __file__)
    excel_data = excel2json(filename)

    # bindings[domain][propName] = full source row
    bindings = collections.defaultdict(dict)
    for sheet_data in excel_data["data"].values():
        # skip empty sheets
        if len(sheet_data) == 0:
            continue
        logging.info(len(sheet_data))
        for row in sheet_data:
            logging.info(json.dumps(row, ensure_ascii=False))
            domain = row["domain"]  # originally row[u"直属分类"]
            propName = row[u"name"]
            rangeList = row[u"range"]  # NOTE(review): read but unused
            bindings[domain][propName] = row
            # NOTE(review): stops after the first row of each sheet — looks
            # like a debugging shortcut; confirm whether all rows should be
            # processed.
            break
    logging.info(bindings)
def read_cns_core(version):
    """Load the cns-core Excel sheet for *version* and return cleaned rows.

    Fields whose name starts with "#" are treated as commented out and
    dropped; the "description" field is passed through
    clean_schemaorg_description().
    """
    name = "cns-core"
    filename = file2abspath(
        "../local/releases/{}/{}.xls".format(version, name), __file__)
    workbook = excel2json(filename)
    keys = workbook["fields"].values()[0]  # header row of first sheet (unused)
    rows = workbook["data"].values()[0]
    logging.info(len(rows))

    # cleanup
    cleaned = []
    for row in rows:
        out = {}
        for field in row.keys():
            # skip commented fields
            if field.startswith("#"):
                continue
            value = row.get(field, "")
            if field == "description":
                value = clean_schemaorg_description(value)
            out[field] = value
        #if field == "@id":
        #    out["schemaorgUrl"] = out[field]
        #    out[field] = re.sub("http://schema.org", "http://cnschema.org", out[field])
        cleaned.append(out)
    #keys = [x for x in keys if x in cleaned[0].keys()]
    return cleaned
def read_cns_core_jsonld(version, path="data"):
    """Return the @graph item list from the cns-core JSON-LD release.

    Args:
        version: release version string (path component).
        path: top-level data directory, "data" by default.
    """
    name = "cns-core"
    jsonld_path = file2abspath(
        "../{}/releases/{}/{}.jsonld".format(path, version, name), __file__)
    document = file2json(jsonld_path)
    return document["@graph"]
def read_cns_core_excel(version, path="local"):
    """Load the cns-core spreadsheet and return the rows of its first sheet."""
    name = "cns-core"
    xls_path = file2abspath(
        "../{}/releases/{}/{}.xls".format(path, version, name), __file__)
    parsed = excel2json(xls_path)
    keys = parsed["fields"].values()[0]  # header row (kept for parity, unused)
    rows = parsed["data"].values()[0]
    logging.info(len(rows))
    return rows
def task_superclasses(args):
    """Build type -> [type, ancestors...] lists from the schema taxonomy.

    Flattens schema_taxonomy.json into pairs via loadmapping(), records
    each type followed by the ancestors on its path (deduplicated), and
    writes the result to schema.superclass.json.
    """
    src = file2abspath("../local/releases/3.2/schema_taxonomy.json", __file__)
    taxonomy = file2json(src)

    pairs = []
    loadmapping(taxonomy, [], pairs)
    logging.info(json.dumps(pairs, indent=4, ensure_ascii=False))

    superclasses = collections.defaultdict(list)
    # pass 1: every type lists itself first (order matters for the output)
    for pair in pairs:
        superclasses[pair["to"]].append(pair["to"])
    # pass 2: append each ancestor on the path, skipping duplicates
    for pair in pairs:
        bucket = superclasses[pair["to"]]
        for ancestor in pair["path"]:
            if ancestor not in bucket:
                bucket.append(ancestor)
    logging.info(json.dumps(superclasses, indent=4, ensure_ascii=False))

    dst = file2abspath("../data/releases/3.2/schema.superclass.json", __file__)
    json2file(superclasses, dst)
def copy_website_base(self):
    """Recursively copy the bundled ../website tree into self.dir_output.

    Directory structure is preserved; missing output directories are
    created on demand via create_dir_if_not_exist().
    """
    # from github
    #url = "https://github.com/schemaorg/schemaorg/tree/master/docs"
    from shutil import copyfile

    filepath = "../website"
    filepath = file2abspath(filepath, __file__)
    for path, subdirs, files in os.walk(filepath):
        for name in files:
            filename_in = os.path.join(path, name)
            # path relative to the website root
            name_x = filename_in[len(filepath) + 1:]
            filename = os.path.join(self.dir_output, name_x)
            # BUGFIX: was a bare Python-2 `print name_x`; use logging for
            # consistency with the rest of the module (and py3 safety)
            logging.info(name_x)
            create_dir_if_not_exist(filename)
            copyfile(filename_in, filename)
def read_cns_core_excel(version, path="data"):
    """Load translated schema.org terms for *version*, adding an @id per item.

    Version "3.2" uses the legacy layout and is delegated to the v1 reader.
    """
    if version == "3.2":
        return read_cns_core_excel_v1(version, path)

    name = "schemaorg_translate"
    xlsx_path = file2abspath(
        "../{}/releases/{}/{}.xlsx".format(path, version, name), __file__)
    workbook = excel2json(xlsx_path)
    keys = workbook["fields"][version]  # header row (unused)
    rows = workbook["data"][version]

    # enhance with @id derived from the term name
    for row in rows:
        row["@id"] = "http://cnschema.org/{}".format(row["name"])
    logging.info(len(rows))
    return rows
def read_cns_core_excel(version, path="data"):
    """Read translated schema.org terms from the release xlsx, with @id added.

    Version "3.2" predates this layout and is handled by the v1 reader.
    Source sheet:
    https://docs.google.com/spreadsheets/d/1mpiBxI5rK_qs86IpbXgN1xbhrxS_VYF0XjI_fcRpl00/edit#gid=364353024
    """
    if version == "3.2":
        return read_cns_core_excel_v1(version, path)

    name = "schemaorg_translate"
    source = file2abspath(
        "../{}/releases/{}/{}.xlsx".format(path, version, name), __file__)
    sheet = excel2json(source)
    keys = sheet["fields"][version]  # column headers (unused here)
    terms = sheet["data"][version]

    # enhance with @id built from each term's name
    for term in terms:
        term["@id"] = "http://cnschema.org/{}".format(term["name"])
    logging.info(len(terms))
    return terms
def test_excel2json(self):
    """Round-trip check: write two rows via json2excel, read via excel2json."""
    xls_path = file2abspath("ex2.xls", __file__)
    if not os.path.exists(xls_path):
        # seed the fixture file on first run
        seed_rows = [
            {
                "name": u"张三",
                u"年龄": 18
            },
            {
                "name": u"李四",
                "notes": u"this is li si",
                u"年龄": 18
            },
        ]
        json2excel(seed_rows, ["name", u"年龄", "notes"], xls_path)

    result = excel2json(xls_path)
    # top level carries exactly "data" and "fields"
    assert len(result) == 2
    # a single sheet holding both rows
    assert len(result["data"]) == 1
    assert len(result["data"].values()[0]) == 2
    # column order must be preserved
    assert result["fields"].values()[0] == ["name", u"年龄", "notes"]
def __init__(self, version):
    """Record the schema.org release *version* and resolve paths/URLs.

    Attributes set: version, url_base (upstream raw-content base for the
    sdo-callisto branch), dir_output (local release directory).
    """
    self.version = version
    self.url_base = "https://github.com/schemaorg/schemaorg/raw/sdo-callisto"
    self.dir_output = file2abspath(
        "../local/releases/{}".format(version), __file__)
def __init__(self):
    """Load Elasticsearch connection settings from es.cns.json.

    Sets self.es_config (parsed JSON config) and self.conn (lazy
    connection placeholder, None until first use).
    """
    filename = "es.cns.json"
    filename = file2abspath(filename, __file__)
    # BUGFIX: json.load(open(filename)) leaked the file handle;
    # use a context manager so it is closed deterministically
    with open(filename) as f:
        self.es_config = json.load(f)
    self.conn = None
        # NOTE(review): continuation of a method whose start is outside this
        # view — records the superseding node's copied fields on the node it
        # supersedes; indentation reconstructed, confirm against full file.
        if p in node:
            node_id = node[p]["@id"]
            the_node = self.map_id_node[node_id]
            the_node["_supersede"] = self._copy_node(node, PLIST_REF)
            #logging.info(the_node)
            #exit()


def task_init(args=None):
    """Load schema.org 3.2 data and log group/type statistics."""
    so = Schemaorg("3.2")
    data = so.load_data()
    stat(data.values(), [], ["_group", "@type"])
    logging.info(len(data))


if __name__ == "__main__":
    logging.basicConfig(
        format=
        '[%(levelname)s][%(asctime)s][%(module)s][%(funcName)s][%(lineno)s] %(message)s',
        level=logging.INFO)
    # silence noisy HTTP request logging
    logging.getLogger("requests").setLevel(logging.WARNING)

    # cache all HTTP responses on disk to speed up repeated runs
    filename = '../local/cache'
    filename = file2abspath(filename, __file__)
    requests_cache.install_cache(filename)

    # dispatch to the task_* function named on the command line
    main_subtask(__name__)

"""
    python schemaorg.py task_init
"""
def _load_item_data(self, version):
    """Yield Elasticsearch bulk-index actions for cns-core terms.

    Reads the cns-core JSON-LD release for *version* and, for each term,
    derives index fields:
      - index_suggest: completion-suggester inputs (name / nameZh)
      - index_wildcard: concatenation of the suggest inputs
      - index_search: concatenation of the searchable text fields
      - id: sha1 of the term's @id (via any2sha1)

    Yields dicts shaped for elasticsearch.helpers.bulk(): _id, _index,
    _type, _source. Python 2 code: relies on `unicode` and `ur"..."`.
    """
    # load cns-core data
    filename = "../data/releases/{}/cns-core.jsonld".format(version)
    filename = file2abspath(filename, __file__)
    items = file2json(filename)["@graph"]
    logging.info(len(items))

    # NOTE(review): "fileds" is a typo for "fields"; kept as-is (code)
    fileds_index_suggest = ["name", "nameZh"]
    fileds_index_search = [
        "name", "nameZh", "description", "descriptionZh", "wikidataName"
    ]
    # fields copied into the (currently unused) suggest payload
    fields_suggest_payload = [
        "@id", "name", "nameZh", "description", "descriptionZh",
        "wikidataName", "wikidataUrl", "wikipediaUrl"
    ]

    es_index = self.es_config["es_index"]
    es_type = self.es_config["es_type"]
    for item in items:
        # build the derived suggestion/search fields for this term
        index_suggest = []
        index_search = []
        suggest_payload = {}
        for p, v in item.items():
            if p in fields_suggest_payload:
                suggest_payload[p] = v
            if v:
                vx = v
                if isinstance(v, unicode):
                    # strip HTML-like markup
                    vx = re.sub(ur"<[^>]+>", "", vx)
                    # strip URLs embedded in descriptions
                    vx = re.sub(ur"[hH][tT][tT][pP][s|S]?://[\S]+", "", vx)
                if p in fileds_index_suggest:
                    index_suggest.append(vx)
                if p in fileds_index_search:
                    index_search.append(vx)
        # stable document id derived from the term's @id
        item["id"] = any2sha1(item["@id"])
        #logging.info(item["id"])
        item["index_wildcard"] = u"".join(index_suggest)
        item["index_search"] = u"".join(index_search)
        # NOTE(review): hard exit() on a term with no suggest text —
        # treats it as unrecoverable bad data; confirm this is intended
        if len(item["index_wildcard"]) == 0:
            logging.info(json.dumps(item, indent=4))
            exit()
        item["index_suggest"] = {
            "input": index_suggest,
            #"output": u"{}({})".format(item["name"],item["nameZh"]),
            #"payload" : suggest_payload,
        }
        yield {
            "_id": item["id"],
            "_index": es_index,
            "_type": es_type,
            "_source": item
        }