def get_crawler_gamename(games_to_insert, logger):
    crawler_infos = []
    gamename_mapping = get_mapping_gamename()
    logger.debug("has %d pkgs in mapping" % len(gamename_mapping))
    for key in games_to_insert:
        key_info = key.split('<>')
        area = key_info[0]
        company = key_info[1]
        pkg_name = key_info[2]
        if pkg_name not in gamename_mapping:
            api = login(logger)
            info = get_info(api, pkg_name, logger)
            info = api.toDict(info)
            info = str(info)
            info = info.replace("true", "True")
            info = info.replace("false", "False")
            info = eval(str(info))
            doc = info["docV2"]
            game_name = doc["title"]
            game_name = ftoj(game_name)  # convert Traditional Chinese title to Simplified
            gamename_mapping[pkg_name] = game_name
            info = {
                "game_name": game_name,
                "pkg_name": pkg_name
            }
            crawler_infos.append(info)
    logger.debug("should insert %d pkgs in mapping" % len(crawler_infos))
    return crawler_infos

def get_terms():
    scrapped_terms = (term for func in term_scrapers
                      for term in func() if len(term) > 1)
    # known_terms = (term for lexicon_type in known_filenames
    #                for term in get_terms_from_file(lexicon_type) if len(term) > 1)
    known_terms = []
    terms = itertools.chain.from_iterable([scrapped_terms, known_terms])
    return (jianfan.ftoj(term) for term in terms)

def get_matches_query(query_obj):
    standardize = lambda text: jianfan.ftoj(text.lower())

    def matches_query(post):
        inputs = [
            standardize(arg) in standardize(post['text'])
            for arg in query_obj.args
        ]
        return query_obj.match(*inputs)

    return matches_query

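# Minimal usage sketch for get_matches_query (not from the original source).
# "Query" here is a hypothetical stand-in for query_obj: matches_query only
# needs an `args` sequence and a `match(*bools)` callable. Assumes jianfan is
# importable, as in the snippet above.
import collections

Query = collections.namedtuple('Query', ['args', 'match'])
query = Query(args=[u'音乐', u'演唱会'], match=lambda a, b: a and b)
post = {'text': u'今晚的音乐演唱会非常精彩'}
print(get_matches_query(query)(post))  # True: both terms occur in the post text
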
def match_names(official, other):
    if official == other:
        return 'exact'
    elif official in other or other in official:
        return 'substring'
    tr = jianfan.ftoj(other)  # Avoid repeated calls
    if official == tr:
        return 'translated'
    elif official in tr or tr in official:
        return 'translated substring'
    else:
        return False

def crawler(pkg_name):
    api = login()
    info = get_info(api, pkg_name)
    info = api.toDict(info)
    info = str(info)
    info = info.replace("true", "True")
    info = info.replace("false", "False")
    info = eval(str(info))
    doc = info["docV2"]
    game_name = doc["title"]
    game_name = ftoj(game_name)
    return game_name

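# A hedged alternative to the eval() round-trip used in crawler() above (a
# sketch, not the original code). ast.literal_eval only accepts Python
# literals, so it cannot execute arbitrary expressions the way eval() can on
# an untrusted API response. dict_from_api is a hypothetical helper name;
# api.toDict() is assumed to behave as in the snippets above.
import ast

def dict_from_api(api, raw_info):
    text = str(api.toDict(raw_info))
    text = text.replace("true", "True").replace("false", "False")
    return ast.literal_eval(text)  # raises ValueError on anything but literals
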
def _match_names(official, other):
    """Return a string describing the match between *official* and *other*."""
    if official == other:
        return 'exact'
    elif official in other or other in official:
        return 'substring'
    tr = jianfan.ftoj(other)  # Avoid repeated calls
    if official == tr:
        return 'translated'
    elif official in tr or tr in official:
        return 'translated substring'
    else:
        return False

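# Quick illustration of the matchers above (assumes jianfan maps traditional
# 臺灣 to simplified 台湾; the outputs are expected values, not guaranteed).
print(_match_names(u'北京', u'北京'))      # 'exact'
print(_match_names(u'北京', u'北京市'))    # 'substring'
print(_match_names(u'台湾省', u'臺灣省'))  # 'translated', if ftoj converts the traditional form
print(_match_names(u'上海', u'广州'))      # False
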
def search(query):
    query = normalize(query)
    if is_cjk(query):
        search_field = 'simplified'
        search_field_display = 'Hanzi'
        # We can support both traditional and simplified queries by converting
        # traditional to simplified. (Traditional -> simplified is many-to-one,
        # which means it's much harder to go the other way.)
        # Luckily someone made a pip-installable library, jianfan!
        query = jianfan.ftoj(query)
    elif is_pinyin(query):
        search_field = 'pinyin'
        search_field_display = 'Pinyin'
    else:
        search_field = 'english_full'
        search_field_display = 'English'

    results_list = []
    if search_field == 'english_full':
        whoosh_q = query_parser.parse(query)
        whoosh_results_list = searcher.search(whoosh_q)
        # Whoosh returns dictionary-like Hit objects; convert to explicit dictionaries.
        whoosh_results_list = [dict(result) for result in whoosh_results_list]
        results_list.extend(whoosh_results_list)
    if not results_list:
        # Only resort to mongo scanning if no results were found yet.
        regex_str = query.replace(' ', '.*')
        # Note: typically you want to escape user-generated strings before
        # turning them into regexes. In this case, I don't care.
        regex = re.compile(regex_str, re.UNICODE | re.IGNORECASE)
        results_list.extend(list(
            THE_DICTIONARY.find({search_field: regex}, {'_id': 0}).limit(MAX_RESULTS)
        ))
    # Comment out this line to see the effect of search result ranking.
    results_list.sort(cmp=make_cmp(query, search_field))
    results = dict(
        results=results_list,
        count=len(results_list),
        search_type=search_field_display,
    )
    return simplejson.dumps(results, indent=4)

def search(query, age_check=None):
    query = jianfan.ftoj(query)
    logging.debug('Requesting 163 for query <{0}>'.format(repr(query)))
    params = {'q': query.encode('utf8'), 'per_page': 20}
    for page in range(25):
        params['page'] = page + 1
        response = get_client().statuses_search(params)
        response = json.loads(response)
        if not response:
            break
        if type(response) is dict and response.get('error'):
            logging.warning(u'Netease has hit its access limit')
            break
        for result in response:
            if age_check and age_check(result.get('created_at')):
                raise StopIteration
            yield get_tweet(result)

def search(query, age_check=None):
    query = jianfan.ftoj(query)
    logging.debug('Requesting Tencent for query <{0}>'.format(repr(query)))
    for page in range(17):
        try:
            results = get_client().search.tweet(query, 30, page + 1)
        except qqweibo.error.QWeiboError as error:
            logging.warning(u'Received Tencent error <{0}>'.format(error.message))
            if u'rate' in error.message:
                break
            continue
        usernames = [tweet.name for tweet in results]
        users = dict(
            (user.name, user) for user in get_client()._user_infos(usernames))
        for tweet in results:
            if age_check and age_check(tweet.timestamp):
                raise StopIteration
            if query not in tweet.text:
                continue
            yield get_tweet(tweet, users)

def search(query, age_check=None):
    query = jianfan.ftoj(query)
    logging.debug('Requesting Sina for query <{0}>'.format(repr(query)))
    for page in range(25):
        uri = u'http://s.weibo.com/weibo/{0}?page={1}'.format(query, page + 1)
        response = requests.get(uri)
        next_page = False
        user_posts = collections.defaultdict(list)
        for post_info in scrape_posts(response.text):
            if age_check and age_check(post_info.get('datetime')):
                continue
            next_page = True  # if at least one post on a page passes age_check, try the next page
            user_posts[post_info.pop('uid')].append(post_info)
        user_infos = get_user_infos(user_posts.keys())
        for uid, post_infos in user_posts.items():
            for post_info in post_infos:
                post_info.update(user_infos.get(uid) or {})
                yield post_info
        if not next_page:
            break

def writetoaiml(input, outputset):
    global out, count
    out.write(" <category>\n")
    out.write(" <pattern>")
    words = ftoj(input.decode('utf-8')).encode('utf-8')
    # escape XML special characters for the AIML output
    words = words.replace("&", "&amp;")
    words = words.replace("<", "&lt;")
    words = words.replace(">", "&gt;")
    words = words.replace("'", "&apos;")
    words = words.replace('"', "&quot;")
    out.write(words)
    out.write("</pattern>\n")
    out.write(" <template>\n")
    if len(outputset) >= 1:
        out.write(" <random>\n")
        for x in outputset:
            print input
            print ftoj(x.decode('utf-8')).encode('utf-8')
            out.write(" <li>")
            words = x.replace("&", "&amp;")
            words = words.replace("<", "&lt;")
            words = words.replace(">", "&gt;")
            words = words.replace("'", "&apos;")
            words = words.replace('"', "&quot;")
            words = ftoj(words.decode('utf-8')).encode('utf-8')
            out.write(words)
            out.write("</li>\n")
            count += 1
        out.write(" </random>\n")
    else:
        print input
        x = outputset.pop()
        print ftoj(x.decode('utf-8')).encode('utf-8')
        words = x.replace("&", "&amp;")
        words = words.replace("<", "&lt;")
        words = words.replace(">", "&gt;")
        words = words.replace("'", "&apos;")
        words = words.replace('"', "&quot;")
        words = ftoj(words.decode('utf-8')).encode('utf-8')
        out.write(words + '\n')
        count += 1
    out.write(" </template>\n")
    out.write(" </category>\n")
    out.flush()
    # count += 1
    print count

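# The entity replacements in writetoaiml() can also be done with the standard
# library; a minimal sketch (escape_for_aiml is a hypothetical helper name).
# xml.sax.saxutils.escape handles &, < and > and accepts extra entities for quotes.
from xml.sax.saxutils import escape

def escape_for_aiml(text):
    return escape(text, {"'": "&apos;", '"': "&quot;"})
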
def crawler(game_infos):
    update_data = []
    for pkg_name in game_infos:
        try:
            api = login()
            info = get_info(api, pkg_name)
            info = api.toDict(info)
            info = str(info)
            info = info.replace("true", "True")
            info = info.replace("false", "False")
            info = eval(str(info))
            doc = info["docV2"]
            game_name = doc["title"]
            game_name = ftoj(game_name)
            info = {
                "pkg_name": pkg_name,
                "game_name": game_name
            }
            update_data.append(info)
        except Exception as e:
            logger.debug("%s crawler error %s" % (pkg_name, e))
    return update_data

def segmentToListPerQuery(queryString):
    listPerQuery = []
    segedList = []
    out1 = re.sub("[a-zA-Z]+", "", queryString)
    out1 = re.sub("[%s]" % re.escape(string.punctuation), "", out1)
    # segString = pseg.cut(queryString.decode("utf-8"))
    dd = jianfan.ftoj(out1).encode("utf-8")
    segString = pseg.cut(dd)
    # segString = pseg.cut(queryString.decode("utf-8"))
    # segString = jieba.cut(queryString, cut_all=False)
    # print ".. ".join(segString)
    # for i in segString:
    #     listPerQuery.append(i)
    for z in segString:
        # print z.word + "\n"
        # if z.flag == "n" or z.flag == "ns" or z.flag == "v" or z.flag == "t" or z.flag == "a" or z.flag == "nr" or z.flag == "nz" or z.flag == "i" or z.flag == "m":
        if z.flag != "x":
            # segedList.append(z.word.encode("utf-8"))
            dd = jianfan.jtof(z.word).encode("utf-8")
            # segedList.append(dd)
            segedList.append(z.word)
    return segedList

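# Minimal sketch of the segmentation step used above (assumes jieba is
# installed; pseg is jieba.posseg). Each pair carries the token and its
# part-of-speech flag, which segmentToListPerQuery filters on.
import jieba.posseg as pseg

for pair in pseg.cut(u'我爱北京天安门'):
    print(pair.word + u'/' + pair.flag)  # e.g. 北京/ns
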
def perpare_data(info, download_url, file_size, ver_code, ver_name, channel, apk_info):
    if not info:
        return
    doc = info["docV2"]
    # if channel == "GG官方":
    market_channel = channel
    if channel == "samsung":
        market_channel = "三星"
    game_name = doc["title"]
    pkg_name = apk_info["pkg_name"]
    game_desc = doc["descriptionHtml"]
    if game_desc:
        game_desc = game_desc.replace("<br>", "\n").replace("<p>", "\n").replace("</p>", "\n")
    app_details = doc['details']['appDetails']
    game_types = app_details["appCategory"][0]
    try:
        game_types = google_game_type[game_types]
    except:
        game_types = '其他'
    downloaded_cnts = doc["details"]["appDetails"]["numDownloads"]
    developer = doc['details']['appDetails']['developerName']
    utils.check_developer(developer)
    game_language = "多国语言"
    screen_shot_urls = ""
    icon_url = ""
    images = doc["image"]
    if images:
        for image in images:
            image_type = image["imageType"]
            image_url = image["imageUrl"]
            if image_type == 4:
                icon_url = image_url
            if image_type == 1:
                screen_shot_urls += image_url + "\n"
    is_crack_apk = 1  # cracked version
    min_sdk = ""
    star_num = doc["aggregateRating"]["starRating"]
    now = str(int(time.time()))
    label_info = dict()
    pkg_info = dict()
    game_name = ftoj(game_name)
    game_desc = ftoj(game_desc)
    # if channel == "GG官方":
    g_name = game_name + u"(%s)" % channel
    if channel == "samsung":
        g_name = game_name + u"(samsung)"
    # g_name = game_name + u"(GG官方)"
    # adrooy
    # g_name = game_name + u"(samsung)"
    game_id = utils.gen_label_info_id(g_name)
    g_name = g_name.replace(u"(GG官方)", "")
    if ver_name:
        ver_name = filter(lambda ch: ch in '0123456789.', ver_name)
    if 'gameid' in apk_info:
        game_id = apk_info['gameid']
    label_info["game_id"] = game_id
    label_info["game_name"] = g_name
    label_info["game_types"] = game_types
    label_info["origin_types"] = game_types
    label_info["screen_shot_urls"] = screen_shot_urls
    label_info["icon_url"] = icon_url
    label_info["detail_desc"] = game_desc
    label_info["star_num"] = utils.format_star_num(str(star_num), 2)
    label_info["download_counts"] = utils.format_install_num(downloaded_cnts)
    label_info["game_language"] = game_language
    label_info["now"] = now
    label_info["file_size"] = file_size
    label_info["ver_name"] = ver_name
    label_info["developer"] = developer
    pkg_info["market_channel"] = market_channel
    pkg_info["game_name"] = g_name
    pkg_info["pkg_name"] = pkg_name
    pkg_info["ver_code"] = ver_code
    pkg_info["ver_name"] = ver_name
    pkg_info["file_size"] = file_size
    pkg_info["download_urls"] = download_url.strip()
    pkg_info["game_desc"] = game_desc
    pkg_info["game_types"] = game_types
    pkg_info["origin_types"] = game_types
    pkg_info["downloaded_cnts"] = utils.format_install_num(downloaded_cnts)
    pkg_info["game_language"] = game_language
    pkg_info["screen_shot_urls"] = screen_shot_urls
    pkg_info["icon_url"] = icon_url
    pkg_info["now"] = now
    pkg_info["is_crack_apk"] = is_crack_apk
    if "ggvercode" not in apk_info:
        apk_info["ggvercode"] = "null"
    apk_id = utils.gen_pkg_info_id(0, pkg_name, ver_name, market_channel, apk_info["ggvercode"])
    pkg_info["apk_id"] = apk_id
    pkg_info["game_id"] = game_id
    pkg_info["url4details"] = "https://play.google.com/store/apps/details?id=%s" % pkg_name
    # print apk_id, game_id
    # import sys
    # sys.exit()
    return label_info, pkg_info

def org_headquarters(self, slot_type):
    def safe_get_sublist(l, target, head_space, tail_space):
        result = []
        target_index = [i for i, x in enumerate(l) if x[0] == target]
        for t in target_index:
            beg = t - head_space
            end = t + tail_space + 1
            if beg >= 0 and end <= len(l):
                result = l[beg:end]
            elif beg >= 0:
                result = l[beg:]
            elif beg <= len(l):
                result = l[:end]
            else:
                result = [l[t]]
        return result

    gpe_list = []
    if 'country' in slot_type:
        # load country list
        f = io.open('data/dict/country_list', 'r', -1, 'utf-8')
        for line in f:
            gpe_list.append(line.strip())
    elif 'state' in slot_type:
        # load province list
        f = io.open('data/dict/china_province_dict', 'r', -1, 'utf-8')
        for line in f:
            gpe_list.append(line.strip())
    elif 'city' in slot_type:
        # load city list
        china_province_city = cPickle.load(open('data/dict/china_province_city.pkl', 'rb'))
        for p in china_province_city:
            # type 0 means a municipality (直辖市); skip them unless the entry is Taiwan
            if p['type'] == 0 and p['name'] not in (u'台湾', u'臺灣'):
                continue
            for c in p['sub']:
                gpe_list.append(c['name'])
                if p['name'] in (u'台湾', u'臺灣'):
                    continue
                for d in c['sub']:
                    gpe_list.append(d['name'])

    line_outputs = []
    for e in self.evidences[slot_type]:
        # get context words around the query
        query_context = safe_get_sublist(e.parse_result['words'], self.query.name, 1, 0)
        for w in query_context:
            v = jianfan.ftoj(w[0])
            for element in gpe_list:
                for r in [u'区', u'县', u'市']:
                    v = v.replace(r, '')
                if element in w[0] and len(element) > 1:
                    slot_filler = element
                    l = self.create_line_output(e, slot_filler, 0, slot_type,
                                                combined_slot_filler=True)
                    # ================ post filtering ================= #
                    if u'友好' in l.slot_filler:
                        continue
                    line_outputs.append(l)
    return line_outputs

#!/usr/bin/env python
from jianfan import ftoj
import sys

for line in sys.stdin.readlines():
    print ftoj(line).encode("utf-8").rpartition("\n")[0]

# -*- coding: utf-8 -*-
import sys
from jianfan import jtof, ftoj

f = open(sys.argv[1])
s = ftoj(("".join(f.readlines())).decode('utf-8'))
s2 = s.split(" ")
f.close()
f2 = open(sys.argv[2], "w")
f2.write(("%s\t%s" % (s2[0], int(sys.argv[4]))).encode('utf-8'))
f2.close()
f3 = open(sys.argv[3], "w")
f3.write(("%s|||%s|||%s" % (s2[1], s2[2], s2[3])).encode('utf-8'))
f3.close()

def stateorprovince(self, slot_type, evidence_slot_type):
    current_output = self.query_answer.output[slot_type]
    city = None
    # find query's city answer.
    for line_output in self.query_answer.output[evidence_slot_type]:
        if line_output.slot_filler:
            city = line_output
    if city is None:
        return current_output

    # infer province by city
    province = ''
    evidence = ''  # evidence is a LineOutput object
    city_slot_filler = city.slot_filler
    city_slot_filler = jianfan.ftoj(city_slot_filler)
    for r in [u'区', u'县', u'市']:
        city_slot_filler = city_slot_filler.replace(r, '')
    for p in self.china_province_city:
        if province:
            break
        if p['type'] == 0:
            if city_slot_filler in [item['name'] for item in p['sub']]:
                province = p['name']
                evidence = city
                break
        else:
            for c in p['sub']:
                if city_slot_filler in [item['name'] for item in c['sub']]:
                    province = p['name']
                    evidence = city
                    break

    # if inference fails, return original answer
    if not province:
        return current_output

    # search provenance
    found_doc_path = search(province + city_slot_filler, self.searcher, self.analyzer, 50)
    if not found_doc_path:
        return current_output
    evidence_doc_path = found_doc_path[0]

    # add additional doc to source_doc for visualization
    doc_id = evidence_doc_path.split('/')[-1].strip()
    doc = io.open(evidence_doc_path, 'r', -1, 'utf-8').read()
    self.sf_object.query_docs[doc_id] = doc

    wp_beg = doc.find(province + city_slot_filler)
    wp_end = wp_beg + len(province + city_slot_filler) - 1
    sp_beg = wp_beg + doc[wp_beg:wp_end + 1].find(province)
    sp_end = sp_beg + len(province) - 1

    l = LineOutput()
    l.slot_type = slot_type
    l.run_id = self.query_answer.run_id
    p = Provenance()
    p.doc_id = doc_id
    p.beg = wp_beg
    p.end = wp_end
    p.text = province + city_slot_filler
    l.wide_provenance = [p]
    evidence.wide_provenance[0].inference = True
    l.wide_provenance += evidence.wide_provenance  # evidence is a LineOutput object
    l.slot_filler = province
    p = Provenance()
    p.doc_id = doc_id
    p.beg = sp_beg
    p.end = sp_end
    p.text = province
    l.slot_filler_prov = [p]
    l.confidence_score = 1
    return current_output + [l]

def perpare_data(info, apk_info):
    if not info:
        return
    doc = info["docV2"]
    game_name = doc["title"]
    game_desc = doc["descriptionHtml"]
    if game_desc:
        game_desc = game_desc.replace("<br>", "\n").replace("<p>", "\n").replace("</p>", "\n")
    app_details = doc['details']['appDetails']
    game_types = app_details["appCategory"][0]
    try:
        game_types = google_game_type[game_types]
    except:
        game_types = '其他'
    downloaded_cnts = doc["details"]["appDetails"]["numDownloads"]
    developer = doc['details']['appDetails']['developerName']
    game_language = "多国语言"
    screen_shot_urls = ""
    icon_url = ""
    images = doc["image"]
    if images:
        for image in images:
            image_type = image["imageType"]
            image_url = image["imageUrl"]
            if image_type == 4:
                icon_url = image_url
            if image_type == 1:
                screen_shot_urls += image_url + "\n"
    star_num = doc["aggregateRating"]["starRating"]
    # convert Traditional Chinese to Simplified
    game_name = ftoj(game_name)
    game_desc = ftoj(game_desc)
    developer = ftoj(developer)
    pkg_info = {}
    label_info = {}
    pkg_info['apk_id'] = apk_info['apk_id']
    pkg_info['game_id'] = apk_info['gameid']
    pkg_info['market_channel'] = apk_info['channel']
    pkg_info['game_name'] = game_name
    pkg_info['pkg_name'] = apk_info['pkg_name']
    pkg_info['ver_code'] = apk_info['ver_code']
    pkg_info['ver_name'] = apk_info['ver_name']
    pkg_info['file_size'] = apk_info['file_size']
    pkg_info['download_url'] = apk_info['download_url']
    pkg_info['game_desc'] = game_desc
    pkg_info['downloaded_cnts'] = downloaded_cnts
    pkg_info['game_language'] = game_language
    pkg_info['screen_shot_urls'] = screen_shot_urls
    pkg_info['icon_url'] = icon_url
    pkg_info['min_sdk'] = apk_info['min_sdk']
    pkg_info['download_url_type'] = apk_info['download_url_type']
    pkg_info['source'] = apk_info['source']
    pkg_info['signature_md5'] = apk_info['signature_md5']
    pkg_info['file_md5'] = apk_info['file_md5']
    pkg_info['origin_types'] = game_types
    pkg_info['gpu_vender'] = apk_info['gpu_vender']
    pkg_info['ver_code_by_gg'] = apk_info['ggvercode']
    pkg_info['update_desc'] = apk_info['update_desc']
    pkg_info['file_type'] = apk_info['file_type']
    pkg_info['save_user'] = apk_info['save_user']
    pkg_info['now'] = int(time.time())
    label_info['game_id'] = apk_info['gameid']
    label_info['game_name'] = game_name
    label_info['screen_shot_urls'] = screen_shot_urls
    label_info['icon_url'] = icon_url
    label_info['detail_desc'] = game_desc
    label_info['game_language'] = game_language
    label_info['file_size'] = apk_info['file_size']
    label_info['ver_name'] = apk_info['ver_name']
    label_info['source'] = apk_info['source']
    label_info['origin_types'] = game_types
    label_info['developer'] = developer
    label_info['save_user'] = apk_info['save_user']
    label_info['enabled'] = 0
    label_info['now'] = int(time.time())
    label_info['downloaded_cnts'] = downloaded_cnts
    label_info['star_num'] = star_num
    return label_info, pkg_info

def update(f, verbose=False):
    global data
    # Read the NBS website
    # d0 = parse_raw(urllib.request.urlopen(URL))
    # commented: Use a cached copy of the website
    codes = parse_raw(f)

    # Save the latest table
    with open(data_fn('latest'), 'w') as f1:
        w = csv.writer(f1, lineterminator=linesep)
        w.writerow(['code', 'name_zh', 'level'])
        for code in sorted(codes.keys()):
            w.writerow([code, codes[code]['name_zh'], codes[code]['level']])

    # Load the CITAS table
    d1 = load_file(open(data_fn('citas'), 'r'), 'C-gbcode',
                   lambda row: row['todate'] == '19941231')

    # Load the GB/T 2260-2007 tables, from two files
    d2 = load_file(open(data_fn('gbt_2260-2007'), 'r'), 'code')
    d3 = load_file(open(data_fn('gbt_2260-2007_sup'), 'r'), 'code')
    for code, d in d3.items():
        # Merge the two GB/T 2260-2007 files
        if code in d2:
            # Code appears in both files
            # Don't overwrite name_zh in gbt_2260-2007.csv with an empty
            # name_zh from gbt_2260-2007_sup.csv
            dict_update(d2[code], d,
                        conflict=lambda a, b, k: not(k == 'name_zh' or b is None))
        else:
            # Code only appears in gbt_2260-2007_sup.csv
            d2[code] = d

    # Load extra data pertaining to the latest table
    d4 = load_file(open(data_fn('extra'), 'r'), 'code')

    # Merge using codes
    for code in sorted(codes.keys()):
        # Store debug information to be printed (or not) later
        message = '{}\t{}\n'.format(code, codes[code]['name_zh'])

        # Merge CITAS entry for this code
        if code not in d1:
            message += ' does not appear in CITAS data set\n'
        else:
            d = dict(d1[code])  # Make a copy
            name_zh = d.pop('N-hanzi')
            if not match_names(codes[code]['name_zh'], name_zh):
                message += ' CITAS name {} ({}) does not match\n'.format(
                    name_zh, jianfan.ftoj(name_zh))
            else:
                d['name_en'] = d.pop('N-local').replace("`", "'")
                d['name_pinyin'] = d.pop('N-pinyin').replace("`", "'")
                dict_update(codes[code], d)

        # Merge GB/T 2260-2007 entry for this code
        if code not in d2:
            message += ' does not appear in GB/T 2260-2007\n'
        else:
            d = dict(d2[code])
            if (len(d['name_zh']) and not codes[code]['name_zh'] == d['name_zh']):
                message += ' GB/T 2260-2007 name {} does not match\n'.format(
                    d['name_zh'])
            else:
                # Don't overwrite name_en from CITAS with empty name_en from
                # GB/T 2260-2007
                dict_update(codes[code], d, conflict=lambda a, b, k: not(
                    'name_' in k and b == ''))

        # Merge extra data
        if code in d4:
            dict_update(codes[code], d4[code], conflict='squash')

        if verbose and message.count('\n') > 1:
            print(message, end='')

    # TODO merge on names

    # Write the unified data set to file
    with open(data_fn('unified'), 'w') as f:
        w = csv.DictWriter(f, ('code', 'name_zh', 'name_en', 'name_pinyin',
                               'alpha', 'level', 'latitude', 'longitude'),
                           extrasaction='ignore', lineterminator=linesep)
        w.writeheader()
        for k in sorted(codes.keys()):
            w.writerow(codes[k])

def tradition_to_simple(text):
    # ftoj converts Traditional Chinese to Simplified Chinese
    # (parameter renamed from `str` to avoid shadowing the builtin)
    return ftoj(text)

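# Round-trip sketch with jianfan: ftoj converts Traditional Chinese to
# Simplified, jtof goes the other way (assumes the jianfan package is installed).
from jianfan import ftoj, jtof

print(ftoj(u'電腦'))  # -> 电脑
print(jtof(u'电脑'))  # -> 電腦
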
def update(version='2015-09-30', use_cache=False, verbose=False, target=None):
    """Update the database.

    :meth:`update` relies on four sources, in the following order of
    authority:

    1. Error corrections from ``extra.csv``.
    2. The latest list of codes from the NBS website indicated by *version*
       (see :meth:`parse_html`). For instance, *version* ‘2013-08-31’ was
       published on 2014-01-17. If *use_cache* is :py:data:`True`, then a
       cached HTML list is used from the directory ``data/cache/`` (see
       :meth:`refresh_cache`). Otherwise, or if the cache is missing, the
       file is downloaded from the website.
    3. The data set `GuoBiao (GB) Codes for the Administrative Divisions of
       the People's Republic of China, v1 (1982 – 1992)
       <http://sedac.ciesin.columbia.edu/data/set/
       cddc-china-guobiao-codes-admin-divisions>`_ (``citas.csv``), produced
       by the NASA Socioeconomic Data and Applications Center (SEDAC), the
       University of Washington Chinese Academy of Surveying and Mapping
       (CASM), the Columbia University Center for International Earth Science
       Information Network (CIESIN) as part of the China in Time and Space
       (*CITAS*) project. This data set contains Pinyin transcriptions.
    4. The information in ``gbt_2260-2007.csv`` (provided by `@qiaolun
       <https://github.com/qiaolun>`_) and ``gbt_2260-2007_sup.csv``
       (supplement) transcribed from the published GB/T 2260-2007 standard.

    If *verbose* is :py:data:`True`, verbose output is given.

    The following files are updated:

    - ``latest.csv`` with information from source #1 only: codes, Chinese
      names (``name_zh``), and ``level``.
    - ``unified.csv`` with all database fields and information from sources
      #2, #3 and #4.
    - ``unified.db``, the same information in a :py:mod:`sqlite3` database.
    """
    _configure_log(verbose)

    if use_cache:
        try:
            fn = data_fn(os.path.join('cache', version), 'html')
            log.info('reading from cached %s', fn)
            f = open(fn, 'r')
        except FileNotFoundError:
            log.info('  missing.')
            use_cache = False

    if not use_cache:
        from urllib.request import urlopen
        log.info('retrieving codes from %s', URLS[version])
        f = urlopen(URLS[version])

    # Parse the codes from HTML
    log.info('  parsing...')
    codes = parse_html(f, version.split('-')[0])
    assert sorted(codes.keys()) == list(codes.keys())
    log.info('  done.')

    # Save the latest table
    fn = data_fn('latest', path=target)
    with open(fn, 'w') as f1:
        w = csv.writer(f1, lineterminator=linesep)
        w.writerow(['code', 'name_zh', 'level'])
        for code in sorted(codes.keys()):
            w.writerow([code, codes[code]['name_zh'], codes[code]['level']])
    log.info('wrote %s', fn)

    # Load the CITAS table
    d1 = load_csv('citas', 'C-gbcode',
                  filter=lambda row: row['todate'] == '19941231')
    log.info('read CITAS data')

    # Load the GB/T 2260-2007 tables, from two files
    d2 = load_csv('gbt_2260-2007')
    d3 = load_csv('gbt_2260-2007_sup')
    log.info('loaded GB/T 2260-2007 entries')

    # Merge the two GB/T 2260-2007 files
    for code, d in d3.items():
        if code in d2:
            # Code appears in both files. Don't overwrite name_zh in
            # gbt_2260-2007.csv with an empty name_zh from
            # gbt_2260-2007_sup.csv.
            _dict_update(d2[code], d,
                         conflict=lambda a, b, k: not(k == 'name_zh' or b is None))
        else:
            # Code only appears in gbt_2260-2007_sup.csv
            d2[code] = d

    # Load extra data pertaining to the latest table
    d4 = load_csv('extra')
    log.info('loaded extra data')

    # Regular expression for English names from the CITAS database:
    # In a name like 'Beijing: Dongcheng qu' the prefix 'Beijing: ' is a
    # repetition of the name of the parent division, and the suffix ' qu' is
    # the type, not the name, of the area.
    name_re = re.compile('(?:[^:]*: )?(.*?)(?: (%s))?$' % '|'.join(SUFFIXES))
    pinyin = Pinyin()

    # Merge using codes
    log.info('merging codes')
    for code, entry in codes.items():
        # Store debug information to be printed (or not) later
        message = ['%s\t%s' % (code, entry['name_zh'])]

        if code in d1:
            # Merge CITAS entry for this code
            d = dict(d1[code])  # Make a copy
            name_zh = d.pop('N-hanzi')
            if not _match_names(entry['name_zh'], name_zh):
                message.append(' CITAS name %s (%s) does not match' %
                               (name_zh, jianfan.ftoj(name_zh)))
            else:
                d['name_en'] = d.pop('N-local').replace("`", "'")
                d['name_pinyin'] = d.pop('N-pinyin').replace("`", "'")
                _dict_update(entry, d)
        else:
            message.append(' does not appear in CITAS data set')

        if code in d2:
            # Merge GB/T 2260-2007 entry for this code
            d = dict(d2[code])
            if len(d['name_zh']) and entry['name_zh'] != d['name_zh']:
                message.append(' GB/T 2260-2007 name %s does not match' %
                               d['name_zh'])
            else:
                # Don't overwrite name_en from CITAS with empty name_en from
                # GB/T 2260-2007
                _dict_update(entry, d, conflict=lambda a, b, k: not(
                    'name_' in k and b == ''))
        else:
            message.append(' does not appear in GB/T 2260-2007')

        # Merge extra data
        if code in d4:
            _dict_update(entry, d4[code], conflict='squash')

        # Clean up English names (in most cases, the CITAS romanized name)
        if entry['name_en'] is not None:
            # Replace ' shixiaqu' with ' city area', but do not discard
            name_en = entry['name_en'].replace(' shixiaqu', ' city area')
            # Use regex to discard prefixes and suffixes on names
            entry['name_en'] = name_re.match(name_en).group(1)
        elif entry['name_zh'] == '市辖区':
            # Fill in blank with 'CITYNAME city area', where possible
            pname = codes[_parents(code)[1]]['name_en']
            entry['name_en'] = None if pname is None else pname + ' city area'

        # Fill in pinyin names
        if entry['name_pinyin'] is None:
            entry['name_pinyin'] = pinyin.get_pinyin(entry['name_zh'], '').title()

        if len(message) > 1 and 'does not appear in CITAS' not in message[1]:
            log.info('\n'.join(message))
        else:
            log.debug('\n'.join(message))

    log.info('merge complete')

    # Write the unified data set to CSV
    fn = data_fn('unified', path=target)
    with open(fn, 'w') as f:
        w = csv.DictWriter(f, ('code', 'name_zh', 'name_en', 'name_pinyin',
                               'alpha', 'level', 'latitude', 'longitude'),
                           extrasaction='ignore', lineterminator=linesep)
        w.writeheader()
        for k in sorted(codes.keys()):
            w.writerow(codes[k])
    log.info('wrote %s', fn)

    write_sqlite('unified', codes, target=target)
    log.info('wrote sqlite3 database')

def evidence_extaction(self):
    # ************* batch segment long article ************* #
    start = time.time()
    if os.path.exists('data/.tmp/'):
        shutil.rmtree('data/.tmp')
    os.makedirs('data/.tmp/')  # create a temporary dir for parsing large paragraphs
    for doc_id in self.cleaned_docs:
        f = io.open(os.path.join('data/.tmp', doc_id), 'w', -1, 'utf-8')
        f.write(self.cleaned_docs[doc_id])
        f.close()

    # run stanford segmenter
    stanford_nlp_dir = os.path.join(self.CN_SF_PATH,
                                    'externals/stanford-corenlp-full-2014-08-27/')
    segmenter_result = list(batch_parse(
        'data/.tmp/', stanford_nlp_dir,
        properties=os.path.join(stanford_nlp_dir,
                                "StanfordCoreNLP-chinese.Segmenter.properties")))
    for r in segmenter_result:
        self.segmented_docs[r['file_name']] = r['sentences']
    print('segmenting time cost ' + str(time.time() - start))

    # cPickle for development
    # cPickle.dump(self.segmented_docs, open('data/segmented_docs.pkl', 'wb'))
    # self.segmented_docs = cPickle.load(open('data/segmented_docs.pkl', 'rb'))

    # ************* select evidence ************* #
    sent_to_parse = dict()
    self.evidence = OrderedDict()
    for query in self.queries:
        print('\textracting ' + query.name)
        evidences = OrderedDict()  # {slot_type: sentence_parsed_result}
        for doc_id in self.query_docs[query.id].keys():
            seg_result = self.segmented_docs[doc_id]
            for i in xrange(len(seg_result)):
                # sentence is stanford standard format output
                sentence = seg_result[i]
                sent_id = '|'.join([doc_id, str(i)])
                # if a sentence is too long or too short, it carries less dependency information
                if len(sentence['words']) > 130 or len(sentence['words']) < 3:
                    continue
                sent_text = ''.join(sentence['text'])

                # *************** check if this sentence is an evidence ******************** #
                # ============== common case ============= #
                seg_sent_text = sentence['text']  # list of tokens
                seg_sent_text = [jianfan.ftoj(w) for w in seg_sent_text]
                # here joining s['text'] list will overcome segmentation errors
                if query.name not in ''.join(seg_sent_text):
                    continue
                triggers = self.triggers[query.entity_type]
                if query.entity_type == 'PER':
                    slot_types = self.PER_SLOT_TYPE
                elif query.entity_type == 'ORG':
                    slot_types = self.ORG_SLOT_TYPE
                for slot_type in slot_types:
                    if slot_type not in evidences.keys():
                        evidences[slot_type] = []
                    for t in triggers[slot_type]:
                        # compare triggers to segmented words; might be affected by segmentation errors
                        if t not in seg_sent_text:
                            continue
                        evidences[slot_type].append(
                            Evidence(doc_id, query.id, t, sent_text, sent_id))
                        sent_to_parse[sent_id] = sent_text  # add sentence and do parallel parsing later
                # ============== special case ============== #
                if query.entity_type == 'PER':
                    evidences['per:alternate_names'].append(
                        Evidence(doc_id, query.id, '', sent_text, sent_id, sentence))
                if query.entity_type == 'ORG':
                    # for org:alternate_names, the article containing the query is evidence, for pattern match
                    evidences['org:alternate_names'].append(
                        Evidence(doc_id, query.id, '', sent_text, sent_id, sentence))
                    # for org:XXX_headquarters, the article containing the query is evidence, for pattern match
                    evidences['org:country_of_headquarters'].append(
                        Evidence(doc_id, query.id, '', sent_text, sent_id, sentence))
                    evidences['org:stateorprovince_of_headquarters'].append(
                        Evidence(doc_id, query.id, '', sent_text, sent_id, sentence))
                    evidences['org:city_of_headquarters'].append(
                        Evidence(doc_id, query.id, '', sent_text, sent_id, sentence))

        self.evidence[query.id] = evidences

    # *************** parallel parsing ****************** #
    def chunkIt(seq, num):
        avg = len(seq) / float(num)
        out = []
        last = 0.0
        while last < len(seq):
            out.append(seq[int(last):int(last + avg)])
            last += avg
        return out

    # run stanford parser in multiprocessing
    process_num = multiprocessing.cpu_count() / 2 if multiprocessing.cpu_count() / 2 < 10 else 10
    p = multiprocessing.Pool(processes=process_num)
    chunked_sent = [dict(item) for item in chunkIt(sent_to_parse.items(), process_num)]
    mp_result = [p.apply_async(stanford_parser, args=(chunked_sent[i], str(i)))
                 for i in range(process_num)]
    mp_result = [p.get() for p in mp_result]
    sent_parsing_result = {}
    for r in mp_result:
        sent_parsing_result.update(r)

    # cPickle for development
    # cPickle.dump(sent_parsing_result, open('data/sent_parsing_result.pkl', 'wb'))
    # sent_parsing_result = cPickle.load(open('data/sent_parsing_result.pkl', 'rb'))

    # updating evidences
    for q_id in self.evidence.keys():
        evidences = self.evidence[q_id]
        for slot_type in evidences.keys():
            for e in evidences[slot_type]:
                if not e.trigger:
                    continue
                e.parse_result = sent_parsing_result[e.sent_id]

    # *************** correct segmenter error ******************** #
    china_province_city = cPickle.load(open('data/dict/china_province_city.pkl', 'rb'))
    province_city_list = []
    for p in china_province_city:
        province_city_list += [p['name']]
        for c in p['sub']:
            province_city_list += [c['name']]
            if p['type'] == 0:
                continue
            for d in c['sub']:
                province_city_list += [d['name']]
    for q_id in self.evidence.keys():
        for slot_type in self.evidence[q_id]:
            for i in xrange(len(self.evidence[q_id][slot_type])):
                self.evidence[q_id][slot_type][i] = self.correct_evidence(
                    self.find_query(q_id).name, self.evidence[q_id][slot_type][i])
                for p_or_c in province_city_list:
                    if len(p_or_c) > 2 and p_or_c in \
                            ''.join(self.evidence[q_id][slot_type][i].parse_result['text']):
                        self.evidence[q_id][slot_type][i] = \
                            self.correct_evidence(p_or_c, self.evidence[q_id][slot_type][i])
    print('Done')

def country(self, slot_type, evidence_slot_type):
    current_output = self.query_answer.output[slot_type]
    province = None
    # find query's province answer.
    for line_output in self.query_answer.output[evidence_slot_type]:
        if line_output.slot_filler:
            province = line_output
    if province is None:
        return current_output

    # infer country by province
    country = ''
    evidence = ''  # evidence is a LineOutput object
    state_slot_filler = jianfan.ftoj(province.slot_filler)
    for c in self.world_coutry_province:
        if state_slot_filler in self.world_coutry_province[c]:
            country = c
            evidence = province
            break

    # if inference fails, return original answer
    if not country:
        return current_output

    # search provenance
    found_doc_path = search(country + state_slot_filler,
                            self.sf_object.lucene_searcher,
                            self.sf_object.lucene_analyzer, 50)
    if not found_doc_path:
        return current_output
    evidence_doc_path = found_doc_path[0]

    # add additional doc to source_doc for visualization
    doc_id = evidence_doc_path.split('/')[-1].strip()
    doc = io.open(evidence_doc_path, 'r', -1, 'utf-8').read()
    self.sf_object.query_docs[doc_id] = doc

    wp_beg = doc.find(country + state_slot_filler)
    wp_end = wp_beg + len(country + state_slot_filler) - 1
    sp_beg = wp_beg + doc[wp_beg:wp_end + 1].find(country)
    sp_end = sp_beg + len(country) - 1

    l = LineOutput()
    l.slot_type = slot_type
    l.run_id = self.query_answer.run_id
    p = Provenance()
    p.doc_id = doc_id
    p.beg = wp_beg
    p.end = wp_end
    p.text = country + state_slot_filler
    l.wide_provenance = [p]
    evidence.wide_provenance[0].inference = True
    l.wide_provenance += evidence.wide_provenance  # evidence is a LineOutput object
    l.slot_filler = country
    p = Provenance()
    p.doc_id = doc_id
    p.beg = sp_beg
    p.end = sp_end
    p.text = country
    l.slot_filler_prov = [p]
    l.confidence_score = 1

    # if the province is 台湾, the country answer should also include it
    if u'台湾' in jianfan.ftoj(province.slot_filler):
        return current_output + [l, province]
    return current_output + [l]
