def get_glyph_name(font: TTFont, codepoint: int) -> Optional[str]:
    """Return the glyph name that *font* maps *codepoint* to.

    Args:
        font: Font whose best Unicode character map is consulted.
        codepoint: Unicode code point to look up.

    Returns:
        The glyph name, or ``None`` when the code point is not mapped or
        the font has no Unicode character map at all.
    """
    best_cmap = font.getBestCmap()
    if best_cmap is None:
        # getBestCmap() yields None when no Unicode cmap subtable exists;
        # a membership test on None would raise TypeError.
        return None
    return best_cmap.get(codepoint)
url = 'https://bj.58.com/haidian/pinpaigongyu/pn/{0}/?minprice=2000_3000' page = 0 file_dir = "{0}".format(os.getcwd()) path = os.path.join(file_dir,"BJ_haidian_2000_3000.csv") columns = ['name','location','price','url'] house_data = [] while True: page += 1 resp = requests.get(url.format(page),headers=random.choice(headers)) if resp: base64_str = re.findall('data:application/font-ttf;charset=utf-8;base64,(.*?)\'\) format\(\'truetype\'\)}',resp.text) bin_data = base64.b64decode(base64_str[0]) fonts = TTFont(io.BytesIO(bin_data)) bestcmap = fonts.getBestCmap() newmap = {} for key in bestcmap.keys(): value = int(re.findall(r'(\d+)', bestcmap[key])[0]) - 1 key = hex(key) newmap[key] = value print('==========', newmap) resp_ = resp.text doc = pq(resp_) house_list = doc.find('.list li') if not house_list: break #print(doc.find('.page a').eq(-2).text()) for each in house_list.items():
class UFOFont(BaseFont):
    """Font backed by a UFO source.

    Glyph outlines are read from the UFO glyph sets, while a binary font
    compiled from the UFO (``self.ttFont``) supplies the character map and
    is handed to the shaper.
    """

    # State snapshot used by canReloadWithChange(); created on first load().
    ufoState = None

    def resetCache(self):
        """Reset the base-class cache and discard the cached properties."""
        super().resetCache()
        del self.defaultVerticalAdvance
        del self.defaultVerticalOriginY
        del self.globalColorLayerMapping

    def _setupReaderAndGlyphSet(self):
        """(Re)create the UFO reader, the default glyph set and the layer cache."""
        self.reader = UFOReader(self.fontPath, validate=False)
        self.glyphSet = self.reader.getGlyphSet()
        self.glyphSet.glyphClass = Glyph
        self.layerGlyphSets = {}

    async def load(self, outputWriter):
        """Read the UFO, compile it and build the shaper.

        When a reader already exists only the glyph cache is cleared and
        nothing is recompiled.

        Args:
            outputWriter: Passed through to ``compileUFOToBytes``.
        """
        if hasattr(self, "reader"):
            self._cachedGlyphs = {}
            return
        self._setupReaderAndGlyphSet()
        self.info = SimpleNamespace()
        self.reader.readInfo(self.info)
        self.lib = self.reader.readLib()
        self._cachedGlyphs = {}
        if self.ufoState is None:
            includedFeatureFiles = extractIncludedFeatureFiles(
                self.fontPath, self.reader)
            self.ufoState = UFOState(
                self.reader, self.glyphSet,
                getUnicodesAndAnchors=self._getUnicodesAndAnchors,
                includedFeatureFiles=includedFeatureFiles)

        fontData = await compileUFOToBytes(self.fontPath, outputWriter)

        f = io.BytesIO(fontData)
        self.ttFont = TTFont(f, lazy=True)
        self.shaper = self._getShaper(fontData)

    def updateFontPath(self, newFontPath):
        """This gets called when the source file was moved."""
        super().updateFontPath(newFontPath)
        self._setupReaderAndGlyphSet()

    def getExternalFiles(self):
        """Return the feature files recorded in the current UFO state."""
        return self.ufoState.includedFeatureFiles

    def canReloadWithChange(self, externalFilePath):
        """Try to absorb an on-disk change without a full reload.

        Args:
            externalFilePath: Truthy when the change concerns an external
                (feature) file rather than the UFO itself.

        Returns:
            ``True`` when the change was handled in place, ``False`` when
            the caller has to reload the font completely.
        """
        if self.reader.fileStructure != UFOFileStructure.PACKAGE:
            # We can't (won't) partially reload .ufoz
            return False

        if externalFilePath:
            # Features need to be recompiled no matter what
            return False

        self.glyphSet.rebuildContents()

        self.ufoState = self.ufoState.newState()
        (needsFeaturesUpdate, needsGlyphUpdate, needsInfoUpdate,
         needsCmapUpdate, needsLibUpdate) = self.ufoState.getUpdateInfo()

        if needsFeaturesUpdate:
            return False

        if needsInfoUpdate:
            # font.info changed, all we care about is a possibly changed
            # unitsPerEm
            self.info = SimpleNamespace()
            self.reader.readInfo(self.info)

        if needsCmapUpdate:
            # The cmap changed. Let's update it in-place and only rebuild
            # the shaper
            newCmap = {
                code: gn
                for gn, codes in self.ufoState.unicodes.items()
                for code in codes
            }
            fb = FontBuilder(font=self.ttFont)
            fb.setupCharacterMap(newCmap)
            f = io.BytesIO()
            self.ttFont.save(f, reorderTables=False)
            self.shaper = self._getShaper(f.getvalue())

        if needsLibUpdate:
            self.lib = self.reader.readLib()

        # We don't explicitly track changes in layers, but they may be involved
        # in building layered color glyphs, so let's just always reset the cache.
        self.resetCache()

        return True

    def _getUnicodesAndAnchors(self):
        """Return ``(unicodes, anchors)`` taken from the compiled font.

        ``unicodes`` maps each glyph name to the list of code points that
        the compiled font's best cmap assigns to it.
        """
        unicodes = defaultdict(list)
        for code, gn in self.ttFont.getBestCmap().items():
            unicodes[gn].append(code)
        # NOTE(review): pickle.loads executes arbitrary code on hostile
        # input. This is only safe while the "FGAx" table always comes from
        # the font compiled in load() -- confirm nothing else can supply it.
        anchors = pickle.loads(self.ttFont["FGAx"].data)
        return unicodes, anchors

    def _getShaper(self, fontData):
        """Build an ``HBShape`` for *fontData* wired to this font's metrics callbacks."""
        return HBShape(fontData,
                       getHorizontalAdvance=self._getHorizontalAdvance,
                       getVerticalAdvance=self._getVerticalAdvance,
                       getVerticalOrigin=self._getVerticalOrigin,
                       ttFont=self.ttFont)

    @cachedProperty
    def unitsPerEm(self):
        return self.info.unitsPerEm

    def _getGlyph(self, glyphName, layerName=None):
        """Return the (cached) glyph *glyphName* from *layerName*.

        A glyph that cannot be read is reported on stderr and replaced by
        the default layer's ``.notdef``. A synthetic ``NotDefGlyph`` is used
        when ``.notdef`` is missing from the default glyph set, or when it
        exists there but cannot be read.
        """
        glyph = self._cachedGlyphs.get((layerName, glyphName))
        if glyph is None:
            if glyphName == ".notdef" and glyphName not in self.glyphSet:
                # We need a .notdef glyph, so let's make one.
                glyph = self._makeNotDefGlyph()
            else:
                try:
                    if layerName is None:
                        glyph = self.glyphSet[glyphName]
                    else:
                        glyph = self.getLayerGlyphSet(layerName)[glyphName]
                    self._addOutlinePathToGlyph(glyph)
                except Exception as e:
                    # TODO: logging would be better but then capturing in mainWindow.py is harder
                    print(f"Glyph '{glyphName}' could not be read: {e!r}", file=sys.stderr)
                    if glyphName == ".notdef" and layerName is None:
                        # The fallback glyph itself is unreadable. Asking
                        # for ".notdef" again would recurse forever because
                        # nothing has been cached yet, so synthesize one.
                        glyph = self._makeNotDefGlyph()
                    else:
                        glyph = self._getGlyph(".notdef")
            self._cachedGlyphs[(layerName, glyphName)] = glyph
        return glyph

    def _makeNotDefGlyph(self):
        """Create a synthetic ``.notdef`` glyph with its outline attached."""
        glyph = NotDefGlyph(self.info.unitsPerEm)
        self._addOutlinePathToGlyph(glyph)
        return glyph

    def _addOutlinePathToGlyph(self, glyph):
        """Draw *glyph* into a ``CocoaPen`` and store the path as ``glyph.outline``."""
        pen = CocoaPen(self.glyphSet)
        glyph.draw(pen)
        glyph.outline = pen.path

    def _getHorizontalAdvance(self, glyphName):
        glyph = self._getGlyph(glyphName)
        return glyph.width

    @cachedProperty
    def defaultVerticalAdvance(self):
        """Vertical advance used for glyphs that do not define a height."""
        ascender = getattr(self.info, "ascender", None)
        descender = getattr(self.info, "descender", None)
        if ascender is None or descender is None:
            return self.info.unitsPerEm
        else:
            return ascender + abs(descender)

    @cachedProperty
    def defaultVerticalOriginY(self):
        """Vertical origin used for glyphs without ``public.verticalOrigin``."""
        ascender = getattr(self.info, "ascender", None)
        if ascender is None:
            return self.info.unitsPerEm  # ???
        else:
            return ascender

    def _getVerticalAdvance(self, glyphName):
        """Return the vertical advance of *glyphName* as a non-positive number."""
        glyph = self._getGlyph(glyphName)
        vAdvance = glyph.height
        if vAdvance is None or vAdvance == 0:  # XXX default vAdv == 0 -> bad UFO spec
            vAdvance = self.defaultVerticalAdvance
        return -abs(vAdvance)

    def _getVerticalOrigin(self, glyphName):
        """Return ``(True, x, y)``: the vertical origin of *glyphName*.

        ``x`` is half the glyph width; ``y`` comes from the glyph lib's
        ``public.verticalOrigin`` or, failing that, the font default.
        """
        glyph = self._getGlyph(glyphName)
        vOrgX = glyph.width / 2
        lib = getattr(glyph, "lib", {})
        vOrgY = lib.get("public.verticalOrigin")
        if vOrgY is None:
            vOrgY = self.defaultVerticalOriginY
        return True, vOrgX, vOrgY

    def _getGlyphDrawing(self, glyphName, colorLayers):
        """Return a ``GlyphDrawing`` for *glyphName*.

        With *colorLayers* set, the glyph-level color layer mapping (or the
        font-level one as fallback) selects one outline per layer; layers
        that resolve to a ``NotDefGlyph`` are left out. Otherwise, or when
        no layer outline is found, a single uncolored outline is returned.
        """
        glyph = self._getGlyph(glyphName)
        if colorLayers:
            colorLayerMapping = glyph.lib.get(COLOR_LAYER_MAPPING_KEY)
            if colorLayerMapping is None:
                colorLayerMapping = self.globalColorLayerMapping
            if colorLayerMapping is not None:
                layers = []
                for layerName, colorID in colorLayerMapping:
                    glyph = self._getGlyph(glyphName, layerName)
                    if not isinstance(glyph, NotDefGlyph):
                        layers.append((glyph.outline, colorID))
                if layers:
                    return GlyphDrawing(layers)
        return GlyphDrawing([(glyph.outline, None)])

    @cachedProperty
    def colorPalettes(self):
        return self.lib.get(COLOR_PALETTES_KEY)

    @cachedProperty
    def globalColorLayerMapping(self):
        return self.lib.get(COLOR_LAYER_MAPPING_KEY)

    def getLayerGlyphSet(self, layerName):
        """Return the glyph set for *layerName*, creating and caching it on first use."""
        layerGlyphSet = self.layerGlyphSets.get(layerName)
        if layerGlyphSet is None:
            layerGlyphSet = self.reader.getGlyphSet(layerName)
            self.layerGlyphSets[layerName] = layerGlyphSet
        return layerGlyphSet
"cid10920": '8', "cid00026": '9', "cid00771": '9', "cid00939": '9', "cid00919": '9', "cid01068": '9', "cid26924": '9', "cid19425": '8', "cid00783": '9', "cid01923": '9', "cid09631": '9', "cid02040": '9', "cid00959": "9", } dict_ = {} for k, v in font.getBestCmap().items(): k = hex(k).replace('0x', '\\u').encode('utf-8').decode('unicode_escape') dict_[k] = v html = etree.HTML(text) nums = html.xpath("//div[@class='col-md-1']/text()") import re for num in nums: list_ = [] list_2 = [] num = re.findall(r"[\u4e00-\u9fa5]+", num) for n in num[0]: s.add(dict_[n]) list_2.append(dict_[n]) list_3.append(dict_[n]) list_.append(d[dict_[n]])
# # 从网络上抓取网页源代码,然后获取code->name->文字形状 headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36' } resp = requests.get("https://cs.58.com/chuzu/", headers=headers) text = resp.text # 抓取网页的字体文件 font_face = re.search(r"@font-face.+base64,(.+?)'\)", text).group(1) # 保存到内存中 font_bytes = io.BytesIO(base64.b64decode(font_face)) currentFont = TTFont(font_bytes) # code->name # 获取字体的code和name的映射 codeNameMap = currentFont.getBestCmap() # 获取当前网页字体的所有字体的形状 currentGlyf = currentFont['glyf'] # 循环code和name的映射 for code, name in codeNameMap.items(): # 先获取当前网页,某个name下的形状 currentShape = currentGlyf[name] # 循环字典,并找到映射对网页进行替换并保存 for number, shape in baseFontMap.items(): if currentShape == shape: # 得到的code是十进制,这里转为十六进制进行处理 webcode = str(hex(code)).replace("0", "&#", 1) + ";" text = re.sub(webcode, str(number), text) # with open("58_1.html", 'w', encoding='utf-8') as fp: # fp.write(text)
def parse(self, response):
    """Decode a forum post whose text is obfuscated with a custom web font.

    The font referenced by the page is downloaded to ``online_qc.ttf`` and
    its glyph outlines are compared with those of the reference font
    ``qiche.ttf``, whose glyph names have known characters. Every private
    code point found in the post text is then replaced by the real
    character, and the decoded text is printed.

    Args:
        response: Page response exposing ``xpath()`` and ``text``.

    Raises:
        AttributeError: If the page contains no ``.ttf`` font URL.
        IndexError: If the post body node is missing.
        KeyError: If a matching reference glyph is absent from the table.
    """
    text = response.xpath('//div[@class="conttxt"]/div[1]').xpath(
        'string(.)').extract()[0]
    print(text)
    font_url = 'https:' + re.search(r",url\('(.*\.ttf)'\)", response.text, re.S).group(1)

    with open('online_qc.ttf', 'wb') as f:
        f.write(requests.get(font_url).content)

    base_font = TTFont('qiche.ttf')
    # The first entry of each glyph list is dropped before comparing.
    base_uni = base_font.getGlyphOrder()[1:]
    print('base_uni', base_uni)
    online_font = TTFont('online_qc.ttf')
    online_uni = online_font.getGlyphNames()[1:]
    print('online_uni', online_uni)
    bm = online_font.getBestCmap()
    print('bm', bm)

    # Reference font glyph name -> character that glyph draws.
    dict_font = {
        'uniEC1B': '八', 'uniEC6D': '大', 'uniEDAE': '右', 'uniECFA': '十',
        'uniED4C': '呢', 'uniEC99': '四', 'uniECEB': '小', 'uniEC37': '好',
        'uniED78': '三', 'uniEDCA': '是', 'uniED16': '短', 'uniEC63': '五',
        'uniECB5': '下', 'uniEDF5': '少', 'uniEC53': '近', 'uniED94': '长',
        'uniECE0': '地', 'uniED32': '多', 'uniEC7F': '更', 'uniEDBF': '左',
        'uniEC1D': '不', 'uniED5E': '矮', 'uniEDAF': '和', 'uniECFC': '高',
        'uniEC49': '一', 'uniEC9A': '很', 'uniEDDB': '的', 'uniED28': '六',
        'uniED79': '得', 'uniECC6': '七', 'uniED18': '坏', 'uniEC64': '着',
        'uniEDA5': '九', 'uniEDF7': '上', 'uniED43': '远', 'uniEC90': '低',
        'uniECE2': '了', 'uniEC2E': '二'
    }

    base_glyf = base_font['glyf']
    online_glyf = online_font['glyf']

    # Obfuscated character (as it appears in the page text) -> real character.
    temp = {}
    for bs_uni in base_uni:
        base_obj = base_glyf[bs_uni]
        for ol_uni in online_uni:
            if base_obj != online_glyf[ol_uni]:
                continue
            try:
                # Glyph names look like "uniEC1B"; the hex part after the
                # "uni" prefix is the code point used in the page.
                obfuscated_char = chr(int(ol_uni[3:], 16))
            except ValueError:
                # Not a "uni<hex>" name, so no code point can be derived.
                continue
            temp[obfuscated_char] = dict_font[bs_uni]
    print(temp)

    # Replace whatever was matched; a fixed count of 38 raised IndexError
    # whenever fewer glyphs matched.
    for obfuscated_char, real_char in temp.items():
        text = text.replace(obfuscated_char, real_char)
    print(text)
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36", } r = requests.get(url, headers=headers) with open("./font.woff", "wb") as f: f.write(r.content) url = "https://static.tianyancha.com/fonts-styles/fonts/b1/b17d9d87/tyc-num.ttf" headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36", } r = requests.get(url, headers=headers) with open("./font.ttf", "wb") as f: f.write(r.content) online_fonts = TTFont('font.woff') online_fonts.saveXML("text.xml") _dict = online_fonts.getBestCmap() print("字典:", _dict) # online_fonts = TTFont('font.tff') # # online_fonts.saveXML("text.xml") # # _dict = online_fonts.getBestCmap() # # print("字典:", _dict)
def parse(self, response):
    """Parse one listing page into a ``HouseItem`` and follow the detail links.

    Titles, room descriptions and prices are pulled out of the raw HTML
    with regular expressions. The font embedded in the page as base64 is
    written to ``font.woff`` and its best cmap is stored in ``self.dict``
    before the ``convert_*`` helpers are applied to every extracted string.

    Yields:
        One ``Request`` per detail URL (callback ``self.parse_detail``),
        followed by the populated ``HouseItem``.
    """
    title_urls = response.css(
        'div.des h2 a.strongbox::attr(href)').extract()
    res_html = response.text
    title = r'(?:target="_blank" |target="_blank" rel="nofollow" )>(.*?)</a>'
    titles = re.findall(title, res_html, re.S | re.M)
    room = r'<p class="room">(.*?)</p>'
    rooms = re.findall(room, res_html, re.S | re.M)
    money = r'<b class="strongbox">(.*?)</b>'
    moneys = re.findall(money, res_html, re.S | re.M)
    # Clean up the extracted strings
    # NOTE(review): the whitespace inside the ' ' literals below may have
    # been altered when this file was reformatted. As written, every space
    # is stripped from `rooms`, after which split(" ", 1)[1] cannot succeed
    # -- verify the literals against the original source.
    titles = [c.replace('\n', '') for c in titles]
    titles = [c.replace(' ', '') for c in titles]
    rooms = [c.replace(' ', '') for c in rooms]
    rooms = [c.replace(' ', ' ') for c in rooms]
    sizes = []
    looks = []
    # Split each room string at the first space: layout first, size second
    for i in range(0, len(rooms)):
        looks.append(rooms[i].split(" ", 1)[0])
        sizes.append(rooms[i].split(" ", 1)[1])
    sizes = [c.replace(' ', '') for c in sizes]
    sizes = [c.replace('\n', '') for c in sizes]
    font_pattern = r"base64,(.*?)format"
    font_base64 = re.findall(font_pattern, res_html, re.S | re.M)
    # Decode the digits
    # The last three characters of the captured text are dropped before decoding
    str_base64 = font_base64[0][:-3]
    bin_data = base64.decodebytes(str_base64.encode())
    with open("font.woff", r"wb") as f:
        f.write(bin_data)
    onlineFonts = TTFont('font.woff')
    # Stored on the instance; presumably read by the convert_* helpers -- confirm
    self.dict = onlineFonts.getBestCmap()
    # Resolve the obfuscated digits
    for i in range(len(titles)):
        titles[i] = self.convert_title_room(titles[i])
    for i in range(len(rooms)):
        looks[i] = self.convert_title_room(looks[i])
    for i in range(len(rooms)):
        sizes[i] = self.convert_title_room(sizes[i])
    for i in range(len(moneys)):
        moneys[i] = self.convert_money(moneys[i])
    house_item = HouseItem()
    # Each field holds a single-element list wrapping the full per-page list
    house_item["title_urls"] = [title_urls]
    house_item["titles"] = [titles]
    house_item["moneys"] = [moneys]
    house_item["looks"] = [looks]
    house_item["sizes"] = [sizes]
    # Follow the detail pages
    for title_url in title_urls:
        yield Request(url=title_url, callback=self.parse_detail)
    # Next page
    # next_urls = response.css('div.pager a.next::attr(href)').extract()
    # if next_urls:
    #     yield Request(url=next_urls, callback=self.parse)
    yield house_item
def do_build(opt):
    """Generate six cmap variants of ``opt['src']`` and bundle them with otf2otc.

    The source font's best cmap is folded into per-code-point rows of six
    glyph names, one per output font. Which glyph fills each slot depends
    on the code point's East Asian Width class. Each variant is saved to
    ``opt['ttf'][i]`` with the metadata from ``opt['font'][i]``, and the
    saved files are finally combined into ``opt['dst']``.

    Args:
        opt: Mapping with the keys ``src``, ``dst``, ``font`` and ``ttf``.
    """
    PBAR_desc('prepare', opt['dst'])
    font = TTFont(opt['src'])

    lookups = font['GSUB'].table.LookupList.Lookup
    fwid = lookups[0].SubTable[0].mapping
    hwid = lookups[1].SubTable[0].mapping

    # Low 16 bits of the code point -> six glyph-name slots. While reading
    # the source only the first three are filled:
    #   0 : src half normal
    #   1 : src half italic
    #   2 : src full normal
    glyph_map = {}
    for code, name in font.getBestCmap().items():
        if code < 0xF0000:
            slot = 0
        elif code < 0x100000:
            slot = 1
        else:
            slot = 2
        glyph_map.setdefault(code & 0xFFFF, [None] * 6)[slot] = name

    for code, row in glyph_map.items():
        eaw = unicodedata.east_asian_width(chr(code))
        norm = row[0] or row[1]
        ital = row[1] or row[0]
        full = row[2]
        if norm and full:
            # Record the half <-> full pairing in both substitution maps.
            fwid[norm] = full
            hwid[full] = norm
        if eaw in ('H', 'Na'):
            layout = (norm, ital, norm, ital, norm, ital)
        elif eaw in ('F', 'W'):
            layout = (full, full, full, full, full, full)
        elif eaw == 'N':
            layout = (norm, ital, norm, ital, full, full)
        elif eaw == 'A':
            layout = (norm, ital, full, full, full, full)
        else:
            layout = None
        if layout is not None:
            row[:] = layout

    # One code -> glyph mapping per output font; empty slots are left out.
    maps = [
        {code: row[slot] for code, row in glyph_map.items() if row[slot]}
        for slot in range(6)
    ]

    os2 = font['OS/2']
    os2.xAvgCharWidth = 1024
    os2.panose.bProportion = 9
    os2.ulCodePageRange1 |= 0x00020000
    os2.ulCodePageRange1 ^= 0x00000004
    os2.ulCodePageRange2 ^= 0x00020000
    os2.ulUnicodeRange3 ^= 0x04C00000
    font['post'].isFixedPitch = 1
    del font['FFTM']
    del font['GPOS']
    PBAR.update(1)

    for i, i_map in enumerate(maps):
        PBAR_desc('generate', f'{i}.ttf')
        i_opt = opt['font'][i]

        cmap_table = font['cmap']
        full_cmap = cmap_table.getcmap(3, 10).cmap
        full_cmap.clear()
        base_cmap = cmap_table.getcmap(3, 1).cmap
        base_cmap.clear()
        for code, name in i_map.items():
            full_cmap[code] = name
            if code <= 0xFFFF:
                base_cmap[code] = name

        font['head'].macStyle = i_opt['macStyle']
        font['post'].italicAngle = i_opt['italicAngle']
        os2.fsSelection = i_opt['fsSelection']
        os2.usWeightClass = i_opt['usWeightClass']
        os2.panose.bWeight = i_opt['panoseWeight']
        os2.panose.bLetterForm = i_opt['panoseLetterForm']

        # Name records whose ID is a key of the variant options are overridden.
        for record in font['name'].names:
            if record.nameID in i_opt:
                record.string = i_opt[record.nameID]

        font.save(opt['ttf'][i])
        PBAR.update(1)

    PBAR_desc('otf2otc', opt['dst'])
    command = ['otf2otc', '-o', opt['dst']] + opt['ttf']
    run(command, stdout=DEVNULL)
    PBAR.update(1)
# # print(my_dic) map_str_2_number = { 'period': '.', 'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9 } font_value = font.getBestCmap() print(font_value) for key in font_value.keys(): font_value[key] = map_str_2_number[font_value[key]] print(font_value) for key, value in font_value.items(): text = text.replace("&#" + str(key) + ";", str(value)) with open('2.html', 'w') as f: f.write(text)
from fontTools.ttLib import TTCollection, TTFont
from fontTools.unicode import Unicode
from itertools import chain
import json

if __name__ == "__main__":
    # Characters the font can render, taken from its best cmap.
    font = TTFont(
        'D:/work/git/CycleGan-handwriting_generation/data/fonts/simhei.ttf')
    cmap = font.getBestCmap()
    list_char_of_font = {chr(codepoint) for codepoint in cmap}

    # Characters that occur in any label text of the handwriting data set.
    list_char_of_hw = set()
    with open('labels.json', encoding='utf-8') as label_file:
        json_load = json.load(label_file)
        for key in json_load:
            list_char_of_hw.update(json_load[key])

    print('len of font: ', len(list_char_of_font))
    print('len of hw: ', len(list_char_of_hw))

    # Characters present in the font but never used by a label.
    result = list_char_of_font.difference(list_char_of_hw)
    print('len list character miss: ', len(result))
def get_novelcontent(self, response):
    """Decode the font-obfuscated counters of a novel page and yield the item.

    The page is fetched again with ``requests`` to find the URL of the
    per-request web font. That font is saved to ``./font/qidian.woff`` and
    its cmap, together with a glyph-name -> digit table, is passed to
    ``self.decode`` for each obfuscated ``<span>``.

    Args:
        response: Page response carrying ``targentcontent`` (the item being
            filled) and ``url`` in ``response.meta``.

    Yields:
        ``targentcontent`` with ``serialnumber``, ``click_num_total``,
        ``click_num_month`` and ``collect_num_total`` set. Nothing is
        yielded when the page has fewer than four obfuscated spans.

    Raises:
        IndexError: If no font URL of the expected form is found.
    """
    targentcontent = response.meta['targentcontent']
    unicodePage = response.body.decode('utf-8')
    url = response.meta['url']

    # Fetch the page content
    r = requests.get(url)

    # Locate the randomly generated font URL, e.g.
    # https://qidian.gtimg.com/qd_anti_spider/woqFfmqF.woff
    # (raw string: the pattern contains "\(" escapes)
    cmp = re.compile(r"url\('(//.*.woff)'\) format\('woff'\)")
    rst = cmp.findall(r.text)
    fontUrl = str(rst[0]).split('\'')[8]
    fontName = fontUrl.split('/')[4].split(".")[0]

    ttf = requests.get(fontUrl, stream=True)
    with open("./font/qidian.woff", "wb") as font_file:
        for chunk in ttf.iter_content(chunk_size=1024):
            if chunk:
                font_file.write(chunk)

    # Parse the downloaded font
    font = TTFont('./font/qidian.woff')
    # e.g. {100064: 'one', 100065: 'six', ..., 100072: 'period'}
    cmap = font.getBestCmap()

    # getGlyphOrder() lists the glyphs in numeric order, e.g.
    # ['.notdef', 'period', 'zero', 'one', ..., 'nine']; dropping the first
    # two entries makes each remaining glyph's index equal to its digit.
    glyphs = font.getGlyphOrder()[2:]
    tmp_dic = {}
    for num, un_size in enumerate(glyphs):
        font_uni = un_size.replace('uni', '0x').lower()
        tmp_dic[font_uni] = num
    tmp_dic['period'] = "."

    # The obfuscated values sit in <span class="<fontName>"> elements.
    numbers = re.findall(
        r'<span class="' + re.escape(fontName) + '">(.*?)</span>',
        unicodePage, re.S)
    if len(numbers) < 4:
        # findall() never returns None, so the former "is not None" test
        # could not protect the indexing below against a short result.
        return

    (serialnumber,        # serial number
     click_num_total,     # all-time clicks
     click_num_month,     # clicks this month
     collect_num_total,   # all-time favourites
     ) = (self.decode(raw, cmap, tmp_dic) for raw in numbers[:4])

    targentcontent['serialnumber'] = int(serialnumber)
    targentcontent['click_num_total'] = int(click_num_total)
    targentcontent['click_num_month'] = int(click_num_month) * 4
    targentcontent['collect_num_total'] = int(collect_num_total)
    yield targentcontent
class ParseTTFFont:
    """Map the code points of an obfuscated TTF font to real characters.

    Glyphs are identified by their ``endPtsOfContours`` signature (ties are
    broken by the average of their coordinates) and matched against a
    per-project JSON table. When no table exists yet, it is built by
    rendering the glyphs to an image and sending it to the Baidu OCR API.
    """

    baidu = Baidu(access_token=ACCESS_TOKEN)

    def __init__(self, font, ignore_names=None, overwrite_ignore=False):
        """Load *font* and set up the list of glyph names to ignore.

        Args:
            font: Path to a font file (``str``) or raw font data (``bytes``).
            ignore_names: Extra glyph names to skip.
            overwrite_ignore: When true, *ignore_names* replaces
                ``IGNORE_NAMES`` instead of extending it.

        Raises:
            ValueError: If *font* is neither ``str`` nor ``bytes``.
        """
        if isinstance(font, str):
            self.font = TTFont(font)
        elif isinstance(font, bytes):
            self.font = TTFont(BytesIO(font))
        else:
            raise ValueError('unknown font type')
        self.glyphnames = self.font.getGlyphOrder()
        # A fresh list per instance; a shared mutable default must not
        # become instance state.
        if ignore_names is None:
            ignore_names = []
        self.ignore_names = ignore_names if overwrite_ignore else ignore_names + IGNORE_NAMES

    def _is_skipped(self, glyphname):
        """Tell whether *glyphname* is excluded ('.'/'g' prefix or ignore list)."""
        # Skips '.notdef', '.null' and similar names
        return glyphname[0] in ['.', 'g'] or glyphname in self.ignore_names

    @staticmethod
    def _group_by_contours(items):
        """Group glyph records by their ``endPtsOfContours`` signature."""
        groups = {}
        for item in items:
            groups.setdefault(item.get("endPtsOfContours"), []).append(item)
        return groups

    def _glyph_record(self, glyphname):
        """Return the coordinates and contour signature of *glyphname*."""
        glyph = self.font['glyf'][glyphname]
        return {
            "coordinates": glyph.coordinates._a.tolist(),
            # str(endPtsOfContours), base64-encoded, serves as the glyph's ID
            "endPtsOfContours": base64.b64encode(
                str(glyph.endPtsOfContours).encode("utf-8")).decode("utf-8"),
        }

    def parse_fonts(self, project):
        """Resolve every mapped code point of the font to a character.

        The reference table is loaded from ``font_jsons/<project>.json``
        next to this module. For a new project it is generated first and
        saved there.

        Args:
            project: Name of the project the font belongs to.

        Returns:
            Dict mapping ``hex(code point)`` to the recognised character
            (``None`` for glyphs that could not be resolved).
        """
        project = project + ".json"
        json_path = os.path.join(os.path.dirname(__file__), "font_jsons", project)
        if os.path.exists(json_path):
            with open(json_path, "r", encoding="utf-8") as f:
                font_json = json.load(f)
        else:
            font_json = self.get_fonts_by_orc()
            logging.info(f"结果json保存路径为:{json_path}")
            # Save the generated table
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(font_json, f)

        result = {}
        groups = self._group_by_contours(self.get_font_message())
        for item_list in groups.values():
            if len(item_list) == 1:
                # The contour signature alone identifies the glyph
                only = item_list[0]
                result[only.get("glyphname")] = font_json.get(only.get("endPtsOfContours"))
            else:
                # Several glyphs share a signature: order them by average
                # coordinate and pair them positionally with the table
                # (this can misidentify glyphs in unlucky cases)
                item_list.sort(key=lambda item: self.avg(item["coordinates"]))
                for index, item in enumerate(item_list):
                    result[item.get("glyphname")] = font_json.get(item.get("endPtsOfContours"))[index]

        cmaps = self.font.getBestCmap()
        return {hex(cmap_id): result.get(glyname) for cmap_id, glyname in cmaps.items()}

    def get_fonts_by_orc(self):
        """Build the reference table from this font via OCR.

        Returns:
            Dict mapping each contour signature to the recognised
            character, or to a list of characters (ordered by average
            coordinate) when several glyphs share the signature.
        """
        result = {}
        groups = self._group_by_contours(self.get_coordinate_matrix_and_value())
        for endPtsOfContours, item_list in groups.items():
            if len(item_list) == 1:
                result[endPtsOfContours] = item_list[0].get("value")
            else:
                item_list.sort(key=lambda item: self.avg(item["coordinates"]))
                result[endPtsOfContours] = [item.get("value") for item in item_list]
        return result

    def accurate_basic(self):
        """Recognise the rendered glyphs with the Baidu OCR API.

        In DEBUG mode, glyphs that were not recognised are shown one by one
        and the character is requested on the console.

        Returns:
            Dict mapping glyph names to recognised characters.
        """
        word_list = []
        image, name_list, image_dict = self.ttf_to_image()
        response = self.baidu.accurate_basic_of_pillow(image)
        payload = response.json()
        print(payload)
        for entry in payload.get("words_result"):
            word_list.extend(entry.get("words"))
        logging.info(f"百度识图结果:{word_list}")
        words = dict(zip(name_list, word_list))
        # Compare the lengths by value; an identity test on ints is only
        # accidentally correct for small numbers.
        if len(word_list) != len(name_list):
            # Some glyphs were not recognised: a few can be filled in by
            # hand, a large number cannot
            warnings.warn("words length is not equal to gly length,")
            if DEBGU:
                # Outside debug mode the unrecognised glyphs are ignored
                for glyname, faild in self.get_orc_faild_font(words, name_list, image_dict).items():
                    faild.show()
                    word = input("请输入图片中显示的文字:")
                    words[glyname] = word
        return words

    def get_coordinate_matrix_and_value(self):
        """Return one record per OCR-recognised glyph, including its character."""
        words = self.accurate_basic()
        fonts_coordinate_matrix = []
        for glyphname, word in words.items():
            if self._is_skipped(glyphname):
                continue
            item = self._glyph_record(glyphname)
            item["value"] = word
            fonts_coordinate_matrix.append(item)
        return fonts_coordinate_matrix

    def get_font_message(self):
        """Return one record per glyph of the font.

        Each record holds:
            coordinates: all x/y coordinates of the glyph in fixed order,
                ``[x, y, x1, y1, x2, y2, ...]`` (even positions are x, odd
                positions are y).
            endPtsOfContours: the glyph's contour end-point list, e.g.
                ``[3, 9]`` for two contours covering points 0-3 and 4-9,
                stringified and base64-encoded to serve as an ID.
            glyphname: the glyph's name.
        """
        fonts_coordinate_matrix = []  # result set
        for glyphname in self.glyphnames:  # every glyph in the font, by name
            if self._is_skipped(glyphname):
                continue
            item = self._glyph_record(glyphname)
            item["glyphname"] = glyphname
            fonts_coordinate_matrix.append(item)
        if DEBGU:
            logging.debug(msg=fonts_coordinate_matrix)
        return fonts_coordinate_matrix

    def ttf_to_image(self):
        """Render the font's glyphs onto a single grid image.

        Returns:
            Tuple ``(image, name_list, image_dict)``: the grid image, the
            names of the rendered glyphs in drawing order, and a dict from
            glyph name to that glyph's individual image.
        """
        glyphset = self.font.getGlyphSet()
        size = (BASE_BACKGOUND_WIDTH * FONT_NUMS_PER_LINE,
                ceil(len(self.glyphnames) / FONT_NUMS_PER_LINE) * BASE_BACKGOUND_HEIGHT)  # background size
        image = Image.new("RGB", size=size, color=(255, 255, 255))  # blank background
        name_list, image_dict = [], {}
        for index, glyphname in enumerate(self.glyphnames):
            if self._is_skipped(glyphname):
                continue
            g = glyphset[glyphname]
            # The pen resolves composite glyphs through the glyph set, so it
            # needs the set itself rather than the list of glyph names.
            pen = ReportLabPen(glyphset, Path(fillColor=colors.black, strokeWidth=1))
            g.draw(pen)
            # Square canvas, at least 1000 units per side
            side = g.width if g.width > 1000 else 1000
            group = Group(pen.path)
            group.translate(0, 200)
            d = Drawing(side, side)
            d.add(group)
            im = renderPM.drawToPIL(d, dpi=72).resize((FONT_WIDTH, FONT_HEIGHT))
            box = ((index % FONT_NUMS_PER_LINE) * BASE_BACKGOUND_WIDTH,
                   (index // FONT_NUMS_PER_LINE) * BASE_BACKGOUND_HEIGHT)
            image.paste(im, box=box)
            name_list.append(glyphname)
            image_dict[glyphname] = im
        return image, name_list, image_dict

    @staticmethod
    def get_orc_faild_font(words, name_list, image_dict):
        """Return ``{glyph name: image}`` for glyphs without a recognised character."""
        return {
            glypname: image_dict.get(glypname)
            for glypname in name_list
            if not words.get(glypname)
        }

    @staticmethod
    def avg(alist):
        """Return the floor of the arithmetic mean of *alist*."""
        return sum(alist) // len(alist)
# Download the font referenced by the page and save it as base.ttf
url = 'https://www.shixiseng.com' + url[0]
res = session.get(url).content
with open('base.ttf', 'wb') as f:
    f.write(res)
# font_str = re.findall(r";base64,(.*?)'\)", res, re.S)[0]
# base_font = make_font_file(font_str, 'base')
# Real characters, paired by position with the font's code points below
old = [
    '一', '师', '会', '四', '计', '财', '场', '聘', '招', '工', '周', '端',
    '年', '设', '程', '二', '五', '天', '前', '网', '广', '市', '月', '个',
    '告', '作', '三', '互', '生', '人', '政', '件', '行', '软', '银', '联',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D',
    'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
    'u', 'v', 'w', 'x', 'y', 'z'
]
new_font = TTFont('base.ttf')
new_font.saveXML('base.xml')
keys = new_font.getBestCmap()
# Hex string of every code point, in the cmap's iteration order
new_map = []
for key in keys:
    key = hex(key)
    new_map.append(key)
# Drop the first code point and turn the last four characters of each hex
# string into an entity prefix of the form "&#xXXXX"
new_map = ['&#x%s' % i[-4:] for i in new_map[1:]]
# Pair entity i with old[i]
# NOTE(review): raises IndexError if the font maps more code points than
# `old` has entries -- confirm the two always line up
items = [(new_map[i], old[i]) for i in range(len(new_map))]
for i in range(len(items)):
    html = html.replace(items[i][0], items[i][1])
print(html)
class MyFont:
    """Load a font and expose its glyph outlines as lines and Bezier curves."""

    def __init__(self, font_path):
        font = TTFont(font_path)
        self.font = font
        self.glyph_set = font.getGlyphSet()
        self.cmap = font.getBestCmap()

    def getGlyph(self, char):
        """
        Look up the glyph for a character in the font.
        --- Parameters ---
        char : the target character (a single character or its code point)
        --- Return ---
        the glyph object
        """
        codepoint = char if isinstance(char, int) else ord(char)
        return self.glyph_set[self.cmap[codepoint]]

    def getVectorControl(self, char):
        """
        Record the drawing commands (control points) of a character's glyph.
        --- Parameters ---
        char : the target character
        --- Return ---
        the recorded pen commands
        """
        pen = DecomposingRecordingPen(self.glyph_set)
        self.getGlyph(char).draw(pen)
        return pen.value

    def control2Lines(self, control, log=True):
        """
        Convert recorded pen commands into Bezier curves and straight lines.
        --- Parameters ---
        control : pen commands obtained from getVectorControl()
        log : print every command while converting
        --- Return ---
        lines : list of Bezier and PlaneLine objects
        """
        contour_start = None
        current = None
        segments = []
        for command in control:
            if log:
                print(command)
            op = command[0]
            if op == "closePath":
                segments.append(PlaneLine([contour_start, current]))
            elif op == "moveTo":
                ## A new contour begins: remember where closePath must return
                current = command[1][0]
                contour_start = command[1][0]
            elif op in ("qCurveTo", "curveTo"):
                curve = Bezier([current] + list(command[1]))
                segments.append(curve)
                current = tuple(curve.plist[-1])
            elif op == "lineTo":
                segments.append(PlaneLine([current, command[1][0]]))
                current = command[1][0]
            elif op == "addComponent":
                print("[ERROR] 'addComponent' is not implemented!")
                print("Please use 'DecomposingRecordingPen'")
            else:
                print("[ERROR] Unknown command: ", command[0])
        return segments

    def draw(self, char, control_path=False, show=True):
        """
        Plot the given character.
        --- Parameters ---
        char : the character to plot
        control_path : also plot the control points
        show : call plt.show() when done
        --- Return ---
        the axes returned by the last plot call (None if nothing was drawn)
        """
        segments = self.control2Lines(self.getVectorControl(char), False)
        ax = None
        for segment in segments:
            if isinstance(segment, Bezier):
                ax = segment.plot(ax, control_path, resolution=20)
            elif isinstance(segment, PlaneLine):
                ax = segment.plot(ax, linestyle="-", color="black")
        if show:
            plt.show()
        return ax

    def make_lines(self, base_line, div_num):
        """
        Create div_num radial lines by rotating the base line about its start.
        --- Parameters ---
        base_line : the reference line (PlaneLine object)
        div_num : number of lines to create
        --- Return ---
        lines : list of PlaneLine objects
        """
        step_deg = 360 / div_num
        return [
            base_line.translation()
            .rotate(np.deg2rad(step_deg * k))
            .translation(base_line.plist[0])
            for k in range(div_num)
        ]

    def fetch_distance_vectors(self, char="a", line_num=32, debug=False):
        """
        For each radial line from the glyph centre, find the farthest and the
        nearest intersection with the glyph outline.
        --- Parameters ---
        char : the character to examine
        line_num : number of radial lines to use
        debug : plot the glyph, the radial lines and the selected points
        --- Return ---
        selected_point_max : coordinates of the farthest intersections
        selected_point_min : coordinates of the nearest intersections
        r_max : distances from the centre to the farthest intersections
        r_min : distances from the centre to the nearest intersections
        """
        outline = self.control2Lines(self.getVectorControl(char), log=False)

        ## Bounding box of the glyph (used to size the radial lines)
        xs, ys = zip(*[pt for segment in outline for pt in segment.plist])
        box_min = Point(min(xs), min(ys))
        box_max = Point(max(xs), max(ys))

        if debug:
            ## Plot the glyph (for debug)
            ax = self.draw(char, show=False)

        ## Centre of the bounding box
        gp = Point(box_min.x + (box_max.x - box_min.x) / 2,
                   box_min.y + (box_max.y - box_min.y) / 2)

        ## Reference line starting at the centre
        base_line = PlaneLine(
            [gp.point, (gp.x + max(box_max.x, box_max.y), gp.y)])
        radial_lines = self.make_lines(base_line, line_num)

        if debug:
            for ray in radial_lines:
                ray.plot(ax, color="gray")

        ## Collect the intersections of every radial line with the outline
        hits_per_ray = []
        bc = BezierClipping()
        for ray in radial_lines:
            hits = []
            for segment in outline:
                if isinstance(segment, Bezier):
                    result = bc.detect_intersection(segment, ray)
                    if result == []:
                        continue
                    _, found = zip(*result)
                    hits.extend(found)
                elif isinstance(segment, PlaneLine):
                    hit = segment.intersection(ray)
                    if hit is None:
                        continue
                    hits.append(hit)
            hits_per_ray.append(hits)

        ## Keep only the farthest and the nearest hit of each radial line
        selected_point_max = []
        selected_point_min = []
        for hits, ray in zip(hits_per_ray, radial_lines):
            far_len = 0
            near_len = ray.length
            far_pt = gp
            near_pt = gp
            for hit in hits:
                dist = PlaneLine([gp, hit]).length
                if far_len < dist:
                    far_pt = hit
                    far_len = dist
                if near_len > dist:
                    near_pt = hit
                    near_len = dist
            selected_point_max.append(far_pt.point)
            selected_point_min.append(near_pt.point)

        if debug:
            ## Plot the selected farthest and nearest points (for debug)
            xs, ys = zip(*selected_point_max)
            ax.plot(xs, ys, 'o', color="red")
            xs, ys = zip(*selected_point_min)
            ax.plot(xs, ys, '.', color="blue")

        ## Distances from the centre to the selected points
        r_max = [PlaneLine([gp.point, pt]).length for pt in selected_point_max]
        r_min = [PlaneLine([gp.point, pt]).length for pt in selected_point_min]
        return selected_point_max, selected_point_min, r_max, r_min
'six': 6, 'seven': 7, 'eight': 8, 'nine': 9 } # 专门用于读取字体 # 1. 视同 ttfont 读取字体 base_font = TTFont('OqcBUBPX.woff') # 2. 把字体文件保存为 xml 格式 base_font.saveXML('font.xml') map_order = base_font.getGlyphOrder() print(map_order) # 获取字体的映射规则(特殊字符->应该显示的字符) map_list = base_font.getBestCmap() print(map_list) # 构建一个可以替换的规则 for key in map_list.keys(): # map_list[key] 取到'period', 然后对'period'重新赋值 # map_str_2_number['period'] 取到. map_list[key] = map_str_2_number[map_list[key]] print(map_list) with open('替换之前的.html', mode='r', encoding='utf-8') as f: text = f.read() for key, value in map_list.items(): text = text.replace('&#' + str(key) + ";", str(value))
def obfuscate_plus(plain_text, filename: str, only_ttf: bool, target_path: str = 'output'):
    """
    Build a font whose private-use code points render the glyphs of
    ``plain_text``.

    Each distinct character of ``plain_text`` is assigned a random code point
    from the Private Use Area and a random CJK "shadow" glyph name taken from
    the base font; the shadow glyph is drawn with the outline and metrics of
    the plain character.

    :param plain_text: the content the user sees
    :param filename: file name without the format suffix
    :param only_ttf: when true only the ``.ttf`` is produced; otherwise the
        woff/woff2 variants are generated as well
    :param target_path: output directory (relative to ``root``)
    :return: when ``only_ttf`` is true, a dict ``{'ttf': path}``; otherwise a
        tuple ``(paths, mapping)`` where ``paths`` also holds the entries
        returned by ``subset_ttf_font`` and ``mapping`` maps each plain
        character to its HTML entity prefix (``&#x....``, no trailing ``;``).
        The two shapes are kept as they were for caller compatibility.
    :raises Exception: if ``plain_text`` contains whitespace or emoji, or the
        base font lacks one of its characters
    :raises ValueError: if the base font maps fewer CJK code points than there
        are distinct characters in ``plain_text``
    """
    if str_has_whitespace(plain_text):
        raise Exception('明文不允许含有空格')

    if str_has_emoji(plain_text):
        raise Exception('明文不允许含有emoji')

    plain_text = deduplicate_str(plain_text)

    original_font = TTFont(root / BASE_FONT_FILE)
    # https://github.com/fonttools/fonttools/blob/4.0.1/Lib/fontTools/fontBuilder.py#L28
    # <class 'dict'>: {32: 'cid00001', 33: 'cid00002', 34: 'cid00003'...}
    # Keys are ord(character).
    original_cmap: dict = original_font.getBestCmap()

    ensure_cmap_has_all_text(original_cmap, plain_text)

    glyphs, metrics, cmap = {}, {}, {}

    # Unicode Private Use Area: these are the code points the page will use.
    # https://zh.wikipedia.org/wiki/Unicode%E5%AD%97%E7%AC%A6%E5%B9%B3%E9%9D%A2%E6%98%A0%E5%B0%84
    private_codes = random.sample(range(0xE000, 0xF8FF), len(plain_text))

    # Shadow glyph names come from CJK code points. Only code points that the
    # base font actually maps are candidates, so the lookup below cannot fail.
    # (The previous implementation sampled blindly and retried by calling
    # itself with ``plain_text`` and ``filename`` swapped.)
    # https://www.zhangxinxu.com/study/201611/chinese-language-unicode-range.html
    available_cjk = [code for code in range(0x4E00, 0x9FA5) if code in original_cmap]
    cjk_codes = random.sample(available_cjk, len(plain_text))

    # https://github.com/fonttools/fonttools/blob/4.0.1/Tests/pens/ttGlyphPen_test.py#L21
    glyph_set = original_font.getGlyphSet()
    pen = TTGlyphPen(glyph_set)
    glyph_order = original_font.getGlyphOrder()
    hmtx = original_font['hmtx']

    final_shadow_text: list = []

    # Carry over the special glyphs when the base font has them.
    for special in ('null', '.notdef'):
        if special in glyph_order:
            glyph_set[special].draw(pen)
            glyphs[special] = pen.glyph()
            metrics[special] = hmtx[special]
            final_shadow_text.append(special)

    html_entities = []

    # In theory the order could be shuffled once more here.
    for plain, private_code, cjk_code in zip(plain_text, private_codes, cjk_codes):
        shadow_cmap_name = original_cmap[cjk_code]
        plain_glyph_name = original_cmap[ord(plain)]

        final_shadow_text.append(shadow_cmap_name)
        # The shadow glyph gets the outline and metrics of the plain character.
        glyph_set[plain_glyph_name].draw(pen)
        glyphs[shadow_cmap_name] = pen.glyph()
        metrics[shadow_cmap_name] = hmtx[plain_glyph_name]

        cmap[private_code] = shadow_cmap_name
        html_entities.append(f'&#x{private_code:x}')

    horizontal_header = {
        'ascent': original_font['hhea'].ascent,
        'descent': original_font['hhea'].descent,
    }

    fb = FontBuilder(original_font['head'].unitsPerEm, isTTF=True)
    fb.setupGlyphOrder(final_shadow_text)
    fb.setupCharacterMap(cmap)
    fb.setupGlyf(glyphs)
    fb.setupHorizontalMetrics(metrics)
    fb.setupHorizontalHeader(**horizontal_header)
    fb.setupNameTable(NAME_STRING)
    fb.setupOS2()
    fb.setupPost()
    fb.save(f'{root}/{target_path}/{filename}.ttf')

    result = dict()
    result['ttf'] = f'{root}/{target_path}/{filename}.ttf'

    if only_ttf:
        return result

    woff_and_woff2 = subset_ttf_font(f'{root}/{target_path}/{filename}')
    return {**result, **woff_and_woff2}, dict(zip(plain_text, html_entities))
def qd_Font_url(font_url):
    """Download the font at ``font_url`` and return its best cmap.

    The request is sent with the module-level ``headers``; the response body
    is parsed in memory and the mapping returned by ``getBestCmap()`` is
    handed back to the caller.
    """
    raw_font = requests.get(font_url, headers=headers).content
    # Parse the downloaded bytes directly instead of writing them to disk.
    return TTFont(io.BytesIO(raw_font)).getBestCmap()
def obfuscate(plain_text, shadow_text, filename: str, only_ttf: bool, target_path: str = 'output') -> dict:
    """
    Build a font in which the characters of ``shadow_text`` render as the
    corresponding characters of ``plain_text``.

    :param plain_text: the content the user sees
    :param shadow_text: the content a crawler sees
    :param filename: file name without the format suffix
    :param only_ttf: when true only the ``.ttf`` is produced; otherwise the
        woff/woff2 variants are generated as well
    :param target_path: output directory (relative to ``root``)
    :return: dict with the ``'ttf'`` path, plus the entries returned by
        ``subset_ttf_font`` when ``only_ttf`` is false
    :raises Exception: if either text contains whitespace or emoji, the
        de-duplicated texts are identical or differ in length, or the base
        font lacks a character of either text
    """
    if str_has_whitespace(plain_text) or str_has_whitespace(shadow_text):
        raise Exception('明文或阴书不允许含有空格')

    if str_has_emoji(plain_text) or str_has_emoji(shadow_text):
        raise Exception('明文或阴书不允许含有emoji')

    plain_text = deduplicate_str(plain_text)
    shadow_text = deduplicate_str(shadow_text)

    if plain_text == shadow_text:
        raise Exception('没有意义的混淆')

    if len(plain_text) != len(shadow_text):
        raise Exception('阴书的有效长度需与明文一致')

    original_font = TTFont(root / BASE_FONT_FILE)
    # https://github.com/fonttools/fonttools/blob/4.0.1/Lib/fontTools/fontBuilder.py#L28
    # <class 'dict'>: {32: 'cid00001', 33: 'cid00002', 34: 'cid00003'...}
    # Keys are ord(character).
    original_cmap: dict = original_font.getBestCmap()

    ensure_cmap_has_all_text(original_cmap, plain_text)
    # The shadow characters are looked up in the same cmap below, so they
    # are validated too instead of failing later with a bare KeyError.
    ensure_cmap_has_all_text(original_cmap, shadow_text)

    glyphs, metrics, cmap = {}, {}, {}

    # https://github.com/fonttools/fonttools/blob/4.0.1/Tests/pens/ttGlyphPen_test.py#L21
    glyph_set = original_font.getGlyphSet()
    pen = TTGlyphPen(glyph_set)
    glyph_order = original_font.getGlyphOrder()
    hmtx = original_font['hmtx']

    final_shadow_text: list = []

    # Carry over the special glyphs when the base font has them.
    for special in ('null', '.notdef'):
        if special in glyph_order:
            glyph_set[special].draw(pen)
            glyphs[special] = pen.glyph()
            metrics[special] = hmtx[special]
            final_shadow_text.append(special)

    for plain, shadow in zip(plain_text, shadow_text):
        shadow_cmap_name = original_cmap[ord(shadow)]
        plain_glyph_name = original_cmap[ord(plain)]

        final_shadow_text.append(shadow_cmap_name)
        # The shadow glyph gets the outline and metrics of the plain character.
        glyph_set[plain_glyph_name].draw(pen)
        glyphs[shadow_cmap_name] = pen.glyph()
        metrics[shadow_cmap_name] = hmtx[plain_glyph_name]

        cmap[ord(shadow)] = shadow_cmap_name

    horizontal_header = {
        'ascent': original_font['hhea'].ascent,
        'descent': original_font['hhea'].descent,
    }

    fb = FontBuilder(original_font['head'].unitsPerEm, isTTF=True)
    fb.setupGlyphOrder(final_shadow_text)
    fb.setupCharacterMap(cmap)
    fb.setupGlyf(glyphs)
    fb.setupHorizontalMetrics(metrics)
    fb.setupHorizontalHeader(**horizontal_header)
    fb.setupNameTable(NAME_STRING)
    fb.setupOS2()
    fb.setupPost()
    fb.save(f'{root}/{target_path}/{filename}.ttf')

    result = dict()
    result['ttf'] = f'{root}/{target_path}/{filename}.ttf'
    if not only_ttf:
        woff_and_woff2 = subset_ttf_font(f'{root}/{target_path}/{filename}')
        result = {**result, **woff_and_woff2}
    return result
class TycTTF():
    """Resolve characters rendered with a custom web font via OCR.

    The font is downloaded, each mapped character is rendered to an image
    and passed to ``AipClient``; results already present in Redis under the
    font key are reused. One instance is kept per ``font_key``.
    """

    # font_key -> instance, filled by __new__.
    _instance = {}

    def __init__(self, font_key, url=None, imgSize=(0, 0), imgMode='RGB',
                 bg_color=(0, 0, 0), fg_color=(255, 255, 255), fontsize=30):
        """
        :param font_key: key identifying the font; also the Redis hash name
        :param url: font URL; derived from ``font_key`` when omitted
        :param imgSize: image size; ``(0, 0)`` means "derive it from the
            rendered text" on the first render
        :param imgMode: PIL image mode
        :param bg_color: background colour of the rendered image
        :param fg_color: text colour of the rendered image
        :param fontsize: font size used for rendering
        """
        self.imgSize = imgSize
        self.imgMode = imgMode
        self.fontsize = fontsize
        self.bg_color = bg_color
        self.fg_color = fg_color
        self.font_key = font_key
        self.url = url or self.make_url
        # NOTE: __init__ runs on every construction, including when __new__
        # hands back a cached instance, so the font is fetched again.
        self.get_ttl()
        self.client = AipClient(APP_ID, API_KEY, SECRET_KEY, REDIS_URL)
        self.r = RedisClient(REDIS_URL)

    def __new__(cls, font_key, *args, **kw):
        '''
        Pseudo-singleton: one cached instance per font key.

        The first parameter mirrors ``__init__``'s ``font_key`` so that
        ``url`` can also be passed by keyword.
        '''
        if font_key not in cls._instance:
            cls._instance[font_key] = super().__new__(cls)
        return cls._instance[font_key]

    @property
    def make_url(self):
        """Default font URL built from ``font_key``."""
        return 'https://static.tianyancha.com/fonts-styles/fonts/%s/%s/tyc-num.woff' % (self.font_key[:2], self.font_key)

    def get_ttl(self):
        """Download the font and build the PIL font, the TTFont and the set
        of characters the font maps."""
        res = requests.get(self.url)
        # PIL font object
        self.font = ImageFont.truetype(BytesIO(res.content), self.fontsize)
        # fontTools font object
        self.ttf = TTFont(BytesIO(res.content))
        # Reverse lookup: every character the font maps. chr() handles any
        # code point, unlike building a \x / \u escape from its hex form.
        self.strings = {chr(codepoint) for codepoint in self.ttf.getBestCmap()}

    def GenLetterImage(self, letters: str):
        """Render ``letters`` centred on a fresh image stored in ``self.img``."""
        self.letters = letters
        # NOTE(review): ImageFont.getsize is not available in Pillow >= 10.
        (self.letterWidth, self.letterHeight) = self.font.getsize(letters)
        if self.imgSize == (0, 0):
            # Text size plus 10 pixels in each dimension.
            self.imgSize = (self.letterWidth + 10, self.letterHeight + 10)
        self.imgWidth, self.imgHeight = self.imgSize
        # New image object
        self.img = Image.new(self.imgMode, self.imgSize, self.bg_color)
        # Drawing object
        self.drawBrush = ImageDraw.Draw(self.img)
        textY0 = (self.imgHeight - self.letterHeight + 1) / 2
        textY0 = int(textY0)
        textX0 = int((self.imgWidth - self.letterWidth + 1) / 2)
        # Draw the text with the downloaded font onto the blank image.
        self.drawBrush.text((textX0, textY0), self.letters, fill=self.fg_color, font=self.font)

    def _orc(self, word: str):
        """Render ``word`` and send the JPEG bytes to the OCR client."""
        self.GenLetterImage(word)
        # Image byte container
        img = ImageBytes()
        # Write the image bytes into the container
        self.img.save(img, 'JPEG')
        if word in {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'x'}:
            # Digits are recognised with the English model
            kwarg = {'language_type': 'ENG'}
        else:
            # Everything else uses the Chinese/English model
            kwarg = {'language_type': 'CHN_ENG'}
        return self.client.run(img.img, self.font_key, word, **kwarg)

    def orc(self, word: str):
        """Return the cached recognition of ``word`` or run OCR for it.

        The existence check and the read use the same hash (``font_key``),
        which is also the key handed to the OCR client.
        """
        if self.r.hexists(self.font_key, word):
            return self.r.hget(self.font_key, word).decode('utf-8')
        return self._orc(word)

    def run(self, word: str):
        """Translate ``word``: characters mapped by the font are resolved via
        ``orc``; all others are kept unchanged."""
        return ''.join(
            self.orc(letter) if letter in self.strings else letter
            for letter in word
        )
def parse(self, response):
    """Parse a board page and yield one item per entry.

    The page's ``.woff`` font is downloaded to ``on_maoyan.woff`` and loaded
    next to the local reference font ``base_maoyan.woff``. For every
    ``board-item-content`` node the title is read and the obfuscated
    ``stonefont`` number is decoded entity by entity with ``pojie``.

    Yields dicts with the keys ``'title'`` and ``'p'`` (the decoded number
    as a string).
    """
    font_url = 'http:' + re.search(r"url\('(.*\.woff)'\)", response.text).group(1)
    with open('on_maoyan.woff', 'wb') as f:
        f.write(requests.get(font_url).content)

    # Reference font whose glyph -> digit mapping is known (base_dict).
    base_font = TTFont('base_maoyan.woff')
    # base_font.saveXML('base_maoyan.xml')
    # Glyph names of the reference font, without the first two entries.
    base_uni = base_font.getGlyphOrder()[2:]
    print('base_uni:', base_uni)
    base_obj = base_font.getGlyphNames()[1:-1]
    print('base_obj:', base_obj)
    base_dict = {
        'uniE5A1': '9', 'uniF2B5': '5', 'uniE3BD': '8', 'uniF48F': '1',
        'uniE6B8': '0', 'uniF03F': '2', 'uniEFB6': '6', 'uniF7EF': '7',
        'uniF822': '3', 'uniF14B': '4'
    }

    # Font served with this response.
    online_font = TTFont('on_maoyan.woff')
    # Dump it as XML so its structure can be inspected.
    online_font.saveXML('on_maoyan.xml')
    on_name = online_font.getBestCmap()
    print('on_name:', on_name)
    online_uni = online_font.getGlyphOrder()[2:]
    print('online_uni:', online_uni)
    online_obj = online_font.getGlyphNames()[1:-1]
    print('online_obj:', online_obj)

    selector = etree.HTML(response.body.decode('utf-8'))
    node_list = selector.xpath('//*[@class="board-item-content"]')
    print(node_list)
    for node in node_list:
        print(node)
        item = {}
        item['title'] = node.xpath('.//p[@class="name"]/a/text()')[0]

        a = node.xpath('.//p[@class="realtime"]/span/span/text()')
        rt = a[0]
        print('rt:', rt)
        node_html = etree.tostring(node)
        print(node_html)
        print('a:', a)
        print(type(a[0]))
        print(type(node_html))
        print('a:', bytes(a[0], encoding="utf8").decode('unicode-escape'))
        print('a:', a[0].encode('utf-8').decode('utf-8'))
        print('a[0]:', b'a'[0])
        print('a[0]:', b'a[0]'.decode('utf-8'))

        # Raw content of the stonefont span, e.g. '&#xe5a1;.&#xf2b5;'.
        b = re.findall(b'span class="stonefont">(.*?)</span>', node_html)[0].decode('utf-8')
        print(b)
        # Drop the entity prefix and make '.' its own ';'-separated token.
        # Plain str.replace is used: both patterns are literal text (the
        # former '\.' was an invalid escape in a non-raw string).
        b = b.replace('&#', '')
        b = b.replace('.', '.;')
        tokens = b.split(';')

        decoded = []
        for token in tokens:
            if token != '':
                if token == '.':
                    decoded.append(token)
                else:
                    decoded.append(
                        pojie(online_font, token, base_dict, base_uni, base_font))
        item['p'] = ''.join(decoded)
        yield item