def process(infile, outfile, layer):
    """Save a copy of ``infile`` in which only the glyphs of ``layer`` keep outlines.

    ``layer`` selects the glyph names to keep:

    * ``"letters"``       -- every glyph that is in neither ``_LAYER2_GLYPHS``
      nor ``_LAYER3_GLYPHS``;
    * ``"diacritics"``    -- the names in ``_LAYER2_GLYPHS``;
    * ``"quranic-signs"`` -- the names in ``_LAYER3_GLYPHS``.

    Any other value keeps nothing, so every glyph is emptied.  The result is
    written to ``outfile``.
    """
    font = TTFont(infile)
    glyf_table = font["glyf"]

    if layer == "letters":
        keep = [
            name
            for name in font.getGlyphNames()
            if not (name in _LAYER2_GLYPHS or name in _LAYER3_GLYPHS)
        ]
    elif layer == "diacritics":
        keep = _LAYER2_GLYPHS
    elif layer == "quranic-signs":
        keep = _LAYER3_GLYPHS
    else:
        keep = []

    # Glyphs that, in the "letters" layer, only lose one component instead of
    # being emptied completely.
    partially_kept = ("uni0670.medi", "uni06E5.medi", "uni06E6.medi")

    for name in font.getGlyphNames():
        if name in keep:
            continue
        glyph = glyf_table[name]
        if layer == "letters" and name in partially_kept:
            # We want to keep the kashida part of those glyphs: drop only the
            # component that refers to the glyph's own base name.
            glyph.components = [
                part
                for part in glyph.components
                if part.glyphName != name.split(".")[0]
            ]
        else:
            # This will cause FontTools not to output any outlines for that
            # glyph.
            glyph.numberOfContours = 0

    font.save(outfile)
def read_metadata(font):
    """Collect a metadata summary for ``font``.

    Args:
        font: Whatever ``TTFont`` accepts as its first argument (path or
            file object).  The font is opened lazily.

    Returns:
        A dict with the keys ``table_sizes``, ``names``, ``OS2``, ``post``,
        ``fvar`` and ``counts`` (entries whose value is ``None`` are dropped),
        or ``None`` when the glyph names cannot be read.
    """
    ttf = TTFont(font, lazy=True)
    try:
        try:
            # Reading the glyph names is used as the validity probe.
            ttf.getGlyphNames()
        except Exception:
            # The original message referenced an undefined ``request`` name,
            # which would itself raise NameError; report the argument instead.
            logging.error('Not a valid font: %s', font)
            return None

        reader = ttf.reader
        metadata = {
            'table_sizes': {tag: reader.tables[tag].length
                            for tag in sorted(reader.keys())},
            'names': _read_names(ttf, (_NAME_ID_VERSION,
                                       _NAME_ID_POSTSCRIPT_NAME,
                                       _NAME_ID_LICENSE_URL)),
            'OS2': _read_os2(ttf),
            'post': _read_post(ttf),
            'fvar': _read_fvar(ttf),
            'counts': _read_codepoint_glyph_counts(ttf),
        }
    finally:
        # Release the underlying file on every path, including failures.
        ttf.close()
    return {k: v for k, v in metadata.items() if v is not None}
def process_font(url):
    """Download the page at ``url`` and replace its obfuscated digit entities.

    The page embeds a per-request web font.  Each glyph of that font is
    compared with the glyphs of the local reference font ``loc.woff``, whose
    glyph-name -> digit table was established by hand; matching glyphs give
    the digit behind each ``&#x....;`` entity.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page source with every recognised entity replaced by its digit.
        Entities whose glyph matches no reference glyph are left untouched.

    Raises:
        IndexError: If the page contains no ``.woff`` font URL.
    """
    # loc.woff is a font file downloaded beforehand.  font.saveXML() can be
    # used to inspect its structure; the font behaves like a dict keyed by
    # the XML tags.
    base_font = TTFont('loc.woff')
    # Glyph name -> digit for the local font, confirmed manually with
    # Baidu's FontEditor.
    loc_dict = {
        'uniE8B2': '5', 'uniF818': '3', 'uniECCC': '8', 'uniE622': '1',
        'uniEC92': '2', 'uniF31A': '4', 'uniE86D': '9', 'uniE33C': '6',
        'uniE1FA': '7', 'uniE13E': '0'
    }
    # The first and last glyph names do not correspond to digits, hence the
    # slice.
    base_names = base_font.getGlyphNames()[1:-1]

    # Page source.
    rsp = urlopen(url).read().decode()
    # Locate and download the dynamic font file.
    font_url = 'http://' + re.findall(r'url\(\'//(.*?\.woff)', rsp)[0]
    filename = font_url.split('/')[-1]
    urlretrieve(font_url, filename)

    page_font = TTFont(filename)
    page_names = page_font.getGlyphNames()[1:-1]

    # Look up every reference glyph once instead of once per page glyph.
    base_glyphs = [(name, base_font['glyf'][name]) for name in base_names]

    # Glyph name in the page font -> digit.
    new_map = {}
    for page_name in page_names:
        page_glyph = page_font['glyf'][page_name]
        for base_name, base_glyph in base_glyphs:
            # Equal glyph objects mean the same digit.
            if base_glyph == page_glyph:
                new_map[page_name] = loc_dict[base_name]

    # Only matched glyphs are substituted; indexing new_map for every page
    # glyph raised KeyError as soon as one glyph had no counterpart.
    for page_name, digit in new_map.items():
        rsp = rsp.replace('&#x' + page_name[3:].lower() + ';', digit)
    return rsp
def __init__(self, bad_font_file, experiment_dir,
             src_fonts_dir='charset/ZhongHuaSong',
             fonts_json='/disks/sdb/projs/AncientBooks/data/chars/font_missing.json',
             fonts_root=None,
             type_fonts='type/宋黑类字符集.txt',
             input_nc=1, embedding_num=250, embedding_dim=128,  # model settings
             Lconst_penalty=15, Lcategory_penalty=1.0, gpu_ids=['cuda'], resume=240000,  # model settings
             char_size=250, canvas_size=256, fake_prob=0.03):
    """Load the source fonts, the font tables and the inference model.

    Args:
        bad_font_file: Optional path to a text file whose first line holds
            whitespace-separated integer font ids; falsy means "no bad fonts".
        experiment_dir: Directory containing the ``checkpoint`` sub-directory.
        src_fonts_dir: Directory holding the two ZhongHuaSong plane fonts.
        fonts_json: Passed to ``self.get_fonts``.
        fonts_root: Stored as ``self.fonts_root``.
        type_fonts: UTF-8 text file, one entry per line, indexed by line number.
        input_nc, embedding_num, embedding_dim, Lconst_penalty,
        Lcategory_penalty, gpu_ids: Forwarded to ``FontMagicModel``.
        resume: Passed to ``self.model.load_networks``.
        char_size: Point size used for the PIL fonts.
        canvas_size: Stored as ``self.canvas_size``.
        fake_prob: Stored as ``self.fake_prob`` (see the review note below).
    """
    plane00_path = os.path.join(
        src_fonts_dir, 'FZSONG_ZhongHuaSongPlane00_2020051520200519101119.TTF')
    plane02_path = os.path.join(
        src_fonts_dir, 'FZSONG_ZhongHuaSongPlane02_2020051520200519101142.TTF')

    # Character sets covered by each plane, derived from the glyph names.
    plane00_ttf = TTFont(plane00_path)
    plane02_ttf = TTFont(plane02_path)
    self.charSetPlane00 = processGlyphNames(plane00_ttf.getGlyphNames())
    self.charSetPlane02 = processGlyphNames(plane02_ttf.getGlyphNames())
    self.charSetTotal = self.charSetPlane00 | self.charSetPlane02
    self.charListTotal = list(self.charSetTotal)

    self.char_size = char_size
    self.canvas_size = canvas_size
    self.fake_prob = fake_prob

    # The same two files again, this time as PIL fonts for rendering.
    self.fontPlane00 = ImageFont.truetype(plane00_path, char_size)
    self.fontPlane02 = ImageFont.truetype(plane02_path, char_size)

    self.fonts = self.get_fonts(fonts_json)
    self.fonts_root = fonts_root
    self.fonts2idx = {}
    for idx, font in enumerate(self.fonts):
        self.fonts2idx[os.path.splitext(font['font_name'])[0]] = idx

    with open(type_fonts, 'r', encoding='utf-8') as fp:
        self.type_fonts = dict(enumerate(line.strip() for line in fp))
    self.type_fonts_rev = {name: idx for idx, name in self.type_fonts.items()}

    self.bad_font_ids = []
    if bad_font_file:
        with open(bad_font_file, 'r', encoding='utf-8') as fp:
            self.bad_font_ids = [int(tok) for tok in fp.readline().strip().split()]

    # NOTE(review): this overrides the ``fake_prob`` argument stored above.
    # The collapsed source does not show whether the assignment was meant to
    # be unconditional or part of the "no bad_font_file" branch -- confirm.
    self.fake_prob = 0.05

    checkpoint_dir = os.path.join(experiment_dir, "checkpoint")
    self.model = FontMagicModel(
        input_nc=input_nc,
        embedding_num=embedding_num,
        embedding_dim=embedding_dim,
        Lconst_penalty=Lconst_penalty,
        Lcategory_penalty=Lcategory_penalty,
        save_dir=checkpoint_dir,
        gpu_ids=gpu_ids,
        is_training=False
    )
    self.model.setup()
    self.model.print_networks(True)
    self.model.load_networks(resume)
def tran(self, text, html):
    """Decode the font-obfuscated digits in ``text``.

    The font referenced by ``html`` is downloaded to ``人人车01.ttf`` and its
    digit glyphs are compared with those of the local reference font
    ``人人车.ttf``.  A displayed digit whose glyph equals a reference glyph is
    translated to the value listed for that reference glyph.

    Args:
        text: The obfuscated string.
        html: Page source containing the ``url('... .woff')`` font reference.

    Returns:
        ``text`` with every recognised digit translated.  Characters with no
        matching glyph are left unchanged.

    Raises:
        IndexError: If ``html`` contains no font URL.
    """
    url = re.findall(r"url\('(.*?.woff)'", html)[0]
    with open('人人车01.ttf', 'wb') as f:
        f.write(requests.get(url=url).content)

    base_font = TTFont('人人车.ttf')
    base_names = base_font.getGlyphOrder()[1:]  # skip the first glyph
    page_font = TTFont('人人车01.ttf')
    page_names = page_font.getGlyphOrder()[1:]  # skip the first glyph

    # Glyph name -> character as it appears in the page text.
    shown_digit = {
        'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
        'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9'
    }
    # Reference-font glyph name -> real value.
    real_digit = {
        'zero': '0', 'one': '1', 'two': '2', 'four': '3', 'three': '4',
        'five': '5', 'seven': '6', 'nine': '7', 'six': '8', 'eight': '9'
    }

    # Build the whole character map first and apply it in one pass.  Calling
    # text.replace() once per character re-translated characters that had
    # already been decoded (e.g. '3' -> '4' followed by '4' -> '3').
    translation = {}
    for page_name in page_names:
        shown = shown_digit.get(str(page_name))
        if shown is None or shown not in text:
            continue
        page_glyph = page_font['glyf'][page_name]
        for base_name in base_names:
            if base_name not in real_digit:
                continue
            if base_font['glyf'][base_name] == page_glyph:
                translation[shown] = real_digit[base_name]
                break
    return text.translate(str.maketrans(translation))
def get_movie_ticket(self, html, flag=False):
    """Substitute the web-font digit codes in ``html`` with plain digits.

    The woff font referenced by ``html`` is saved as ``maoyan.woff`` and its
    first ten digit glyphs are compared with the hand-mapped glyphs of the
    local ``basefont.woff``.

    Returns the rewritten html when ``flag`` is true, otherwise the result of
    ``self.find_ticket`` on the rewritten html.
    """
    font_url_re = re.compile(r"url\('(.*?)'\) format\('woff'\);")
    found_urls = re.findall(font_url_re, html)
    url = 'http:%s' % found_urls[0]
    resp = requests.get(url)
    with open('maoyan.woff', 'wb') as fontfile:
        fontfile.write(resp.content)

    # Local file that has to be analysed by hand once; it is the fixed side
    # of the comparison.
    baseFonts = TTFont('basefont.woff')
    base_nums = ['4', '1', '3', '0', '5', '6', '7', '9', '2', '8']  # digits
    base_fonts = ['uniF66E', 'uniE944', 'uniE4BE', 'uniEF0F', 'uniEF8D',
                  'uniE963', 'uniE142', 'uniE023', 'uniE995', 'uniF3A0']  # glyph names

    # Dynamic font downloaded above; only the middle glyph names are digits.
    onlineFonts = TTFont('maoyan.woff')
    uni_list = onlineFonts.getGlyphNames()[1:-1]

    decoded = {}
    for i in range(10):
        online_name = uni_list[i]
        online_glyph = onlineFonts['glyf'][online_name]
        for base_name, digit in zip(base_fonts, base_nums):
            if online_glyph == baseFonts['glyf'][base_name]:
                decoded[online_name.replace('uni', '0x').lower()] = digit

    for code, digit in decoded.items():
        html = html.replace(code + ';', str(digit))

    return html if flag else self.find_ticket(html)
def parse_fonts(content):
    """Build a font dictionary from the raw bytes of a downloaded font.

    :param content: body of the response to the ttf request
    :return: dict mapping glyph name (without its first three characters) to
        the matched value

    Each glyph after the first is described by its ``endPtsOfContours``
    (``cp``) and compared with the entries of the external ``fonts_list``.
    """
    ttfont = TTFont(BytesIO(content))
    unknown_list, fonts = [], {}

    for glyphname in ttfont.getGlyphNames()[1:]:
        glyph = ttfont['glyf'][glyphname]
        entry = {"cp": glyph.endPtsOfContours, "glyphname": glyphname}
        if entry["cp"] == [11]:
            entry["xy"] = glyph.coordinates[0]
        unknown_list.append(entry)

    for known in fonts_list:
        for dom in unknown_list:
            cp = dom.get("cp")
            short_name = dom['glyphname'][3:]
            if cp == known.get("cp") and cp != [12]:
                fonts[short_name] = known.get("value")
            elif cp == [12]:
                # NOTE(review): "xy" is only stored for cp == [11] above but is
                # read here for cp == [12], so this branch appears to fail on
                # a missing value -- confirm which constant is intended.
                if int(dom.get("xy")[0][1]) > 200:
                    fonts[short_name] = "十"
                else:
                    fonts[short_name] = "上"
    return fonts
def extract_all_characters(self, woff_file):
    """Recognise every glyph of ``woff_file`` and merge the results.

    ``self.extract_single_character`` is submitted to a thread pool once per
    glyph name (``'glyph00000'`` and ``'x'`` are removed first) and the dicts
    it returns are merged into a single mapping.

    Raises:
        ValueError: If the font has no ``'glyph00000'`` or ``'x'`` glyph.
    """
    ttfont = TTFont(woff_file)
    names = ttfont.getGlyphNames()
    glyphs = ttfont['glyf']

    # These two glyphs are excluded from character extraction.
    for skipped in ('glyph00000', 'x'):
        names.remove(skipped)

    font_map = {}
    with ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(self.extract_single_character, name, glyphs)
            for name in names
        ]
        for finished in as_completed(pending):
            font_map.update(finished.result())
    return font_map
def _create_font_mapping(self, base_font: TTFont, base_font_mapping: dict, content):
    """Rebuild ``self.mapping`` for a freshly downloaded font.

    ``content`` (raw font bytes) is written to ``./temp.woff`` and reopened.
    Each of its glyphs is compared with the ``base_font`` glyphs named by the
    keys of ``base_font_mapping``.  When a glyph matches and its own name is
    also a key of ``base_font_mapping``, the character decoded from the name's
    hex suffix is mapped to the value of the matching base glyph.
    """
    font_file_path = './temp.woff'
    with open(font_file_path, 'wb') as font_file:
        font_file.write(content)

    self.mapping = {}
    online_font = TTFont(font_file_path)
    base_names = list(base_font_mapping.keys())

    for online_name in online_font.getGlyphNames():
        online_glyph = online_font['glyf'][online_name]
        for base_name in base_names:
            if online_glyph != base_font['glyf'][base_name]:
                continue
            if online_name not in base_font_mapping:
                continue
            # "uniXXXX" -> the character U+XXXX, decoded via a JSON escape.
            char = json.loads(f'"\\u{online_name[3:]}"')
            self.mapping[char] = base_font_mapping[base_name]
async def shouldParseFont(self, bodyClass):
    """Return the glyph mapping for ``bodyClass``, downloading the font if needed.

    When the cached font file already exists, the mapping is read back from
    redis.  Otherwise the font is downloaded, the mapping from glyph-order
    entries 2..11 to the first ten glyph names is built, stored in redis as
    JSON and returned.
    """
    print(bodyClass)
    file = self._fontCachedPath % (bodyClass)

    if os.path.exists(file):
        return json.loads(self._redis.get(bodyClass))

    async with aiohttp.ClientSession() as session:
        async with session.get(self._fontUrl % (bodyClass[0:2], bodyClass),
                               headers=self._headers) as _resp:
            assert _resp.status == 200
            with open(file, 'wb') as fd:
                # Stream the body to disk in 1 KiB pieces.
                chunk = await _resp.content.read(1024)
                while chunk:
                    fd.write(chunk)
                    chunk = await _resp.content.read(1024)

    font = TTFont(file)
    ordered = font.getGlyphOrder()[2:12]
    names = font.getGlyphNames()[0:10]
    secrets = {ordered[i]: names[i] for i in range(10)}
    self._redis.set(bodyClass, json.dumps(secrets))
    return secrets
def ocr_processor(filename):
    """Run OCR on every mapped character of a font file.

    Args:
        filename: file name of the uploaded font file

    Returns:
        A list with one dict per entry of the font's best cmap, holding the
        glyph ``name``, the rendered image as a base64 PNG data URI (``img``)
        and the ``ocr_result``.

    ``ProgressBar.max_length`` is set to the number of glyph names, and each
    processed glyph name is put on ``SocketQueue.res_queue``.
    """
    font = TTFont(filename)
    ProgressBar.max_length = len(font.getGlyphNames())

    ocr_results = []
    for codepoint, name in font.getBestCmap().items():
        image = uni_2_png_stream(codepoint, filename, 100)
        png_buffer = BytesIO()
        image.save(png_buffer, format="PNG")
        encoded = base64.b64encode(png_buffer.getvalue()).decode()
        ocr_results.append({
            'name': name,
            'img': 'data:image/png;base64,' + encoded,
            'ocr_result': tesseract_single_character(image)
        })
        SocketQueue.res_queue.put(name)
    return ocr_results
def findstar():
    """Decode the downloaded font against a template font dump.

    ``to.xml`` is the template dump: its ``TTGlyph`` elements (the first one
    is skipped) line up position by position with the characters of
    ``words``.  ``to2333.xml`` is the dump of the font to decode.  For every
    glyph of the latter whose XML text equals a template glyph's XML text,
    the character at the template's position is collected.

    Returns:
        ``(new_font, font_list)`` -- the collected characters, and the glyph
        names of ``demo.woff`` (without ``'glyph00000'``) lower-cased with
        ``"uni"`` removed.
    """
    words = (
        '1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新'
        '限天面工服海华水房饰城乐汽香部利籽老艺花专东肉菜学福饭人百餐茶务'
        '通味所山区门药银农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理'
        '锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建材三会鸡室红站'
        '德王光名丽油院堂烧江社合星货型村自科快便日民营和活童明器烟育'
        '宾精屋经居庄石顺林尔县手厅销用好客火雅盛体旅之鞋辣作粉包楼校'
        '鱼平彩上吧保永万物教吃设医正造丰健点汤网庆技斯洗料配汇木缘加'
        '麻联卫川泰色世方寓风幼羊烫来高厂兰阿贝皮全女拉成云维贸道术运'
        '都口博河瑞宏京际路祥青镇厨培力惠连马鸿钢训影甲助窗布富牌头四'
        '多妆吉苑沙恒隆春干饼氏里二管诚制售嘉长轩杂副清计黄讯太鸭号街'
        '交与叉附近层旁对巷栋环省桥湖段乡厦府铺内侧元购前幢滨处向座下'
        '県凤港开关景泉塘放昌线湾政步宁解白田町溪十八古双胜本单同九迎'
        '第台玉锦底后七斜期武岭松角纪朝峰六振珠局岗洲横边济井办汉代临'
        '弄团外塔杨铁浦字年岛陵原梅进荣友虹央桂沿事津凯莲丁秀柳集紫旗'
        '张谷的是不了很还个也这我就在以可到错没去过感次要比觉看得说常'
        '真们但最喜哈么别位能较镜非为欢然他挺着价那意种想出员两推做排'
        '实分间甜度起满给热完格荐喝等其再几只现朋侯样直而买于般豆量选'
        '奶打每评少算又因情找些份置适什蛋师气你姐棒试总定啊足级整带虾'
        '如态且尝主话强当更板知己无酸让入啦式笑赞片酱差像提队走嫩才刚'
        '午接重串回晚微周值费性桌拍跟块调糕.'
    )
    print(len(words))
    words_list = list(words)

    # Template glyphs, serialised to text (first TTGlyph skipped).
    template_root = xmldom.parse(os.path.abspath("to.xml")).documentElement
    template_nodes = template_root.getElementsByTagName("TTGlyph")
    data = [str(node.toprettyxml()) for node in template_nodes[1:]]

    # Decode the font downloaded for this request against the template.
    target_root = xmldom.parse(os.path.abspath("to2333.xml")).documentElement
    target_nodes = target_root.getElementsByTagName("TTGlyph")
    new_font = []
    for node in target_nodes[1:]:
        glyph_xml = node.toprettyxml()
        for position, template_xml in enumerate(data):
            if glyph_xml == template_xml:
                new_font.append(words_list[position])

    font = TTFont("demo.woff")
    glyph_names = font.getGlyphNames()
    glyph_names.remove('glyph00000')
    font_list = [str(name).lower().replace("uni", '') for name in glyph_names]
    return (new_font, font_list)
class ItemListController(QObject):
    """Controller that lists a font's glyph names in the parent's item table."""

    def __init__(self, parent):
        """Remember ``parent`` and attach a fresh table model to its item table."""
        super(ItemListController, self).__init__()
        self._parent = parent
        self._items = []
        # Nothing is loaded yet.
        self.ttf = self.font_name = self.font_extension = None
        self.xml_file = self.xml_out_file = self.output_dir = None
        self._init_table()

    def load_file(self, filename):
        """Open ``filename`` as a font and fill the table with its glyph names."""
        # Everything before the last dot of the base name is the font name,
        # the rest is the extension.
        stem, _, extension = os.path.basename(filename).rpartition('.')
        self.font_name = stem
        self.font_extension = extension
        self.ttf = TTFont(filename)
        self._load_items()

    def _init_table(self):
        """Create the two-column model and bind it to the parent's table view."""
        view = self._parent.ui.item_table
        self.table = view
        self.table_model = LigatureTableModel([], ['Name', 'Ligature'], view)
        view.setModel(self.table_model)
        view.setSortingEnabled(True)

    def _load_items(self):
        """Replace the model contents with one empty-ligature row per glyph."""
        model = self.table_model
        model.clear()
        for glyph_name in self.ttf.getGlyphNames():
            model.add(LigatureItem(glyph_name, ''))
        model.restore_mapping()

    @pyqtSlot()
    def save(self):
        """Save to ``self.output_dir``, or log a message when none is set."""
        if self.output_dir:
            self.save_to_dir(self.output_dir)
        else:
            self._parent.log('no dir!')

    def save_to_dir(self, directory):
        """Write the processed font files for the current mapping to ``directory``.

        A ``ReferenceError`` raised along the way is logged, not propagated.
        """
        try:
            processor = FontProcessor(self.ttf, self.table_model.get_mapping())
            processor.save_files(directory, self.font_name)
            self._parent.log('OK!')
        except ReferenceError as e:
            self._parent.log(e)
def get_font_map():
    """Return the mapping from hex code point to digit for ``a.woff``.

    Glyphs of ``a.woff`` are matched with the hand-labelled glyphs of
    ``base.woff`` by comparing the first ten outline coordinates.  The best
    cmap of ``a.woff`` then turns the matched glyph names into code points.
    """
    font1 = TTFont('base.woff')
    base_dict = {
        'glyph00009': 7, 'glyph00013': 2, 'glyph00018': 1, 'glyph00023': 6,
        'glyph00028': 9, 'glyph00030': 8, 'glyph00034': 4, 'glyph00039': 5,
        'glyph00044': 3, 'glyph00048': 0
    }
    name_list1 = font1.getGlyphNames()

    font2 = TTFont('a.woff')
    name_list2 = font2.getGlyphNames()

    new_name_list1 = get_new_name_list(font1, name_list1)
    new_name_list2 = get_new_name_list(font2, name_list2)
    print(new_name_list1)
    print(new_name_list2)

    # Glyph name in a.woff -> digit.
    new_dict = {}
    for name2 in new_name_list2:
        coords2 = font2['glyf'][name2].coordinates
        for name1 in new_name_list1:
            coords1 = font1['glyf'][name1].coordinates
            if coords1[:10] == coords2[:10]:
                new_dict[name2] = base_dict[name1]
    print(new_dict)

    # getBestCmap() gives the code point -> glyph name relation.
    font_map = {
        hex(codepoint): new_dict[glyph_name]
        for codepoint, glyph_name in font2.getBestCmap().items()
        if glyph_name in new_dict
    }
    print(font_map)
    return font_map
def job():
    """Fetch the maoyan.com front page and print the top film's box office.

    The title and total are printed twice: once from the raw page, where the
    digits are web-font entities, and once after the entities have been
    replaced.  Replacement works by comparing the glyphs of the downloaded
    font (saved as ``fonts.woff``) with the hand-mapped glyphs of the local
    ``basefonts.woff``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/66.0.3359.139 Safari/537.36 "
    }
    index_url = 'http://maoyan.com/'
    info_xpath = '//*[@id="app"]/div/div[1]/div[1]/div/div[2]/ul/li[1]/a/div[2]/div//text()'

    # Front page content.
    response_index = requests.get(index_url, headers=headers).text
    info_list = etree.HTML(response_index).xpath(info_xpath)
    print(u'电影名称:%s, 票房总数:%s' % (info_list[1], info_list[4]))

    # URL of the font file.
    woff_ = re.search(r"url\('(.*\.woff)'\)", response_index).group(1)
    woff_url = 'http:' + woff_
    response_woff = requests.get(woff_url, headers=headers).content
    with open('fonts.woff', 'wb') as f:
        f.write(response_woff)

    # base_nums / base_fonts have to be worked out by hand and must agree
    # with basefonts.woff.
    baseFonts = TTFont('basefonts.woff')
    base_nums = ['7', '9', '0', '3', '6', '5', '2', '1', '4', '8']
    base_fonts = [
        'uniF04C', 'uniE374', 'uniF426', 'uniEAAA', 'uniF519', 'uniEEC4',
        'uniF543', 'uniF7C7', 'uniF046', 'uniF08E'
    ]
    base_glyphs = [baseFonts['glyf'][name] for name in base_fonts]

    onlineFonts = TTFont('fonts.woff')
    uni_list = onlineFonts.getGlyphNames()[1:-1]

    # Entity -> digit.
    temp = {}
    for i in range(10):
        onlineGlyph = onlineFonts['glyf'][uni_list[i]]
        for baseGlyph, digit in zip(base_glyphs, base_nums):
            if onlineGlyph == baseGlyph:
                temp["&#x" + uni_list[i][3:].lower() + ';'] = digit

    # With no matches the alternation would be "()", which matches the empty
    # string everywhere and makes the lookup raise KeyError(''); skip it.
    if temp:
        pat = '(' + '|'.join(temp.keys()) + ')'
        response_index = re.sub(pat, lambda x: temp[x.group()], response_index)

    # Extract the content again, now with readable digits.
    info_list = etree.HTML(response_index).xpath(info_xpath)
    print(u'电影名称:%s, 票房总数:%s' % (info_list[1], info_list[4]))
def get_font_dict(font_face):
    """Return the glyph-name -> digit table for the font embedded in a page.

    ``font_face`` is the base64 text of the page's font.  It is decoded and
    saved as ``002.ttf``, and its glyphs are compared with the hand-labelled
    glyphs of the reference file ``001.ttf``.
    """
    with open('002.ttf', 'wb') as f:
        f.write(base64.b64decode(font_face))

    reference = TTFont('001.ttf')
    current = TTFont('002.ttf')

    # Glyph names of the page font without the first and last entries.
    # getGlyphOrder() would also list them, but in a different order.
    current_names = current.getGlyphNames()[1:-1]

    # Digits of 001.ttf, matched by hand.
    font_dict_1 = {
        'uniE035': 5, 'uniE285': 0, 'uniE8D4': 9, 'uniED7F': 7, 'uniF11E': 8,
        'uniF137': 2, 'uniF1BF': 4, 'uniF4EC': 1, 'uniF59C': 3, 'uniF750': 6
    }

    # Glyph name in the page font -> digit, matched on glyph equality.
    font_dict_2 = {}
    for name in current_names:
        for ref_name, digit in font_dict_1.items():
            if current['glyf'][name] == reference['glyf'][ref_name]:
                font_dict_2[name] = digit
    return font_dict_2
def _create_base_mapping(self, base_font_file: str, base_font_mapping_file: str, type_: str) -> None:
    """Fill the base glyph and base string tables for ``type_``.

    Args:
        base_font_file: Path of the reference font.
        base_font_mapping_file: JSON file mapping glyph names of the form
            ``uniXXXX`` to the strings they stand for.
        type_: Key selecting which sub-table of ``self.base_glyph_mapping``
            and ``self.base_str_mapping`` is filled.

    Each glyph name is converted to the character with code point ``XXXX``
    (hexadecimal); that character becomes the key in both tables.

    Raises:
        KeyError: If a mapped glyph name is missing from the font.
        ValueError: If the part after the first three characters of a glyph
            name is not a hexadecimal number.
    """
    font = TTFont(base_font_file)
    uni_list = font.getGlyphNames()
    logger.info(f'There is {len(uni_list)} fonts in {base_font_file}')

    with open(base_font_mapping_file, 'r', encoding='utf-8') as ifile:
        mapping = json.load(ifile)

    for glyph_name, value in mapping.items():
        glyph = font['glyf'][glyph_name]
        # Decode the hex suffix directly.  The previous eval() of a
        # constructed string literal executed text taken from the file.
        char = chr(int(glyph_name[3:], 16))
        self.base_glyph_mapping[type_][char] = glyph
        self.base_str_mapping[type_][char] = value
def get_base_map():
    """Write the skeleton used for the manual glyph mapping.

    Every glyph name of ``base.ttf`` that contains ``'uni'`` becomes a key
    with an empty string value in ``base.map.json``.

    :return: None
    """
    glyph_names = TTFont('base.ttf').getGlyphNames()
    data = {name: '' for name in glyph_names if 'uni' in name}
    with open('base.map.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
def parse_font(self, value):
    """Restore obfuscated characters in ``value`` using the cached font.

    The pieces of ``value`` are joined and processed character by character.
    A character found among glyph-order entries 2..9 of ``tyc-num.woff`` is
    replaced by the glyph name at the same position in the first eight glyph
    names; other characters are kept.

    Returns the restored string, or ``None`` if the font file is missing.
    """
    fontfile = '%s/%s' % (self.temp, 'tyc-num.woff')
    if not os.path.exists(fontfile):
        return None

    font = TTFont(fontfile)
    source_num = font.getGlyphOrder()[2:10]
    target_num = font.getGlyphNames()[:8]

    restored = [
        target_num[source_num.index(char)] if char in source_num else char
        for char in ''.join(value)
    ]
    return ''.join(restored)
def listFontGlyphNames(fontName):
    """Return the glyph names of the font called ``fontName``, minus ``.notdef``.

    An empty list is returned when ``fontPath`` finds no file for the name or
    when fontTools raises ``TTLibError`` while opening it.
    """
    path = fontPath(fontName)
    if path is None:
        return []

    try:
        font = TTFont(path, lazy=True, fontNumber=0)
    except TTLibError:
        # fontTools cannot read the file
        return []

    names = font.getGlyphNames()
    font.close()
    return [name for name in names if name != ".notdef"]
def decrypt_font(url, headers):
    """Fetch ``url`` and undo its web-font digit obfuscation.

    Args:
        url: Page address.
        headers: HTTP request headers, sent with the page request and with
            the font download.

    Returns:
        The page source with every recognised ``&#x....;`` entity replaced by
        its digit.  Entities whose glyph matches no reference glyph are left
        as they are.

    Raises:
        IndexError: If the page references no ``.woff`` file on
            ``vfile.meituan.net/colorstone/``.
    """
    font1 = TTFont('./fonts/base.woff')
    # Glyph name -> digit for the local font, found with Baidu's FontEditor.
    base_dict = {'uniE18E': '3', 'uniE585': '2', 'uniE194': '9', 'uniF439': '4',
                 'uniE7DB': '7', 'uniF115': '0', 'uniF0A4': '5', 'uniE311': '1',
                 'uniF7EF': '8', 'uniEACB': '6'}
    name_list1 = font1.getGlyphNames()[1:-1]

    # ``headers`` must be passed by keyword: the second positional argument
    # of requests.get() is ``params`` (the query string).
    response = requests.get(url, headers=headers).text

    # Find the woff file referenced by the page.
    font_file = re.findall(r'vfile\.meituan\.net\/colorstone\/(\w+\.woff)', response)[0]
    url2 = 'http://vfile.meituan.net/colorstone/' + font_file
    new_file = requests.get(url2, headers=headers)
    with open('./fonts/' + font_file, 'wb') as f:
        f.write(new_file.content)

    font2 = TTFont('./fonts/' + font_file)
    font2.saveXML('font_2.xml')
    name_list2 = font2.getGlyphNames()[1:-1]

    base_glyphs = [(name, font1['glyf'][name]) for name in name_list1]

    # Glyph name in the downloaded font -> digit.
    new_dict = {}
    for name2 in name_list2:
        obj2 = font2['glyf'][name2]
        for name1, obj1 in base_glyphs:
            # Equal glyph objects stand for the same digit.
            if obj1 == obj2:
                new_dict[name2] = base_dict[name1]

    # Substitute only matched glyphs; indexing new_dict for every glyph
    # raised KeyError when one had no counterpart.
    for name2, digit in new_dict.items():
        response = response.replace('&#x' + name2[3:].lower() + ';', digit)
    return response
def single_font_to_pic(filename, content):
    """OCR the digit-named glyphs of a font.

    Args:
        filename: Name under which ``content`` is written to the current
            directory before being opened as a font.
        content: Raw font bytes.

    Returns:
        A list of ``{"ocr_result": ..., "name": ...}`` dicts, one per glyph
        whose name consists only of digits, pairing each OCR result with the
        name of the glyph it was rendered from.
    """
    with open(filename, 'wb') as f:
        f.write(content)
    font = TTFont('./' + filename)

    # Only these glyphs are rendered, so only their names may be paired with
    # the OCR results.  Indexing the complete glyph-name list by result
    # position mislabelled results whenever the font held other glyphs.
    digit_names = [name for name in font.getGlyphNames() if name.isdigit()]
    pils = [generate_pic(name, font, 30, 0.04) for name in digit_names]
    res = ocr_func_for_digit(pils)

    return [
        {
            "ocr_result": recognised,
            "name": str(name).replace('.png', '').replace('_', ''),
        }
        for name, recognised in zip(digit_names, res)
    ]
def handle_fonts(self, url):
    """Download the font at ``url`` and map its entities to digits.

    The font is saved as ``./static/damn.woff`` and dumped to
    ``./static/damnTo.xml``.  Each ``TTGlyph`` of the dump is compared, with
    its name and newlines removed, against the glyphs of the template dump
    ``./static/temp.xml``, whose glyph positions correspond to ``num``.

    Returns a dict from ``"&#x....;"`` entity (built from the glyph names
    other than ``'glyph00000'`` and ``'x'``) to the decoded digit at the same
    position.
    """
    print('downloading {}'.format(url))
    r = requests.get('http:' + url)
    with open("./static/damn.woff", "wb") as code:
        code.write(r.content)
    font = TTFont("./static/damn.woff")
    font.saveXML('./static/damnTo.xml')

    # Digits of the template font, in template glyph order.
    num = [8, 6, 2, 1, 4, 3, 0, 9, 5, 7]
    name_attr = re.compile(r"name=\"(.*)\"")

    def outline_text(node):
        # The contour coordinates are what identifies a digit, so the glyph
        # name (first match) and the newlines are stripped before comparing.
        xml_text = node.toprettyxml()
        first_name = name_attr.findall(str(xml_text))[0]
        return str(xml_text).replace(first_name, '').replace("\n", '')

    # Load the font template.
    template_root = xmldom.parse(os.path.abspath("./static/temp.xml")).documentElement
    data = [outline_text(node)
            for node in template_root.getElementsByTagName("TTGlyph")]

    # Decode the font downloaded for this request against the template.
    fresh_root = xmldom.parse(os.path.abspath("./static/damnTo.xml")).documentElement
    new_font = []
    for node in fresh_root.getElementsByTagName("TTGlyph"):
        get_code = outline_text(node)
        for j, template_text in enumerate(data):
            if get_code == template_text:
                new_font.append(num[j])

    font = TTFont("./static/damn.woff")
    font_list = font.getGlyphNames()
    font_list.remove('glyph00000')
    font_list.remove('x')

    ans = {}
    for i, glyph_name in enumerate(font_list):
        entity = str(glyph_name).lower().replace("uni", '&#x') + ';'
        ans[entity] = new_font[i]
    return ans
def __init__(self, ttf_path, default_ttf_path, char_size, canvas_size):
    """Index the font files of two directories.

    Args:
        ttf_path: Directory whose ``.ttf``/``.otf``/``.ttc`` files fill
            ``self.ttf_path_list``.
        default_ttf_path: Directory whose font files fill
            ``self.default_ttf_path``; the processed glyph names of each are
            stored, in the same order, in ``self.default_ttf_charset``.
        char_size: Stored as ``self.char_size``.
        canvas_size: Stored as ``self.canvas_size``.
    """
    font_suffixes = ['.ttf', '.otf', '.ttc']

    def font_files(directory):
        # Full paths of the entries with a font suffix (case-insensitive).
        return [
            os.path.join(directory, entry)
            for entry in os.listdir(directory)
            if os.path.splitext(entry)[-1].lower() in font_suffixes
        ]

    self.ttf_path_list = font_files(ttf_path)
    self.default_ttf_path = font_files(default_ttf_path)
    self.default_ttf_charset = [
        processGlyphNames(TTFont(path).getGlyphNames())
        for path in self.default_ttf_path
    ]
    self.char_size = char_size
    self.canvas_size = canvas_size
def font_creator(html):
    """Replace the dynamically obfuscated digits in ``html``.

    The woff font referenced by the page is downloaded into ``./fonts``
    unless a file of that name is already there.  Its glyphs are compared
    with the hand-mapped glyphs of ``./fonts/basefonts.woff``.

    :param html: page source containing the font URL and the entities
    :return: the html with every recognised entity replaced by its digit;
        unchanged if no glyph matched
    :raises AttributeError: if ``html`` contains no matching font URL
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    # Match the url ending in .woff.
    woff_name = re.search(
        r"url\('//vfile.meituan.net/colorstone/(.*\.woff)'\)", html).group(1)

    # Download only when the file is not there yet.
    if woff_name not in os.listdir('./fonts'):
        woff_url = 'http://vfile.meituan.net/colorstone/' + woff_name
        response_woff = requests.get(woff_url, headers=headers).content
        with open('./fonts/' + woff_name, 'wb') as f:
            f.write(response_woff)

    # Hand-made mapping for basefonts.woff.
    baseFonts = TTFont('./fonts/basefonts.woff')
    base_nums = ['9', '4', '2', '1', '3', '7', '8', '0', '6', '5']
    base_fonts = [
        'uniECE2', 'uniF284', 'uniF5F6', 'uniE3CA', 'uniF798', 'uniF7E7',
        'uniF020', 'uniE4A7', 'uniF4B5', 'uniE0FC'
    ]
    base_glyphs = [baseFonts['glyf'][name] for name in base_fonts]

    # The font that belongs to this page.
    onlineFonts = TTFont('./fonts/' + woff_name)
    uni_list = onlineFonts.getGlyphNames()[1:-1]

    # Entity -> digit, found by comparing the new glyphs with the known ones.
    temp = {}
    for i in range(10):
        onlineGlyph = onlineFonts['glyf'][uni_list[i]]
        for baseGlyph, digit in zip(base_glyphs, base_nums):
            if onlineGlyph == baseGlyph:
                temp["&#x" + uni_list[i][3:].lower() + ';'] = digit

    # Without matches the alternation would be "()", which matches the empty
    # string everywhere and makes the lookup raise KeyError('').
    if not temp:
        return html

    pat = '(' + '|'.join(temp.keys()) + ')'
    return re.sub(pat, lambda x: temp[x.group()], html)
def fonts(response_index):
    """Decode the web-font digits of a page and parse it.

    Args:
        response_index: Page source containing a ``url('... .woff')``
            reference and ``&#x....;`` digit entities.

    Returns:
        The ``etree.HTML`` tree of the decoded page, or ``None`` (after
        printing a failure message) if any step raises.

    The request headers are read from the ``headers`` name of the enclosing
    scope.
    """
    try:
        # URL of the font file.
        woff_ = re.search(r"url\('(.*\.woff)'\)", response_index).group(1)
        woff_url = 'http:' + woff_
        response_woff = requests.get(woff_url, headers=headers).content
        # The font has to be saved again on every crawl.
        with open('fonts.woff', 'wb') as f:
            f.write(response_woff)

        # basefonts.woff: a font downloaded once from the site's font-face
        # url and renamed.  base_nums / base_fonts were worked out by hand
        # (http://fontstore.baidu.com/static/editor/index.html#) and must
        # agree with that file.
        baseFonts = TTFont('basefonts.woff')
        base_nums = ['9', '5', '6', '7', '3', '8', '4', '2', '1', '0']
        base_fonts = [
            'uniF59C', 'uniF65B', 'uniE3C2', 'uniECD9', 'uniE676', 'uniF7AD',
            'uniF4B7', 'uniF7F7', 'uniE683', 'uniF044'
        ]

        # fonts.woff: the font written above for this response.
        onlineFonts = TTFont('fonts.woff')
        # Glyph names of the digits.
        uni_list = onlineFonts.getGlyphNames()[1:-1]

        # Entity -> digit.
        temp = {}
        for i in range(10):
            onlineGlyph = onlineFonts['glyf'][uni_list[i]]
            for j in range(10):
                baseGlyph = baseFonts['glyf'][base_fonts[j]]
                # Same glyph in both fonts: record the digit of the base one.
                if onlineGlyph == baseGlyph:
                    temp["&#x" + uni_list[i][3:].lower() + ';'] = base_nums[j]

        # Replace the entities.
        pat = '(' + '|'.join(temp.keys()) + ')'
        response_index = re.sub(pat, lambda x: temp[x.group()], response_index)
        return etree.HTML(response_index)
    except Exception:
        # A bare ``except:`` here also trapped KeyboardInterrupt and
        # SystemExit; only ordinary failures are reported as a parse error.
        print('解析失败!')
        return None
def parse(self, response):
    """Yield one item per board entry with its title and decoded number.

    The woff font referenced in the page's ``<style>`` is saved as
    ``on_maoyan.woff``.  Each obfuscated character inside the entry's
    ``stonefont`` span is passed to ``jiemi`` together with both fonts and
    the reference mapping.

    Yields:
        dict: ``{'title': str, 'piao': str}`` -- a new dict for every entry.
    """
    # Font url.
    style = ''.join(response.xpath('//style').extract())
    font_url = 'http:' + ''.join(re.findall(r"url\('(.*?woff)'\)", style))

    # Download the font file.
    font_response = requests.get(url=font_url)
    with open('on_maoyan.woff', 'wb') as f:
        f.write(font_response.content)

    # Local font file, also dumped to XML.
    tfont = TTFont('base_maoyan.woff')
    tfont.saveXML('shilie_maoyan.xml')
    # Font downloaded for this response.
    basefont = TTFont('on_maoyan.woff')

    # Glyph name -> digit for the reference font.
    font_guanxi = {
        'uniE877': '5', 'uniF0B5': '0', 'uniE3C8': '6', 'uniF076': '3',
        'uniF833': '8', 'uniF079': '2', 'uniECED': '9', 'uniE49B': '1',
        'uniEB89': '4', 'uniE56F': '7',
    }

    root = etree.HTML(response.text)
    for node in root.xpath('//div[@class="board-item-content"]'):
        # A fresh dict per entry: re-yielding one shared dict lets later
        # iterations overwrite items the consumer still holds.
        item = {}
        item['title'] = ''.join(node.xpath('.//p[@class="name"]/a/text()'))

        piao = etree.tostring(node)
        piao = re.findall(rb'<span class="stonefont">(.*?)</span>', piao)[0]
        print(piao)
        # Strip the "&#" prefixes and split into one token per character,
        # keeping the decimal point as its own token.
        piao = re.sub(b'&#', b'', piao).decode('utf-8')
        piao = re.sub(r'\.', '.;', piao)
        piao = piao.split(';')
        print(piao)

        decoded = []
        for p in piao:
            if p == '':
                continue
            if p == '.':
                decoded.append('.')
            else:
                decoded.append(jiemi(tfont, p, basefont, font_guanxi))
        item['piao'] = ''.join(decoded)
        yield item
def main():
    """Export the advance widths of mapped glyphs for each font on the command line.

    For every path in ``args.fonts`` the best cmap is printed, then each
    glyph is examined: an advance of exactly 600 prints a dot, any other
    advance is recorded under the glyph's code point if the cmap maps one to
    it.  The record is saved next to the font as
    ``<name>-glyph_widths.json``.
    """
    args = parser.parse_args()

    for font_path in args.fonts:
        glyphWidths = {}
        print("\n-----------------------------------------\n")
        print(font_path)

        ttfont = TTFont(font_path)
        best_cmap = ttfont["cmap"].getBestCmap()
        print(type(best_cmap))

        # Glyph name -> code point (the cmap maps code point -> glyph name).
        unicodesDict = {}
        for codepoint, mapped_name in best_cmap.items():
            print(codepoint, mapped_name)
            unicodesDict[mapped_name] = codepoint

        print('unicodesDict')
        print(unicodesDict)

        for glyphName in ttfont.getGlyphNames():
            advance = ttfont['hmtx'][glyphName][0]
            if advance == 600:
                print(".")
            elif glyphName in unicodesDict:
                glyphWidths[str(unicodesDict[glyphName])] = advance

        filename, file_extension = os.path.splitext(font_path)
        saveToJSON(glyphWidths, f'{filename}-glyph_widths.json')
def parse_front_html(self, tmpe_file, html):
    """Map the entities of a freshly downloaded font to their values.

    Every glyph of ``tmpe_file`` (in glyph order) is compared with the
    glyphs of ``self.font`` named in ``self.num_list``.  A match maps the
    lower-cased glyph name, with ``"uni"`` replaced by ``"&#x"``, to
    ``self.num_dict`` of the matching name.

    ``html`` is accepted but not used.  Returns the resulting dict.
    """
    new_woff = TTFont(tmpe_file)  # the new woff file

    new_font_dict = {}
    for glyph_name in new_woff.getGlyphOrder():
        candidate = new_woff['glyf'][glyph_name]
        for known_name in self.num_list:
            if candidate == self.font['glyf'][known_name]:
                entity = glyph_name.replace("uni", "&#x").lower()
                new_font_dict[entity] = self.num_dict[known_name]
    return new_font_dict
def findstar(titles):
    """Decode the downloaded font against the template font dump.

    Each ``TTGlyph`` of ``to.xml`` is compared, with its name and newlines
    removed, against the glyphs of the template dump ``temp.xml``, whose
    glyph positions correspond to ``num``.

    ``titles`` is accepted but not used.

    Returns:
        ``(new_font, font_list)`` -- the decoded digits, and the glyph names
        of ``demo.woff`` (without ``'glyph00000'`` and ``'x'``) lower-cased
        with ``"uni"`` removed.
    """
    # Digits of the template font, in template glyph order.
    num = [8, 6, 2, 1, 4, 3, 0, 9, 5, 7]
    name_attr = re.compile(r"name=\"(.*)\"")

    def outline_text(node):
        # Strip the glyph name (first match) and the newlines so that only
        # the outline description is compared.
        xml_text = node.toprettyxml()
        first_name = name_attr.findall(str(xml_text))[0]
        return str(xml_text).replace(first_name, '').replace("\n", '')

    # Load the font template.
    xmlfilepath_temp = os.path.abspath(
        r"C:\Users\Administrator\Desktop\python-maoyan-spider\Maoyan\com\sider\temp.xml"
    )
    template_root = xmldom.parse(xmlfilepath_temp).documentElement
    data = [outline_text(node)
            for node in template_root.getElementsByTagName("TTGlyph")]

    # Decode the font downloaded for this request against the template.
    xmlfilepath_find = os.path.abspath(
        r"C:\Users\Administrator\Desktop\python-maoyan-spider\Maoyan\com\sider\to.xml"
    )
    target_root = xmldom.parse(xmlfilepath_find).documentElement
    new_font = []
    for node in target_root.getElementsByTagName("TTGlyph"):
        get_code = outline_text(node)
        for j, template_text in enumerate(data):
            if get_code == template_text:
                new_font.append(num[j])

    font = TTFont(
        r"C:\Users\Administrator\Desktop\python-maoyan-spider\Maoyan\com\sider\demo.woff"
    )
    glyph_names = font.getGlyphNames()
    glyph_names.remove('glyph00000')
    glyph_names.remove('x')
    font_list = [str(name).lower().replace("uni", '') for name in glyph_names]
    return (new_font, font_list)