def render(item):
    """Write ``item`` and, recursively, its children to ``self.outfp`` as XML.

    ``self`` is not a parameter; it is resolved from the enclosing scope.
    The ``isinstance`` checks are tried in a fixed order and the first match
    wins.  An item matching none of them fails the final assertion.
    """
    emit = self.outfp.write

    def render_children():
        for child in item:
            render(child)

    if isinstance(item, LTPage):
        emit('<page id="%s" bbox="%s" rotate="%d">\n' %
             (item.pageid, bbox2str(item.bbox), item.rotate))
        render_children()
        emit('</page>\n')
        return

    if isinstance(item, LTLine):
        emit('<line linewidth="%d" bbox="%s" />\n' %
             (item.linewidth, bbox2str(item.bbox)))
        return

    if isinstance(item, LTRect):
        emit('<rect linewidth="%d" bbox="%s" />\n' %
             (item.linewidth, bbox2str(item.bbox)))
        return

    if isinstance(item, LTPolygon):
        emit('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' %
             (item.linewidth, bbox2str(item.bbox), item.get_pts()))
        return

    if isinstance(item, LTFigure):
        emit('<figure name="%s" bbox="%s">\n' %
             (item.name, bbox2str(item.bbox)))
        render_children()
        emit('</figure>\n')
        return

    if isinstance(item, LTTextLine):
        emit('<textline bbox="%s">\n' % bbox2str(item.bbox))
        render_children()
        emit('</textline>\n')
        return

    if isinstance(item, LTTextBox):
        wmode = ' wmode="vertical"' if isinstance(item, LTTextBoxVertical) else ''
        emit('<textbox id="%d" bbox="%s"%s>\n' %
             (item.index, bbox2str(item.bbox), wmode))
        render_children()
        emit('</textbox>\n')
        return

    if isinstance(item, LTChar):
        emit('<text font="%s" bbox="%s" size="%.3f">' %
             (enc(item.fontname), bbox2str(item.bbox), item.size))
        self.write_text(item.text)
        emit('</text>\n')
        return

    if isinstance(item, LTText):
        emit('<text>%s</text>\n' % item.text)
        return

    if isinstance(item, LTImage):
        if self.outdir:
            # An output directory is configured: store the image and link it.
            name = self.write_image(item)
            emit('<image src="%s" width="%d" height="%d" />\n' %
                 (enc(name), item.width, item.height))
        else:
            emit('<image width="%d" height="%d" />\n' %
                 (item.width, item.height))
        return

    assert False, item
def render(item):
    """Write ``item`` and, recursively, its children to ``self.outfp`` as XML.

    ``self`` and ``show_group`` are not defined here; both are resolved from
    the enclosing scope.  Type checks are tried in a fixed order and the
    first match wins; an unmatched item fails the final assertion.
    """
    emit = self.outfp.write

    def render_children():
        for child in item:
            render(child)

    if isinstance(item, LTPage):
        emit('<page id="%s" bbox="%s" rotate="%d">\n' %
             (item.pageid, bbox2str(item.bbox), item.rotate))
        render_children()
        if item.groups is not None:
            # Layout groups are emitted after the page's own children.
            emit('<layout>\n')
            for group in item.groups:
                show_group(group)
            emit('</layout>\n')
        emit('</page>\n')
        return

    if isinstance(item, LTLine):
        emit('<line linewidth="%d" bbox="%s" />\n' %
             (item.linewidth, bbox2str(item.bbox)))
        return

    if isinstance(item, LTRect):
        emit('<rect linewidth="%d" bbox="%s" />\n' %
             (item.linewidth, bbox2str(item.bbox)))
        return

    if isinstance(item, LTCurve):
        emit('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
             (item.linewidth, bbox2str(item.bbox), item.get_pts()))
        return

    if isinstance(item, LTFigure):
        emit('<figure name="%s" bbox="%s">\n' %
             (item.name, bbox2str(item.bbox)))
        render_children()
        emit('</figure>\n')
        return

    if isinstance(item, LTTextLine):
        emit('<textline bbox="%s">\n' % bbox2str(item.bbox))
        render_children()
        emit('</textline>\n')
        return

    if isinstance(item, LTTextBox):
        wmode = ' wmode="vertical"' if isinstance(item, LTTextBoxVertical) else ''
        emit('<textbox id="%d" bbox="%s"%s>\n' %
             (item.index, bbox2str(item.bbox), wmode))
        render_children()
        emit('</textbox>\n')
        return

    if isinstance(item, LTChar):
        emit('<text font="%s" bbox="%s" size="%.3f">' %
             (enc(item.fontname), bbox2str(item.bbox), item.size))
        self.write_text(item.get_text())
        emit('</text>\n')
        return

    if isinstance(item, LTText):
        emit('<text>%s</text>\n' % item.get_text())
        return

    if isinstance(item, LTImage):
        if self.imagewriter is not None:
            # An image writer is configured: export the image and link it.
            name = self.imagewriter.export_image(item)
            emit('<image src="%s" width="%d" height="%d" />\n' %
                 (enc(name), item.width, item.height))
        else:
            emit('<image width="%d" height="%d" />\n' %
                 (item.width, item.height))
        return

    assert False, item
def begin_tag(self, tag, props=None):
    """Write the opening ``<tag ...>`` markup and remember ``tag`` as current.

    tag: object whose ``name`` attribute is used as the element name.
    props: optional mapping of attribute names to values.  Attributes are
        written in sorted key order; values are converted with ``str`` and
        both sides are passed through ``enc``.  A falsy ``props`` writes no
        attributes.
    """
    s = ''
    if props:
        # ``items()`` exists on both Python 2 and 3; ``iteritems()`` does not.
        s = ''.join(' %s="%s"' % (enc(k), enc(str(v)))
                    for (k, v) in sorted(props.items()))
    self.outfp.write('<%s%s>' % (enc(tag.name), s))
    self.tag = tag
    return
def begin_tag(self, tag, props=None):
    """Write the opening ``<tag ...>`` markup and push ``tag`` on ``self._stack``.

    tag: object whose ``name`` attribute is used as the element name.
    props: attributes to write; only honoured when it is a ``dict``.
        Attributes are written in sorted key order; values are converted
        with ``str`` and both sides are passed through ``enc``.
    """
    s = ''
    if isinstance(props, dict):
        # ``items()`` exists on both Python 2 and 3; ``iteritems()`` does not.
        s = ''.join(' %s="%s"' % (enc(k), enc(str(v)))
                    for (k, v) in sorted(props.items()))
    self.outfp.write('<%s%s>' % (enc(tag.name), s))
    self._stack.append(tag)
    return
def begin_tag(self, tag, props=None):
    """Write the opening ``<tag ...>`` markup and store ``tag`` in ``self.tag``.

    tag: object whose ``name`` attribute is used as the element name.
    props: optional mapping of attribute names to values, written in sorted
        key order.  Values are converted with ``str`` and both names and
        values are passed through ``enc``.  A falsy ``props`` writes none.
    """
    s = ''
    if props:
        # ``items()`` exists on both Python 2 and 3; ``iteritems()`` does not.
        s = ''.join(' %s="%s"' % (enc(k), enc(str(v)))
                    for (k, v) in sorted(props.items()))
    self.outfp.write('<%s%s>' % (enc(tag.name), s))
    self.tag = tag
    return
def render(item):
    """Write ``item`` and, recursively, its children to ``self.outfp`` as XML.

    ``self`` is not a parameter; it is resolved from the enclosing scope.
    Type checks are tried in a fixed order and the first match wins; an
    unmatched item fails the final assertion.
    """
    emit = self.outfp.write

    def render_children():
        for child in item:
            render(child)

    if isinstance(item, LTPage):
        emit('<page id="%s" bbox="%s" rotate="%d">\n' %
             (item.pageid, bbox2str(item.bbox), item.rotate))
        render_children()
        emit('</page>\n')
        return

    if isinstance(item, LTLine):
        emit('<line linewidth="%d" bbox="%s" />\n' %
             (item.linewidth, bbox2str(item.bbox)))
        return

    if isinstance(item, LTRect):
        emit('<rect linewidth="%d" bbox="%s" />\n' %
             (item.linewidth, bbox2str(item.bbox)))
        return

    if isinstance(item, LTPolygon):
        emit('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' %
             (item.linewidth, bbox2str(item.bbox), item.get_pts()))
        return

    if isinstance(item, LTFigure):
        emit('<figure name="%s" bbox="%s">\n' %
             (item.name, bbox2str(item.bbox)))
        render_children()
        emit('</figure>\n')
        return

    if isinstance(item, LTTextLine):
        emit('<textline bbox="%s">\n' % bbox2str(item.bbox))
        render_children()
        emit('</textline>\n')
        return

    if isinstance(item, LTTextBox):
        emit('<textbox id="%d" bbox="%s">\n' %
             (item.index, bbox2str(item.bbox)))
        render_children()
        emit('</textbox>\n')
        return

    if isinstance(item, LTChar):
        # The attribute (with its trailing space) is only present for
        # vertical characters.
        vertical = 'vertical="true" ' if item.is_vertical() else ''
        emit('<text font="%s" %sbbox="%s" size="%.3f">' %
             (enc(item.font.fontname), vertical, bbox2str(item.bbox),
              item.get_size()))
        self.write(item.text)
        emit('</text>\n')
        return

    if isinstance(item, LTText):
        emit('<text>%s</text>\n' % item.text)
        return

    if isinstance(item, LTImage):
        if self.outdir:
            name = self.write_image(item)
            emit('<image src="%s" width="%d" height="%d" />\n' %
                 (enc(name), item.width, item.height))
        else:
            emit('<image width="%d" height="%d" />\n' %
                 (item.width, item.height))
        return

    assert False, item
def gen_url(self):
    """Yield video page URLs for videos whose ``tag`` is missing or empty.

    Runs forever.  Each pass queries up to 100 documents from ``db.video``
    and yields one URL per document.  When a document has no ``bvid`` the id
    is derived from ``aid`` with ``enc``.  Any exception is logged and
    followed by a 10 second pause before the next pass.
    """
    while True:
        try:
            videos = db.video.find(
                {'$or': [{'tag': []}, {'tag': {'$exists': False}}]},
                {'aid': 1, 'bvid': 1}).limit(100)
            for each_video in videos:
                if 'bvid' in each_video:
                    bvid = each_video['bvid']
                else:
                    bvid = enc(each_video['aid'])
                    # The URL template below already supplies "BV"; drop the
                    # prefix from the encoded id so it is not doubled.
                    if bvid.startswith('BV'):
                        bvid = bvid[2:]
                yield 'https://www.bilibili.com/video/BV{}'.format(bvid)
        except Exception as e:
            logging.exception(e)
            sleep(10)
async def gen_url(self):
    """Yield tag-API URLs for videos stored after the newest tagged video.

    Runs forever.  Each pass looks up the document with the highest ``_id``
    that has a ``tag`` field, then yields one URL for each of up to 30
    documents with a greater ``_id``.  When a pass yields nothing, or raises,
    the generator pauses for one second before trying again.
    """
    while True:
        try:
            last_tag = await self.async_db.video_info.find_one(
                {'tag': {'$exists': True}},
                {'aid': 1, 'bvid': 1},
                sort=[('_id', -1)])
            # find_one returns None when no document carries a tag yet; in
            # that case there is no lower bound and every document qualifies.
            if last_tag is None:
                query = {}
            else:
                query = {'_id': {'$gt': last_tag['_id']}}
            videos = self.async_db.video_info.find(
                query, {'aid': 1, 'bvid': 1}).limit(30)
            found = False
            async for each_video in videos:
                found = True
                if 'bvid' in each_video:
                    bvid = each_video['bvid']
                else:
                    bvid = enc(each_video['aid'])
                yield 'https://api.bilibili.com/x/tag/archive/tags?bvid={}'.format(
                    bvid)
            if not found:
                await asyncio.sleep(1)
        except Exception as e:
            logging.exception(e)
            # Back off so a persistent failure does not spin the loop.
            await asyncio.sleep(1)
def update_video_interval(self, interval: int, aid, bvid):
    """Build the interval record for a video, due immediately.

    interval: value stored under ``'interval'``.
    aid / bvid: the two ids of the video.  If one of them is ``None`` it is
        derived from the other.

    Returns a dict with keys ``'next'``, ``'interval'``, ``'aid'`` and
    ``'bvid'``; ``'next'`` is the current UTC time shifted by +8 hours.
    """
    now = datetime.utcnow() + timedelta(hours=8)
    # Elsewhere in this code ``enc`` maps aid -> bvid and ``dec`` maps
    # bvid -> aid, so each missing id is derived with the matching function.
    # NOTE(review): confirm whether ``dec`` expects the "BV" prefix here.
    if aid is None:
        aid = dec(bvid)
    if bvid is None:
        bvid = enc(aid)
    return {'next': now, 'interval': interval, 'aid': aid, 'bvid': bvid}
def render(item):
    """Write ``item`` and, recursively, its children to ``self.outfp`` as XML.

    ``self`` is not a parameter; it is resolved from the enclosing scope.
    Type checks are tried in a fixed order and the first match wins; an
    unmatched item fails the final assertion.
    """
    emit = self.outfp.write

    def render_children():
        for child in item:
            render(child)

    if isinstance(item, LTPage):
        emit('<page id="%s" bbox="%s" rotate="%d">\n' %
             (item.id, strbbox(item.bbox), item.rotate))
        render_children()
        emit('</page>\n')
        return

    # A line with a falsy ``direction`` is not handled here and falls
    # through to the checks below.
    if isinstance(item, LTLine) and item.direction:
        emit('<line linewidth="%d" direction="%s" bbox="%s" />\n' %
             (item.linewidth, item.direction, strbbox(item.bbox)))
        return

    if isinstance(item, LTRect):
        emit('<rect linewidth="%d" bbox="%s" />\n' %
             (item.linewidth, strbbox(item.bbox)))
        return

    if isinstance(item, LTPolygon):
        emit('<polygon linewidth="%d" bbox="%s" pts="%s"/>\n' %
             (item.linewidth, strbbox(item.bbox), item.get_pts()))
        return

    if isinstance(item, LTFigure):
        emit('<figure id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
        render_children()
        emit('</figure>\n')
        return

    if isinstance(item, LTTextLine):
        emit('<textline bbox="%s">\n' % strbbox(item.bbox))
        render_children()
        emit('</textline>\n')
        return

    if isinstance(item, LTTextBox):
        emit('<textbox id="%s" bbox="%s">\n' % (item.id, strbbox(item.bbox)))
        render_children()
        emit('</textbox>\n')
        return

    if isinstance(item, LTTextItem):
        emit('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
             (enc(item.font.fontname), item.is_vertical(),
              strbbox(item.bbox), item.fontsize))
        self.write(item.text)
        emit('</text>\n')
        return

    if isinstance(item, LTText):
        emit('<text>%s</text>\n' % item.text)
        return

    if isinstance(item, LTImage):
        # The name attribute is only written when an output directory is set
        # and write_image returned a truthy name.
        x = ''
        if self.outdir:
            name = self.write_image(item)
            if name:
                x = 'name="%s" ' % enc(name)
        emit('<image %stype="%s" width="%d" height="%d" />\n' %
             (x, item.type, item.width, item.height))
        return

    assert False, item
def video_gen(self):
    """Yield due ``video_interval`` documents, guarded by a database lock.

    Runs forever.  Each pass:

    1. waits until no ``video_interval`` document exists in ``db.lock``,
       then inserts one;
    2. collects up to 100 documents with a non-empty ``order`` list and up
       to 100 documents whose ``next`` is in the past;
    3. marks every referenced ``user_record`` as executed, advances
       ``next`` by ``interval`` seconds, clears ``order``, fills in a
       missing ``aid``/``bvid`` and writes the document back;
    4. removes the lock document and yields the collected documents.

    Exceptions are logged and the loop continues; documents collected in a
    failed pass are not yielded.
    """
    while True:
        # Someone else holds the lock: wait and retry.
        if self.db.lock.count_documents({"name": "video_interval"}):
            sleep(0.1)
            continue
        # Acquire the lock.
        self.db.lock.insert_one({
            "name": "video_interval",
            "date": datetime.datetime.utcnow()
        })
        try:
            d = []
            try:
                data = self.db.video_interval.find({
                    'order': {
                        '$exists': True,
                        '$ne': []
                    }
                }).hint("idx_order").limit(100)
                for each in data:
                    d.append(each)
                data = self.db.video_interval.find({
                    'next': {
                        '$lt': datetime.datetime.utcnow()
                    }
                }).limit(100)
                for each in data:
                    d.append(each)
                for data in d:
                    # Manual requests are pending: mark them as executed.
                    if 'order' in data:
                        for order_id in data['order']:
                            self.db.user_record.update_one(
                                {'_id': order_id},
                                {'$set': {
                                    'isExecuted': True
                                }})
                    data['next'] = data['next'] + \
                        datetime.timedelta(seconds=data['interval'])
                    data['order'] = []
                    if 'bvid' not in data:
                        bvid = enc(data['aid'])
                        # Remove the literal "BV" prefix; lstrip("BV") would
                        # strip any run of leading 'B'/'V' characters.
                        if bvid.startswith("BV"):
                            bvid = bvid[2:]
                        data['bvid'] = bvid
                    if 'aid' not in data:
                        aid = dec(data['bvid'])
                        data['aid'] = aid
                    self.db.video_interval.update_one(
                        {
                            'aid': data['aid'],
                            'bvid': data['bvid']
                        }, {'$set': data})
            finally:
                # Release the lock even when processing failed; otherwise
                # every later pass would wait on it forever.
                self.db.lock.delete_one({"name": "video_interval"})
            for data in d:
                yield data
        except Exception as e:
            self.logger.exception(e)
def place_image(self, item, borderwidth, x, y, w, h):
    """Export ``item`` and write an absolutely positioned ``<img>`` for it.

    Does nothing when no image writer is configured.  ``x``, ``w`` and ``h``
    are multiplied by ``self.scale``; the top offset is
    ``(self._yoffset - y) * self.scale``.
    """
    if self.imagewriter is None:
        return
    name = self.imagewriter.export_image(item)
    values = (enc(name), borderwidth,
              x * self.scale, (self._yoffset - y) * self.scale,
              w * self.scale, h * self.scale)
    template = ('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
                'width="%d" height="%d" />\n')
    self.write(template % values)
    return
def update_video_interval(self, interval: int, aid, bvid):
    """Build the interval record for a video, due since the Unix epoch.

    interval: value stored under ``'interval'``.
    aid / bvid: the two ids of the video.  If one of them is ``None`` it is
        derived from the other.

    Returns a dict with keys ``'next'``, ``'interval'``, ``'aid'`` and
    ``'bvid'``; ``'next'`` is the naive datetime 1970-01-01 00:00:00.
    """
    # Elsewhere in this code ``enc`` maps aid -> bvid and ``dec`` maps
    # bvid -> aid, so each missing id is derived with the matching function.
    # NOTE(review): confirm whether ``dec`` expects the "BV" prefix here.
    if aid is None:
        aid = dec(bvid)
    if bvid is None:
        bvid = enc(aid)
    return {
        # Same value as the deprecated datetime.utcfromtimestamp(0).
        'next': datetime(1970, 1, 1),
        'interval': interval,
        'aid': aid,
        'bvid': bvid
    }
def render_string(self, textstate, seq):
    """Decode the string objects in ``seq`` and return the encoded text.

    Non-``str`` entries of ``seq`` are ignored.  Every character id produced
    by the font's ``decode`` is converted with ``to_unichr``; ids that raise
    ``PDFUnicodeNotDefined`` are dropped.  The joined text is passed through
    ``enc`` with ``self.codec``.
    """
    font = textstate.font
    pieces = []
    for obj in seq:
        if isinstance(obj, str):
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    continue
    return enc(''.join(pieces), self.codec)
def render_string(self, textstate, seq):
    """Decode the string objects in ``seq`` and write the text to ``self.outfp``.

    Non-``str`` entries of ``seq`` are ignored.  Every character id produced
    by the font's ``decode`` is converted with ``to_unichr``; ids that raise
    ``PDFUnicodeNotDefined`` are dropped.  The joined text is passed through
    ``enc`` with ``self.codec`` before being written.
    """
    font = textstate.font
    pieces = []
    for obj in seq:
        if isinstance(obj, str):
            for cid in font.decode(obj):
                try:
                    pieces.append(font.to_unichr(cid))
                except PDFUnicodeNotDefined:
                    continue
    self.outfp.write(enc(''.join(pieces), self.codec))
    return
def write_image(self, image):
    """Save a DCT-encoded image under ``self.outdir`` and write its ``<img>``.

    Images whose ``type`` is not in ``LITERALS_DCT_DECODE`` are skipped.
    Otherwise ``image.data`` is written to ``<outdir>/<image.name>.jpg`` and
    an absolutely positioned ``<img>`` element referring to that file name
    is written to ``self.outfp``.  Always returns ``None``.
    """
    if image.type not in LITERALS_DCT_DECODE:
        return
    ext = '.jpg'
    name = image.name + ext
    path = os.path.join(self.outdir, name)
    # ``open`` works on Python 2 and 3 (``file`` is Python 2 only) and the
    # ``with`` block closes the handle even if the write fails.
    with open(path, 'wb') as fp:
        fp.write(image.data)
    self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
                     'width="%d" height="%d" />\n' %
                     (enc(name),
                      image.x0 * self.scale, (self.yoffset - image.y1) * self.scale,
                      image.width * self.scale, image.height * self.scale))
    return
def render(item):
    """Write ``item`` and, recursively, its children as positioned HTML.

    ``self`` is not a parameter; it is resolved from the enclosing scope.
    Type checks are tried in a fixed order and the first match wins; items
    matching none of them are ignored.
    """
    def outline(colour):
        self.write_rect(colour, 1, item.x0, item.y1, item.width, item.height)

    def render_children():
        for child in item:
            render(child)

    if isinstance(item, LTPage):
        self.yoffset += item.y1
        outline('gray')
        if self.showpageno:
            self.outfp.write('<div style="position:absolute; top:%dpx;">' %
                             ((self.yoffset - item.y1) * self.scale))
            # Use the page being rendered rather than a name from the
            # enclosing scope for the anchor.
            self.outfp.write('<a name="%s">Page %s</a></div>\n' %
                             (item.pageid, item.pageid))
        render_children()
        return

    if isinstance(item, LTChar):
        self.write_text(item.text, item.x0, item.y1, item.get_size())
        if self.debug:
            outline('red')
        return

    if isinstance(item, LTPolygon):
        outline('black')
        return

    if isinstance(item, LTTextLine):
        render_children()
        return

    if isinstance(item, LTTextBox):
        outline('blue')
        render_children()
        if self.debug:
            # Label the box with its 1-based index.
            self.write_text(str(item.index + 1), item.x0, item.y1, 20)
        return

    if isinstance(item, LTFigure):
        outline('green')
        render_children()
        return

    if isinstance(item, LTImage):
        if self.outdir:
            name = self.write_image(item)
            self.outfp.write(
                '<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
                'width="%d" height="%d" />\n' %
                (enc(name), item.x0 * self.scale,
                 (self.yoffset - item.y1) * self.scale,
                 item.width * self.scale, item.height * self.scale))
    return
def render(item):
    """Write ``item`` and, recursively, its children as positioned HTML.

    ``self`` is not a parameter; it is resolved from the enclosing scope.
    Type checks are tried in a fixed order and the first match wins; items
    matching none of them are ignored.
    """
    def outline(colour):
        self.write_rect(colour, 1, item.x0, item.y1, item.width, item.height)

    def render_children():
        for child in item:
            render(child)

    if isinstance(item, LTPage):
        self.yoffset += item.y1
        outline('gray')
        if self.showpageno:
            self.outfp.write('<div style="position:absolute; top:%dpx;">' %
                             ((self.yoffset - item.y1) * self.scale))
            self.outfp.write('<a name="%s">Page %s</a></div>\n' %
                             (item.pageid, item.pageid))
        render_children()
        return

    if isinstance(item, LTChar):
        self.write_text(item.text, item.x0, item.y1, item.get_size())
        if self.debug:
            outline('green')
        return

    if isinstance(item, LTPolygon):
        outline('black')
        return

    if isinstance(item, LTTextLine):
        outline('magenta')
        render_children()
        return

    if isinstance(item, LTTextBox):
        outline('cyan')
        render_children()
        if self.debug:
            # Label the box with its 1-based index.
            self.write_text(str(item.index + 1), item.x0, item.y1, 20)
        return

    if isinstance(item, LTFigure):
        outline('yellow')
        render_children()
        return

    if isinstance(item, LTImage):
        if self.outdir:
            name = self.write_image(item)
            self.outfp.write(
                '<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
                'width="%d" height="%d" />\n' %
                (enc(name), item.x0 * self.scale,
                 (self.yoffset - item.y1) * self.scale,
                 item.width * self.scale, item.height * self.scale))
    return
def video_gen_without_lock(self):
    """Yield due ``video_interval`` documents without taking the lock.

    Runs forever.  Each pass collects up to 100 documents whose ``next`` is
    in the past, marks every referenced ``user_record`` as executed,
    advances ``next`` by ``interval`` seconds, clears ``order``, fills in a
    missing ``aid``/``bvid``, writes the document back and finally yields
    the collected documents.  Exceptions are logged and the loop continues.
    """
    while True:
        try:
            d = []
            data = self.db.video_interval.find({
                'next': {
                    '$lt': datetime.datetime.utcnow()
                }
            }).hint("idx_next").limit(100)
            for each in data:
                d.append(each)
            for data in d:
                # Manual requests are pending: mark them as executed.
                if 'order' in data:
                    for order_id in data['order']:
                        self.db.user_record.update_one(
                            {'_id': order_id},
                            {'$set': {
                                'isExecuted': True
                            }})
                data['next'] = data['next'] + \
                    datetime.timedelta(seconds=data['interval'])
                data['order'] = []
                if 'bvid' not in data:
                    bvid = enc(data['aid'])
                    # Remove the literal "BV" prefix; lstrip("BV") would
                    # strip any run of leading 'B'/'V' characters.
                    if bvid.startswith("BV"):
                        bvid = bvid[2:]
                    data['bvid'] = bvid
                if 'aid' not in data:
                    aid = dec(data['bvid'])
                    data['aid'] = aid
                self.db.video_interval.update_one(
                    {
                        'aid': data['aid'],
                        'bvid': data['bvid']
                    }, {'$set': data})
            for data in d:
                yield data
        except Exception as e:
            self.logger.exception(e)
async def video_gen_without_lock(self):
    """Yield due ``video_interval`` documents, oldest ``next`` first.

    Runs forever.  Each pass reads up to ``batch`` documents whose ``next``
    is not in the future and normalises their ids in memory:

    * ``aid`` missing, ``bvid`` present and non-empty: ``aid`` is decoded
      from ``bvid``;
    * ``aid`` present, ``bvid`` missing or empty: ``bvid`` is encoded from
      ``aid`` (stored without the "BV" prefix);
    * both present and ``aid`` is a positive non-string value: unchanged;
    * anything else: the document is deleted from the collection.

    Documents that have an ``aid`` are then yielded, at most once per
    ``aid`` within a pass.  When a pass returned fewer than half a batch the
    generator sleeps for 10 seconds.  Exceptions are logged and the loop
    continues.
    """
    def without_prefix(value):
        # Remove one literal "BV" prefix; lstrip('BV') would strip any run of
        # leading 'B'/'V' characters.
        return value[2:] if value.startswith('BV') else value

    last_data = set()
    batch = 2000
    while True:
        try:
            d = []
            data = self.async_db.video_interval.find({
                'next': {
                    '$lte': datetime.datetime.utcnow()
                }
            }).sort([('next', 1)]).hint("idx_next").limit(batch)
            async for each in data:
                has_aid = 'aid' in each
                # Compare the stored value, not the literal key name, with ''.
                has_bvid = 'bvid' in each and each['bvid'] != ''
                if not has_aid and has_bvid:
                    each['aid'] = dec('BV' + without_prefix(each['bvid']))
                elif has_aid and not has_bvid:
                    each['bvid'] = without_prefix(enc(each['aid']))
                elif (has_aid and has_bvid
                      and each['aid'] is not None
                      and not isinstance(each['aid'], str)
                      and each['aid'] > 0):
                    pass
                else:
                    await self.async_db.video_interval.delete_one(
                        {'_id': each['_id']})
                # NOTE(review): deleted documents are still collected and, if
                # they carry an ``aid``, yielded below - confirm this is
                # intended.
                d.append(each)
            for data in d:
                if 'aid' not in data:
                    continue
                if data['aid'] not in last_data:
                    last_data.add(data['aid'])
                    yield data
            last_data = set()
            if len(d) < batch / 2:
                await asyncio.sleep(10)
        except Exception as e:
            self.logger.exception(e)
def write_text(self, text):
    """Pass ``text`` through ``enc`` with ``self.codec`` and write it to ``self.outfp``."""
    out = self.outfp
    out.write(enc(text, self.codec))
    return
async def video_gen(self):
    """Yield due ``video_interval`` documents, guarded by a database lock.

    Runs forever.  Each pass:

    1. waits until no ``video_interval`` document exists in the ``lock``
       collection, then inserts one;
    2. collects up to 100 documents with a non-empty ``order`` list and up
       to 100 documents whose ``next`` is in the past;
    3. marks every referenced ``user_record`` as executed, advances
       ``next`` by ``interval`` seconds, clears ``order``, fills in a
       missing ``aid``/``bvid`` and writes the document back (documents
       whose id conversion fails are deleted and skipped);
    4. removes the lock document and yields the collected documents.

    Exceptions are logged and the loop continues; documents collected in a
    failed pass are not yielded.
    """
    import asyncio  # local import: needed for the non-blocking wait below

    while True:
        # Someone else holds the lock: wait without blocking the event loop.
        if await self.async_db.lock.count_documents(
                {"name": "video_interval"}):
            await asyncio.sleep(0.1)
            continue
        # Acquire the lock.
        await self.async_db.lock.insert_one({
            "name": "video_interval",
            "date": datetime.datetime.utcnow()
        })
        try:
            d = []
            try:
                # The cursor is iterated with ``async for``; it is not
                # awaited itself.
                data = self.async_db.video_interval.find({
                    'order': {
                        '$exists': True,
                        '$ne': []
                    }
                }).hint("idx_order").limit(100)
                async for each in data:
                    d.append(each)
                data = self.async_db.video_interval.find({
                    'next': {
                        '$lt': datetime.datetime.utcnow()
                    }
                }).limit(100)
                async for each in data:
                    d.append(each)
                for data in d:
                    # Manual requests are pending: mark them as executed.
                    if 'order' in data:
                        for order_id in data['order']:
                            await self.async_db.user_record.update_one(
                                {'_id': order_id},
                                {'$set': {
                                    'isExecuted': True
                                }})
                    data['next'] = data['next'] + \
                        datetime.timedelta(seconds=data['interval'])
                    data['order'] = []
                    try:
                        if 'aid' not in data:
                            data['aid'] = dec('BV' + data['bvid'])
                            filt = {'bvid': data['bvid']}
                        elif 'bvid' not in data:
                            bvid = enc(data['aid'])
                            # Remove the literal "BV" prefix; lstrip("BV")
                            # would strip any run of leading 'B'/'V' chars.
                            if bvid.startswith("BV"):
                                bvid = bvid[2:]
                            data['bvid'] = bvid
                            filt = {'aid': data['aid']}
                        else:
                            filt = {'bvid': data['bvid']}
                    except Exception:
                        # The ids could not be converted: drop the record.
                        if 'aid' in data:
                            await self.async_db.video_interval.delete_many(
                                {'aid': data['aid']})
                        else:
                            await self.async_db.video_interval.delete_many(
                                {'bvid': data['bvid']})
                        continue
                    # NOTE(review): after deleting duplicates the update below
                    # matches nothing (no upsert) - confirm this is intended.
                    if await self.async_db.video_interval.count_documents(
                            filt) > 1:
                        await self.async_db.video_interval.delete_many(filt)
                    await self.async_db.video_interval.update_one(
                        filt, {'$set': data})
            finally:
                # Release the lock even when processing failed; otherwise
                # every later pass would wait on it forever.
                await self.async_db.lock.delete_one(
                    {"name": "video_interval"})
            for data in d:
                yield data
        except Exception as e:
            self.logger.exception(e)
def end_tag(self):
    """Pop the most recently pushed tag from ``self._stack`` and write its closing markup.

    Asserts that the stack is not empty.
    """
    stack = self._stack
    assert stack
    closing = stack.pop()
    self.outfp.write('</%s>' % enc(closing.name))
    return
def end_tag(self):
    """Write the closing markup for ``self.tag`` and reset ``self.tag`` to ``None``.

    Asserts that a tag is currently set.
    """
    assert self.tag
    closing_name = enc(self.tag.name)
    self.outfp.write('</%s>' % closing_name)
    self.tag = None
    return
from db import db
from utils import dec, enc

# Backfill: for every video document without a ``bvid``, derive it from the
# ``aid`` with ``enc``, store it without the "BV" prefix and print the pair.
for each_video in db.video.find({'bvid': {'$exists': False}},
                                {'aid': 1, 'bvid': 1}).batch_size(20):
    bvid = enc(each_video['aid'])
    # Remove the literal "BV" prefix; lstrip('BV') would strip any run of
    # leading 'B'/'V' characters.
    if bvid.startswith('BV'):
        bvid = bvid[2:]
    db.video.update_one({'aid': each_video['aid']}, {'$set': {'bvid': bvid}})
    print(each_video['aid'], bvid)
def write_text(self, text):
    """Write ``text`` to ``self.outfp`` after passing it through ``enc``.

    When ``self.stripcontrol`` is truthy, every match of ``self.CONTROL`` is
    removed from the text first.
    """
    cleaned = self.CONTROL.sub(u'', text) if self.stripcontrol else text
    self.outfp.write(enc(cleaned, self.codec))
    return
def render(item):
    """Write ``item`` and, recursively, its children to ``self.outfp`` as XML.

    ``self`` and ``show_group`` are not defined here; both are resolved from
    the enclosing scope.  ``self.simplerOutput`` drops the ``<layout>`` and
    ``<textbox>`` wrappers and renames ``<textline>`` to ``<text>``;
    ``self.layoutmode == 'exact'`` wraps each character in its own
    ``<text>`` element.  Type checks are tried in a fixed order and the first
    match wins; an unmatched item fails the final assertion.
    """
    emit = self.outfp.write

    def render_children():
        for child in item:
            render(child)

    if isinstance(item, LTPage):
        # Remember the page's top edge (its maximum Y coordinate).
        self._yoffset = item.y1
        emit('<page number="%s" id="%s" %s rotate="%d">\n' %
             (item.pageid, item.pageid, self.scaled_bbox(item), item.rotate))
        render_children()
        if item.groups is not None:
            if self.simplerOutput:
                for group in item.groups:
                    show_group(group)
            else:
                emit('<layout>\n')
                for group in item.groups:
                    show_group(group)
                emit('</layout>\n')
        emit('</page>\n')
        return

    if isinstance(item, LTLine):
        emit('<line linewidth="%d" %s />\n' %
             (item.linewidth, self.scaled_bbox(item)))
        return

    if isinstance(item, LTRect):
        emit('<rect linewidth="%d" %s />\n' %
             (item.linewidth, self.scaled_bbox(item)))
        return

    if isinstance(item, LTCurve):
        emit('<curve linewidth="%d" %s pts="%s" />\n' %
             (item.linewidth, self.scaled_bbox(item), item.get_pts()))
        return

    if isinstance(item, LTFigure):
        emit('<figure name="%s" %s>\n' % (item.name, self.scaled_bbox(item)))
        render_children()
        emit('</figure>\n')
        return

    if isinstance(item, LTTextLine):
        tagName = 'text' if self.simplerOutput else 'textline'
        emit('<%s %s>\n' % (tagName, self.scaled_bbox(item)))
        render_children()
        emit('</%s>\n' % tagName)
        return

    if isinstance(item, LTTextBox):
        if self.simplerOutput:
            # Simplified output has no textbox wrapper, only the children.
            render_children()
        else:
            wmode = ' wmode="vertical"' if isinstance(item, LTTextBoxVertical) else ''
            emit('<textbox id="%d" %s %s>\n' %
                 (item.index, self.scaled_bbox(item), wmode))
            render_children()
            emit('</textbox>\n')
        return

    if isinstance(item, LTChar):
        if self.layoutmode == 'exact':
            emit('<text font="%s" %s size="%.3f">' %
                 (enc(item.fontname), self.scaled_bbox(item), item.size))
            self.write_text(item.get_text())
            emit('</text>\n')
        else:
            self.write_text(item.get_text())
        return

    if isinstance(item, LTText):
        if self.layoutmode == 'exact':
            emit('<text>%s</text>\n' % item.get_text())
        else:
            self.write_text(item.get_text())
        return

    if isinstance(item, LTImage):
        if self.imagewriter is not None:
            name = self.imagewriter.export_image(item)
            emit('<image src="%s" width="%d" height="%d" />\n' %
                 (enc(name), item.width, item.height))
        else:
            emit('<image width="%d" height="%d" />\n' %
                 (item.width, item.height))
        return

    assert False, item