def no_fonts(pq):
    """Strip <font> wrappers from a markup fragment - currently disabled.

    The original implementation (removed here as unreachable dead code)
    wrapped each <font> node in PyQuery and replaced it with its inner HTML,
    but mixing lxml etree nodes with PyQuery wrappers meant nested <font>
    tags were never fully unwrapped - it only went two levels deep. The
    author disabled it in favour of running the markup through tidy.

    Raises:
        Exception: always, on every call.
    """
    # NOTE: this function is defined twice in this module; both copies
    # behave identically (they raise immediately).
    # Py3-compatible raise syntax (was: raise Exception, "...").
    raise Exception("yuk - it's a mess, use tidy!")
def no_fonts(pq):
    """Strip <font> wrappers from a markup fragment - currently disabled.

    Duplicate of the identical definition above; whichever is defined last
    wins at import time. The body after the raise was unreachable dead code
    (the lxml-etree/PyQuery mix never removed nested <font> tags beyond two
    levels), so it has been removed; the author's advice was to use tidy.

    Raises:
        Exception: always, on every call.
    """
    # Py3-compatible raise syntax (was: raise Exception, "...").
    raise Exception("yuk - it's a mess, use tidy!")
def __processInstagramTag(self, i, e):
    """pyquery .each() callback: swap an embedded Instagram tag for an <img>.

    Pulls the permalink from the element's first <a>, extracts the post
    shortcode, resolves it to a direct image URL via
    self.getInstagramImageUrl, and replaces the whole element with a plain
    <img> tag pointing at that image.
    """
    obj = PyQuery(e)
    url = obj('a').attr('href')
    # BUGFIX: also accept https permalinks, and don't raise AttributeError
    # when the href doesn't look like an Instagram post URL (match is None).
    match = re.match(r"https?://.*/p/(.*)/", url or '')
    if match is None:
        return
    shortCode = match.group(1)
    imageUrl = self.getInstagramImageUrl(shortCode)
    newObj = PyQuery("<img />")
    newObj.attr('src', imageUrl)
    obj.replaceWith(newObj)
def __processImageTag(self, i, e):
    """pyquery .each() callback: materialise a lazy-loaded image element.

    Elements hidden via an inline ``display: none`` style are dropped
    entirely; otherwise the element is replaced with a real <img> whose
    src comes from the ``rel:bf_image_src`` attribute, preserving the
    original style, width and height.
    """
    obj = PyQuery(e)
    style = obj.attr('style')
    # Hidden placeholder: remove it instead of converting.
    if style is not None and 'display: none' in style:
        obj.remove()
        return
    newObj = PyQuery("<img />")
    newObj.attr('src', obj.attr('rel:bf_image_src'))
    newObj.attr('style', obj.attr('style'))
    newObj.width(obj.width())
    newObj.height(obj.height())
    obj.replaceWith(newObj)
def clean_body(body):
    """Convert WordPress-flavoured HTML into markdown-ish plain text.

    Unwraps <p> tags into blank-line-separated paragraphs, maps WordPress
    alignment classes/styles onto Bootstrap equivalents, turns <br> variants
    into hard line breaks, then normalises line endings and surrounding
    whitespace. Returns the cleaned body string.

    Changes from the original: an unused ``Site.objects.get_current()``
    lookup was removed, and two whitespace-collapse loops that ran before
    ``body`` was overwritten by ``html.html()`` (dead work) were dropped.
    """
    html = PyQuery('<body>' + body + '</body>')
    # Unwrap paragraphs into blank-line-separated text.
    for p in html('p'):
        p = PyQuery(p)
        p.replaceWith('\n\n%s\n\n' % p.html())
    # Map WordPress alignment markup onto Bootstrap classes. Order matters:
    # inline float styles are converted to align* classes only after the
    # pre-existing align* classes have been rewritten.
    html('.alignright').addClass('pull-right').removeClass('alignright')
    html('.alignleft').addClass('pull-left').removeClass('alignleft')
    html('[style="float: left;"]').removeAttr('style').addClass('alignleft')
    html('[style="float: right;"]').removeAttr('style').addClass('alignright')
    body = html.html()
    # <br> variants become hard line breaks (trailing space = markdown break).
    body = body.replace('<br />', ' \n')
    body = body.replace('<br/>', ' \n')
    body = body.replace('<br>', ' \n')
    # Normalise line endings and collapse runs of blank lines.
    body = body.replace('\r\n', '\n')
    body = body.replace('\n\r', '\n')
    while '\n\n\n' in body:
        body = body.replace('\n\n\n', '\n\n')
    # Trim newlines, carriage returns and tabs from both edges (the original
    # loop cascade missed interleaved \r/\n sequences and trailing tabs).
    return body.strip('\n\r\t')
def sanitise(self, text, markdown = True):
    """Remove unsafe markup from *text*.

    When *markdown* is true the input is first rendered to HTML with md()
    and the result is converted back to markdown at the end. javascript:
    links are flattened to their visible text, blacklisted tags are removed
    outright, and blacklisted attributes are stripped wherever they appear.
    """
    if markdown:
        text = md(text)
    dom = PyQuery(text)
    # Replace javascript: anchors with their plain-text content.
    for anchor in dom.find('a[href^="javascript:"]'):
        anchor = PyQuery(anchor)
        anchor.replaceWith(anchor.text())
    # Drop blacklisted elements wholesale.
    for tag in UNCLEAN_TAGS:
        dom.find(tag).remove()
    # Strip blacklisted attributes from any element carrying them.
    for attr in UNCLEAN_ATTRS:
        dom.find('[%s]' % attr).removeAttr(attr)
    text = dom.outerHtml()
    if markdown:
        text = HTML2Text().handle(text)
    return text
def sanitise(text, markdown=False):
    """Remove unsafe markup from *text* (module-level variant).

    When *markdown* is true the input is rendered to HTML with md() first
    and converted back to markdown at the end. javascript: anchors are
    flattened to their visible text; blacklisted tags and attributes are
    stripped.
    """
    if markdown:
        text = md(text)
    dom = PyQuery(text)
    # Replace javascript: anchors with their plain-text content.
    for anchor in dom.find('a[href^="javascript:"]'):
        anchor = PyQuery(anchor)
        anchor.replaceWith(anchor.text())
    # Drop blacklisted elements wholesale.
    for tag in UNCLEAN_TAGS:
        dom.find(tag).remove()
    # Strip blacklisted attributes from any element carrying them.
    for attr in UNCLEAN_ATTRS:
        dom.find('[%s]' % attr).removeAttr(attr)
    text = dom.outerHtml()
    if markdown:
        text = HTML2Text().handle(text)
    return text
def remove_link(index, node):
    """pyquery .each() callback: unwrap an anchor, keeping only its text."""
    wrapped = PyQuery(node)
    wrapped.replaceWith(wrapped.text())
    return wrapped
def handle(self, *args, **options):
    """Import a WordPress WXR (XML export) file, given as args[0], into the blog.

    Maps WP authors onto local users (prompting interactively when no
    username/email match exists), then for each <item>: creates or updates
    the Post, attaches categories, tags, comments and postmeta, downloads
    attachments and rewrites their links in the post body as
    ``[attachment N]`` placeholders, and finally normalises the body
    markup. The whole import runs in one manually-managed transaction that
    is rolled back (and the exception re-raised) on any error.
    """
    xml = ElementTree.parse(open(args[0], 'r'))
    channel = xml.find('channel')

    # Text content of a child of `parent` (defaults to <channel>),
    # optionally namespace-qualified via the XML_NS alias table.
    def node_text(node, namespace = None, parent = None):
        if namespace:
            item = (parent or channel).find(ns(namespace, node))
        else:
            item = (parent or channel).find(node)
        if not item is None:
            return item.text
        return None

    # Expand a namespace alias + tag name into ElementTree's {uri}tag form.
    def ns(n, o):
        return '{%s}%s' % (XML_NS[n], o)

    if channel is None:
        raise CommandError('Cannot find <channel> tag')

    title = node_text('title')
    if title:
        print(u'Blog title: %s' % title)

    link = node_text('link')
    if link:
        print(u'Blog URL: %s' % link)

    description = node_text('description')
    if description:
        print(u'Blog description: %s' % description)

    # Old-WordPress-identifier -> local model instance lookup tables.
    mappings = { 'users': {}, 'posts': {}, 'categories': {}, 'comments': {} }
    content_type = ContentType.objects.get_for_model(Post)
    site = Site.objects.get_current()
    postmeta = {}  # WP post id -> {meta_key: meta_value}
    print

    with transaction.commit_manually():
        try:
            # Pass 1: map every WP author onto a local User, asking the
            # operator when no username or email match exists.
            for author in channel.findall(ns('wp', 'wp_author')):
                username = node_text('author_login', 'wp', author)
                email = node_text('author_email', 'wp', author)
                display_name = node_text('author_display_name', 'wp', author)
                user = None

                if not username:
                    continue

                if display_name:
                    display_name = '%s (%s)' % (username, display_name)
                else:
                    display_name = username

                try:
                    user = User.objects.get(username__iexact = username)
                except User.DoesNotExist:
                    # Fall back to an email match; any failure there is
                    # treated as "no match" (best-effort lookup).
                    if email:
                        try:
                            user = User.objects.get(email__iexact = email)
                        except:
                            pass

                if not user:
                    new_username = raw_input('Map old user %s to a user in your database: ' % display_name)
                    if not new_username:
                        continue

                    while True:
                        try:
                            user = User.objects.get(username__iexact = new_username)
                            break
                        except User.DoesNotExist:
                            new_username = raw_input('User not found. Please try again ,or press Enter to ignore: ')
                            if not new_username:
                                print 'Ignoring user %s' % username
                                break

                if user:
                    mappings['users'][username] = user
                    print 'Mapping user %s to %s' % ( username, user.get_full_name() or user.username )

            # Pass 2: import the posts themselves.
            for item in channel.findall('item'):
                id = node_text('post_id', 'wp', item)
                title = node_text('title', parent = item)
                url = node_text('link', parent = item)
                kind = node_text('post_type', 'wp', item)
                parent = node_text('post_parent', 'wp', item)
                published = node_text('status', 'wp', item) == 'publish'
                author = node_text('creator', 'dc', item)
                date = node_text('post_date_gmt', 'wp', item)
                body = node_text('encoded', 'content', item) or u''

                try:
                    id = int(id)
                except ValueError:
                    continue

                if not date:
                    continue

                try:
                    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S' ).replace( tzinfo = get_current_timezone() )
                except:
                    continue

                # Only import top-level items (no parent) written by an
                # author we managed to map in pass 1.
                try:
                    parent = int(parent)
                except ValueError:
                    continue

                if parent:
                    continue

                if not author:
                    continue

                if not mappings['users'].has_key(author):
                    continue

                author = mappings['users'][author]
                if not kind in ('post', 'page'):
                    continue

                if kind == 'post':
                    try:
                        post = Post.objects.get(title = title, date = date)
                        print 'Updating %s "%s"' % (kind, title)
                    except Post.DoesNotExist:
                        post = Post( title = title, slug = title and slugify(title) or None, date = date, published = published, broadcast = True, author = author )
                        print 'Creating %s "%s"' % (kind, title)
                else:
                    # Pages are recognised but not imported.
                    continue

                post.body = body
                post.save()
                mappings['posts'][id] = post

                # Categories and tags attached to this item.
                for category in item.findall('category'):
                    domain = category.get('domain')
                    slug = category.get('nicename')
                    if not category.text:
                        continue

                    if domain == 'category':
                        if not mappings['categories'].has_key(slug):
                            mappings['categories'][slug], created = Category.objects.get_or_create( name = category.text, slug = slugify(category.text) )
                            if created:
                                print '- Created category "%s"' % category.text
                        post.categories.add( mappings['categories'][slug] )
                    elif domain == 'post_tag':
                        # WordPress quotes multi-word tags; strip the quotes.
                        if category.text.startswith('"') and category.text.endswith('"'):
                            post.tags.add(category.text[1:-1])
                        else:
                            post.tags.add(category.text)

                # Comments; anything with a non-empty comment_type other
                # than 'comment' (pingbacks/trackbacks) is skipped.
                for comment in item.findall(ns('wp', 'comment')):
                    comment_id = node_text('comment_id', 'wp', comment)
                    comment_name = node_text('comment_author', 'wp', comment)
                    comment_email = node_text('comment_author_email', 'wp', comment)
                    comment_url = node_text('comment_author_url', 'wp', comment)
                    comment_date = node_text('comment_date_gmt', 'wp', comment)
                    comment_type = node_text('comment_type', 'wp', comment)
                    comment_body = node_text('comment_content', 'wp', comment)
                    comment_parent = node_text('comment_parent', 'wp', comment)
                    comment_approved = node_text('comment_approved', 'wp', comment) == '1'

                    try:
                        comment_id = int(comment_id)
                    except ValueError:
                        continue

                    try:
                        comment_parent = int(comment_parent)
                    except ValueError:
                        comment_parent = 0

                    try:
                        comment_date = datetime.strptime( comment_date, '%Y-%m-%d %H:%M:%S' ).replace( tzinfo = get_current_timezone() )
                    except:
                        continue

                    if not comment_name:
                        continue

                    if not comment_type or comment_type == 'comment':
                        try:
                            comment = post.comments.get( name = comment_name, sent = comment_date )
                        except Comment.DoesNotExist:
                            comment = Comment( name = comment_name, website = comment_url, email = comment_email or '', sent = comment_date, approved = comment_approved, body = comment_body, content_type = content_type, object_id = post.pk )
                            print '- Comment by %s' % comment_name

                        # NOTE(review): indentation reconstructed from a
                        # collapsed source - the save/mapping pair may
                        # originally have lived inside the except block.
                        comment.save(notify = False)
                        mappings['comments'][comment_id] = comment

                # Post meta, needed below to spot the featured thumbnail.
                postmeta[id] = {}
                for meta in item.findall(ns('wp', 'postmeta')):
                    meta_key = node_text('meta_key', 'wp', meta)
                    meta_value = node_text('meta_value', 'wp', meta)
                    postmeta[id][meta_key] = meta_value

                # Attachments: scan every item again for attachment items
                # whose parent is this post; download and link them.
                ai = 1  # 1-based [attachment N] placeholder counter
                for subitem in channel.findall('item'):
                    subid = node_text('post_id', 'wp', subitem)
                    subparent_id = node_text('post_parent', 'wp', subitem)
                    subtitle = node_text('title', parent = subitem)
                    suburl = node_text('link', parent = subitem)
                    subkind = node_text('post_type', 'wp', subitem)
                    # NOTE(review): suburl is immediately overwritten - the
                    # <link> value above is never used.
                    suburl = node_text('attachment_url', 'wp', subitem)

                    try:
                        subparent_id = int(subparent_id)
                    except ValueError:
                        continue

                    if not suburl:
                        continue

                    if subkind != 'attachment' or subparent_id != id:
                        continue

                    # urlparse 6-tuple; only the path is used, to get the
                    # trailing filename component.
                    s, d, p, a, q, f = urlparse(suburl)
                    d, s, filename = p.rpartition('/')

                    try:
                        attachment = post.attachments.get( title = subtitle or filename )
                    except Attachment.DoesNotExist:
                        # New attachment: download to a temp file, wrap in a
                        # Django File, and delete the temp copy afterwards.
                        print '- Downloading %s' % filename
                        response = requests.get(suburl)
                        handle, tmp = mkstemp( path.splitext(filename)[-1] )
                        write(handle, response.content)
                        close(handle)
                        attachment = Attachment( title = subtitle or filename, file = File(open(tmp, 'r'), name = filename), content_type = content_type, object_id = post.pk )

                        # Flag the WordPress featured image.
                        if '_thumbnail_id' in postmeta[id]:
                            if unicode(postmeta[id]['_thumbnail_id']) == unicode(subid):
                                attachment.featured = True

                        attachment.save()
                        remove(tmp)

                    # Swap links/embeds of this attachment for a placeholder.
                    if post.body:
                        html = PyQuery('<body>' + post.body + '</body>')
                        for a in html( 'a[href="%(url)s"], [src="%(url)s"]' % { 'url': suburl } ):
                            a = PyQuery(a)
                            a.replaceWith('\n\n[attachment %d]\n\n' % ai)

                        post.body = html.html()

                    ai += 1

                # Final body cleanup (mirrors clean_body elsewhere in file).
                if post.body:
                    html = PyQuery('<body>' + post.body + '</body>')
                    # NOTE(review): the rewritten href below is never written
                    # back onto the element, so this loop has no effect.
                    for a in html('a[href]'):
                        href = a.get('href')
                        if href.startswith(link):
                            href = href.replace(link, 'http://%s' % site.domain)
                        a = PyQuery(a)

                    for p in html('p'):
                        p = PyQuery(p)
                        p.replaceWith('\n\n%s\n\n' % p.html())

                    html('.alignright').addClass('pull-right').removeClass('alignright')
                    html('.alignleft').addClass('pull-left').removeClass('alignleft')

                    # NOTE(review): post.body is overwritten by html.html()
                    # just below, so these two collapse loops are dead work.
                    while '\n\n\n' in post.body:
                        post.body = post.body.replace('\n\n\n', '\n\n')

                    while '\r\r\r' in post.body:
                        post.body = post.body.replace('\r\r\r', '\r\r')

                    post.body = html.html()
                    post.body = post.body.replace('<br />', ' \n')
                    post.body = post.body.replace('<br/>', ' \n')
                    post.body = post.body.replace('<br>', ' \n')

                    # Trim leading/trailing newlines, CRs and leading tabs.
                    while post.body.startswith('\n'):
                        post.body = post.body[1:]

                    while post.body.endswith('\n'):
                        post.body = post.body[:-1]

                    while post.body.startswith('\r'):
                        post.body = post.body[1:]

                    while post.body.endswith('\r'):
                        post.body = post.body[:-1]

                    while post.body.startswith('\t'):
                        post.body = post.body[1:]

                    post.body = post.body.strip()
                    post.save()

            transaction.commit()
        except:
            # Any failure aborts the whole import atomically.
            transaction.rollback()
            raise
def search(self, word):
    """Look *word* up in the goo.ne.jp dictionary and scrape the result page.

    Returns ``{'status': 'success', 'results': [...]}`` where each result
    dict carries the queried word, the entry type ('normal', 'Jitsu', 'IT'
    or 'WIKI') and, when found, 'kana', 'kanji', 'gogen', 'accent' and
    'meaning' fields; or ``{'status': 'error', ...}`` when nothing matched.
    """
    response = requests.get(self.URL.format(word=word), headers=headers)
    text = response.text
    # The page occasionally contains the rare CJK character below, which
    # breaks the HTML parser - drop it up front.
    text = text.replace('𥝱', '')
    doc = PyQuery(text)
    results = []

    # --- Standard (kokugo) dictionary entries ----------------------------
    normal_dict = doc("div.NetDicHead")
    if normal_dict:
        # BUGFIX: the brackets must be escaped - r"[([0-9]*)]" is an invalid
        # pattern (unbalanced parenthesis) and re.compile raised at runtime.
        # Compiled once here instead of inside the span loop.
        accent_re = re.compile(r"\[([0-9]*)\]")
        for head in normal_dict:
            result = {'word': word, 'type': 'normal'}
            head = PyQuery(head)
            # A 【...】 bracket holds either the kanji form or a loanword origin.
            match_kakko = re.search(r"【(.*)】", head.text())
            if match_kakko:
                kakko = match_kakko.group(1)
                if re.search(r"[a-zA-Z]", kakko):
                    # Latin letters -> loanword origin; reading is the word itself.
                    result['gogen'] = kakko
                    result['kana'] = word
                else:
                    result['kanji'] = kakko
                    result['kana'] = head('b').text().replace(' ', '').replace('・', '')
            # Collect pitch-accent numbers such as [0] from the <span> tags.
            for accent in head('span'):
                accent = PyQuery(accent)
                match_accent = accent_re.search(accent.text())
                if match_accent:
                    result['accent'] = result.get('accent', '') + match_accent.group(1) + ','
            if 'accent' in result:
                # Drop the trailing comma.
                result['accent'] = result['accent'][:-1]
            body = head.next()
            # Unwrap cross-reference links, keeping their inner HTML.
            for a in body('a'):
                a = PyQuery(a)
                a.replaceWith(a.html())
            result['meaning'] = body.html()
            # Word written in kana only: no 【...】 block was present.
            if 'kana' not in result:
                result['kana'] = word
            results.append(result)

    # --- Jitsuyou (practical Japanese) dictionary entries -----------------
    Jitsu_dict = doc("div.Jtnhj")
    if Jitsu_dict:
        result = {'word': word, 'type': 'Jitsu'}
        # The reading sits between AVOID_CROSSLINK comment markers; prefer
        # the variant followed by an alternate-spelling (別表記) section.
        match = re.compile(
            r"読み方:<!--\/AVOID_CROSSLINK-->(.*)<br/?><!--AVOID_CROSSLINK-->別表記"
        ).search(Jitsu_dict.html())
        if match:
            result['kana'] = match.group(1)
            if result['kana'].find('<a') != -1:
                result['kana'] = PyQuery(result['kana']).text()
        else:
            match = re.compile(
                r"読み方:<!--\/AVOID_CROSSLINK-->(.*)<br/?>"
            ).search(Jitsu_dict.html())
            if match:
                result['kana'] = match.group(1)
                if result['kana'].find('<a') != -1:
                    result['kana'] = PyQuery(result['kana']).text()
        # The meaning follows the .AM marker element when present.
        if Jitsu_dict('.AM'):
            meaning = PyQuery('<div>')
            meaning.html(Jitsu_dict('.AM').nextAll())
        else:
            meaning = Jitsu_dict
        for a in meaning('a'):
            a = PyQuery(a)
            a.replaceWith(a.html())
        result['meaning'] = meaning.text()
        results.append(result)

    # --- IT dictionary entries --------------------------------------------
    IT_dict = doc('div.Binit')
    if IT_dict:
        result = {'word': word, 'type': 'IT'}
        a = IT_dict('a').eq(0)
        # A leading 読み方 ("reading") link is followed by the kana link.
        if a.text().find('読み方') != -1:
            kana_tag = a.next('a').eq(0)
            result['kana'] = kana_tag.text().replace(' ', "")
        else:
            result['kana'] = word
        if IT_dict.text().find('【') != -1:
            result['gogen'] = a.eq(0).text()
        # Concatenate the non-empty paragraphs, links unwrapped.
        for p in IT_dict('p'):
            p = PyQuery(p)
            for a in p('a'):
                a = PyQuery(a)
                a.replaceWith(a.html())
            if not p.html():
                continue
            result['meaning'] = result.get('meaning', '') + "<p>" + p.html() + "</p>"
        result['kanji'] = IT_dict.prev("h2.midashigo").text()
        results.append(result)

    # --- Wikipedia excerpt -------------------------------------------------
    WIKI = doc('div.Wkpja')
    if WIKI:
        result = {'word': word, 'type': 'WIKI'}
        # Skip the timestamp paragraph (.WkpjaTs).
        p = WIKI('p').not_(".WkpjaTs")
        for a in p('a'):
            a = PyQuery(a)
            a.replaceWith(a.html())
        result['meaning'] = p.html()
        result['kanji'] = WIKI.prev("h2.midashigo").text()
        results.append(result)

    if results:
        return {"status": 'success', "results": results}
    else:
        return {"status": 'error', "error_detail": "Nothing found."}