def get_movies(site_content):
    """Group the movies found in *site_content* by section title.

    The markup is parsed, cleaned with ``Cleaner(style=True)`` and every
    element matched by ``MOVIE_TICKETS_BLOCK`` is visited in document order.
    An element whose tag equals ``DIV_TAG`` starts a new section: its title
    text becomes the key for the movie blocks that follow. Any other matched
    element is handed to ``fetch_movies`` and the result is appended to the
    current section.

    Args:
        site_content: Markup of the page, as accepted by ``html.fromstring``.

    Returns:
        A dict mapping section title to the list of movies collected for it.
        Movies seen before the first section header are stored under
        ``MOVIES_POSTER``.

    Raises:
        IndexError: If a section header yields no title text.
    """
    logger().info("Get movies")
    movies_section: str = MOVIES_POSTER
    movies: dict[str, list] = {}
    html_site_elements_content = Cleaner(style=True).clean_html(
        html.fromstring(site_content))
    for movie in html_site_elements_content.xpath(
            MOVIE_TICKETS_BLOCK):  # type: HtmlElement
        if movie.tag == DIV_TAG:
            # Section header: remember its title, it carries no movies itself.
            movies_section = movie.xpath(
                XPATH_GET_TEXT.format(xpath=MOVIES_TITLE_BLOCK))[0].strip()
            continue
        # Build a new list rather than extending in place, so a list returned
        # by fetch_movies is never aliased inside the result.
        movies[movies_section] = (
            movies.get(movies_section, []) + fetch_movies(movie))
    return movies
def parse_part(env, part, msg_id, inner=False):
    """Parse one MIME part, recursively, into html, text and file records.

    Args:
        env: Object whose ``files`` attribute provides ``to_db``, ``write``
            and ``url``. It is only used by the outermost call.
        part: Message part exposing the accessors used below
            (``is_multipart``, ``get_payload``, ``get_content_type``,
            ``get_filename``, ...).
        msg_id: Message identifier. It is forwarded to the decoding helpers
            and, slugified, forms the first segment of stored file paths.
        inner: True for recursive calls on sub-parts. Such a call returns
            right after collecting content: nothing is stored and image
            sources are not rewritten.

    Returns:
        An ``OrderedDict`` with the keys ``files``, ``attachments``,
        ``embedded`` and ``html``. The outermost call always adds ``text``;
        an inner call only has it when a text body was found.
    """
    content = OrderedDict([
        ('files', []),
        ('attachments', []),
        ('embedded', {}),
        ('html', ''),
    ])

    ctype = part.get_content_type()
    mtype = part.get_content_maintype()
    stype = part.get_content_subtype()
    if part.is_multipart():
        for m in part.get_payload():
            child = parse_part(env, m, msg_id, True)
            child_html = child.pop('html', '')
            child_text = child.pop('text', '')
            content.setdefault('text', '')
            if stype != 'alternative':
                content['html'] += child_html
                content['text'] += child_text
            else:
                # Alternatives are renditions of the same content: the last
                # non-empty one replaces the previous ones.
                content['html'] = child_html or content['html']
                content['text'] = child_text or content['text']
            content['files'].extend(child.pop('files'))
            content.update(child)
    elif mtype == 'multipart':
        # Declared as multipart but not parsed as one: the decoded payload
        # is used as the html body.
        text = part.get_payload(decode=True)
        text = decode_str(text, part.get_content_charset(), msg_id=msg_id)
        content['html'] = text
    elif ctype in ['text/html', 'text/plain']:
        text = part.get_payload(decode=True)
        text = decode_str(text, part.get_content_charset(), msg_id=msg_id)
        if ctype == 'text/html':
            content['html'] = text
        else:
            # text/plain: unescape entities and drop anything that looks
            # like a tag, then derive the html body from the plain text.
            text = html.unescape(text)
            text = re.sub(r'<[^>]+>', '', text)
            content['text'] = text
            content['html'] = text2html(text)
    else:
        # Every other content type is recorded as a file.
        payload = part.get_payload(decode=True)
        filename = part.get_filename()
        filename = decode_header(filename, msg_id) if filename else ctype
        content['files'].append({
            'mimetype': ctype,
            'id': part.get('Content-ID'),
            'filename': filename,
            'payload': payload,
            'size': len(payload) if payload else None,
        })

    content['html'] = clean_html(content['html'])
    if inner:
        return content

    # Only the outermost call stores files, so start from empty collections.
    content.update(attachments=[], embedded={})
    for index, item in enumerate(content['files']):
        if not item['payload']:
            continue
        name = slugify(item['filename'] or item['id'])
        path = '/'.join([slugify(msg_id), str(index), name])
        # NOTE(review): the asset is registered through to_db before the
        # id/filename check, so an unknown attachment is registered but
        # never written -- confirm this is intended.
        asset = env.files.to_db(path, item['mimetype'], item['filename'])
        if item['id']:
            content['embedded'][item['id']] = asset
        elif item['filename']:
            content['attachments'].append(asset)
        else:
            log.warning('UnknownAttachment(%s)', msg_id)
            continue
        env.files.write(path, item['payload'])

    if content['html']:
        htm = lh.fromstring(content['html'])

        # Fix img[@src]
        embedded = content['embedded']
        referenced = set()
        for img in htm.xpath('//img[@src]'):
            src = img.attrib.get('src')

            cid = re.match('^cid:(.*)', src)
            obj = None
            if cid:
                # Look the asset up without removing it, so several <img>
                # tags may point at the same Content-ID.
                key = '<%s>' % cid.group(1)
                if key in embedded:
                    referenced.add(key)
                    obj = embedded[key]
            if obj:
                img.attrib['src'] = env.files.url(obj['path'])
            elif not re.match('^(https?://|/|data:image/).*', src):
                del img.attrib['src']
        # Embedded files that no image refers to are listed as attachments.
        content['attachments'].extend(
            asset for key, asset in embedded.items()
            if key not in referenced
        )

        content['html'] = lh.tostring(htm, encoding='utf-8').decode()
        if not content.get('text'):
            # No plain-text body was found: derive one from the html.
            htm = Cleaner(links=False, style=True).clean_html(htm)
            text = '\n'.join(htm.xpath('//text()'))
            content['text'] = text.strip()

    content['text'] = content.get('text') or ''
    return content
def parse_part(env, part, msg_id, inner=False):
    """Parse one MIME part, recursively, into html, text and file records.

    Args:
        env: Object whose ``files`` attribute provides ``to_db``, ``write``
            and ``url``. It is only used by the outermost call.
        part: Message part exposing the accessors used below
            (``is_multipart``, ``get_payload``, ``get_content_type``,
            ``get_filename``, ...).
        msg_id: Message identifier. It is forwarded to the decoding helpers
            and, slugified, forms the first segment of stored file paths.
        inner: True for recursive calls on sub-parts. Such a call returns
            right after collecting content: nothing is stored and image
            sources are not rewritten.

    Returns:
        An ``OrderedDict`` with the keys ``files``, ``attachments``,
        ``embedded`` and ``html``. The outermost call always adds ``text``;
        an inner call only has it when a text body was found.
    """
    content = OrderedDict([
        ('files', []),
        ('attachments', []),
        ('embedded', {}),
        ('html', ''),
    ])

    ctype = part.get_content_type()
    mtype = part.get_content_maintype()
    stype = part.get_content_subtype()
    if part.is_multipart():
        for m in part.get_payload():
            child = parse_part(env, m, msg_id, True)
            child_html = child.pop('html', '')
            child_text = child.pop('text', '')
            content.setdefault('text', '')
            if stype != 'alternative':
                content['html'] += child_html
                content['text'] += child_text
            else:
                # Alternatives are renditions of the same content: the last
                # non-empty one replaces the previous ones.
                content['html'] = child_html or content['html']
                content['text'] = child_text or content['text']
            content['files'].extend(child.pop('files'))
            content.update(child)
    elif mtype == 'multipart':
        # Declared as multipart but not parsed as one: the decoded payload
        # is used as the html body.
        text = part.get_payload(decode=True)
        text = decode_str(text, part.get_content_charset(), msg_id=msg_id)
        content['html'] = text
    elif ctype in ['text/html', 'text/plain']:
        text = part.get_payload(decode=True)
        text = decode_str(text, part.get_content_charset(), msg_id=msg_id)
        if ctype == 'text/html':
            content['html'] = text
        else:
            # text/plain: unescape entities and drop anything that looks
            # like a tag, then derive the html body from the plain text.
            text = html.unescape(text)
            text = re.sub(r'<[^>]+>', '', text)
            content['text'] = text
            content['html'] = text2html(text)
    else:
        # Every other content type is recorded as a file.
        payload = part.get_payload(decode=True)
        filename = part.get_filename()
        filename = decode_header(filename, msg_id) if filename else ctype
        content['files'].append({
            'mimetype': ctype,
            'id': part.get('Content-ID'),
            'filename': filename,
            'payload': payload,
            'size': len(payload) if payload else None,
        })

    content['html'] = clean_html(content['html'])
    if inner:
        return content

    # Only the outermost call stores files, so start from empty collections.
    content.update(attachments=[], embedded={})
    for index, item in enumerate(content['files']):
        if not item['payload']:
            continue
        name = slugify(item['filename'] or item['id'])
        path = '/'.join([slugify(msg_id), str(index), name])
        # NOTE(review): the asset is registered through to_db before the
        # id/filename check, so an unknown attachment is registered but
        # never written -- confirm this is intended.
        asset = env.files.to_db(path, item['mimetype'], item['filename'])
        if item['id']:
            content['embedded'][item['id']] = asset
        elif item['filename']:
            content['attachments'].append(asset)
        else:
            log.warning('UnknownAttachment(%s)', msg_id)
            continue
        env.files.write(path, item['payload'])

    if content['html']:
        htm = lh.fromstring(content['html'])

        # Fix img[@src]
        embedded = content['embedded']
        referenced = set()
        for img in htm.xpath('//img[@src]'):
            src = img.attrib.get('src')

            cid = re.match('^cid:(.*)', src)
            obj = None
            if cid:
                # Look the asset up without removing it, so several <img>
                # tags may point at the same Content-ID.
                key = '<%s>' % cid.group(1)
                if key in embedded:
                    referenced.add(key)
                    obj = embedded[key]
            if obj:
                img.attrib['src'] = env.files.url(obj['path'])
            elif not re.match('^(https?://|/|data:image/).*', src):
                del img.attrib['src']
        # Embedded files that no image refers to are listed as attachments.
        content['attachments'].extend(
            asset for key, asset in embedded.items()
            if key not in referenced
        )

        content['html'] = lh.tostring(htm, encoding='utf-8').decode()
        if not content.get('text'):
            # No plain-text body was found: derive one from the html.
            htm = Cleaner(links=False, style=True).clean_html(htm)
            text = '\n'.join(htm.xpath('//text()'))
            content['text'] = text.strip()

    content['text'] = content.get('text') or ''
    return content