def localize_misc(dom, path, prefix='misc'):
    '''
    Localize a few standalone resources: an xml, an ico and an iframe.

    Each of the three resources is looked up in ``dom`` with a regular
    expression. For every one that is present, the resource is stored in
    ``os.path.join(path, prefix)`` through ``FBParser.save_resource`` and
    every occurrence of its URL in ``dom`` is replaced by the relative
    path ``os.path.join(prefix, <local file name>)``. A resource that is
    not present in the page is skipped.

    Afterwards every remaining ``href="http://..."`` is redirected to
    ``about:blank`` (most of them are hyperlinks). Because of that last
    step this must be called after localize_css.

    Args:
        dom: HTML source of the page, as a string.
        path: directory in which the ``prefix`` sub-directory is created.
        prefix: name of the sub-directory receiving the resources; also
            used as the relative path prefix written into ``dom``.

    Returns:
        The rewritten HTML string.
    '''
    misc_path = os.path.join(path, prefix)
    if not os.path.exists(misc_path):
        os.mkdir(misc_path)

    # Raw strings keep the regex escapes (\+ and \.) away from Python's own
    # string-escape processing. Every pattern exposes the URL as group 'url'.
    resource_patterns = (
        # OpenSearch description (xml); \s+ tolerates any whitespace
        # between the two attributes.
        r'<link rel="search" type="application/opensearchdescription\+xml"'
        r'\s+href="(?P<url>http://.*?\.xml)" title="Facebook">',
        # shortcut icon (ico)
        r'<link rel="shortcut icon" href="(?P<url>http://.*?\.ico)">',
        # iframe (html)
        r'<iframe src="(?P<url>http://.*?\.html)"',
    )

    for pattern in resource_patterns:
        match = re.search(pattern, dom)
        if match is None:
            # Resource not referenced by this page: nothing to localize.
            continue
        url = match.group('url')
        local_name = FBParser.url_to_file(url)
        FBParser.save_resource(url, misc_path, local_name)
        dom = dom.replace(url, os.path.join(prefix, local_name))

    # redirect the rest of the hrefs to about:blank (most of them are
    # hyperlinks) -- the localized hrefs above no longer start with http://
    dom = re.sub(r'href="http://.+?"', 'href="about:blank"', dom)
    return dom
def anonym_images(dom, path, filename):
    '''
    Anonymize images and regenerate file names.

    The image names are read, one per line, from the ``img_list`` file
    that sits next to ``filename`` in ``path``.

    * If an ``img_mapping`` file already exists, the garbled images are
      taken to be present already: only the ``old: new`` pairs listed in
      it are applied to ``dom``.
    * Otherwise each listed image gets a random ``anonym_<number>`` name
      in the same directory, is stored under that name through
      ``FBParser.save_resource``, and its name is replaced in ``dom``.
      The ``old: new`` pairs are written to ``img_mapping`` and the new
      files are handed to ``FBParser.garble_image.garble``.

    Blank lines in either file and repeated entries in ``img_list`` are
    ignored.

    Args:
        dom: HTML source of the page, as a string.
        path: directory containing the page, its images and list files.
        filename: file name of the page; the list and mapping file names
            are derived from it.

    Returns:
        ``dom`` with the image file names replaced.
    '''
    # NOTE(review): rstrip('html') strips trailing characters out of the set
    # {h, t, m, l}, not the literal suffix. Kept unchanged on purpose:
    # presumably the code writing img_list derives the name the same way --
    # verify before "fixing" this here only.
    stem = filename.rstrip('html')
    list_path = os.path.join(path, stem + 'img_list')
    mapping_path = os.path.join(path, stem + 'img_mapping')

    img_files = FBParser.get_content(list_path, encoding='ascii').split('\n')

    if os.path.isfile(mapping_path):
        # already have garbled images, only replace filenames in dom
        img_mapping = FBParser.get_content(mapping_path, encoding='ascii')
        for mapping in img_mapping.split('\n'):
            if not mapping.strip():
                # A blank line (e.g. trailing newline) has no 'old: new' pair.
                continue
            img_file, new_file = mapping.split(': ')
            dom = dom.replace(img_file, new_file)
        return dom

    # garbled image not generated yet
    images = {}
    for img_file in img_files:
        # An empty name must never reach str.replace: replacing '' inserts
        # the new name between every character of dom. A repeated name would
        # create a second copy while dom keeps pointing at the first one.
        if not img_file.strip() or img_file in images:
            continue
        directory = os.path.split(img_file)[0]
        ext = os.path.splitext(img_file)[1]
        new_file = directory + '/anonym_' + str(random.getrandbits(40)) + ext
        images[img_file] = new_file
        dom = dom.replace(img_file, new_file)
        FBParser.save_resource(os.path.join(path, img_file), path, new_file)

    # Persist the mapping so that a later run takes the branch above.
    FBParser.save_content(
        '\n'.join(old + ': ' + new for old, new in images.items()),
        mapping_path, encoding='ascii')

    # garble() receives the new files with `path` prepended.
    garbled_paths = [os.path.join(path, new) for new in images.values()]
    FBParser.garble_image.garble(garbled_paths)
    return dom
def anonym_images(dom, path, filename):
    '''
    Anonymize images and regenerate file names.

    The image names are read, one per line, from the ``img_list`` file
    that sits next to ``filename`` in ``path``. Each listed image gets a
    random ``anonym_<number>`` name in the same directory, is stored under
    that name through ``FBParser.save_resource``, has its name replaced in
    ``dom``, and the original file is deleted. The new files are then
    handed to ``FBParser.garble_image.garble``.

    Blank lines and repeated entries in ``img_list`` are ignored.

    Args:
        dom: HTML source of the page, as a string.
        path: directory containing the page, its images and the list file.
        filename: file name of the page; the list file name is derived
            from it.

    Returns:
        ``dom`` with the image file names replaced.
    '''
    # NOTE(review): rstrip('html') strips trailing characters out of the set
    # {h, t, m, l}, not the literal suffix. Kept unchanged on purpose:
    # presumably the code writing img_list derives the name the same way --
    # verify before "fixing" this here only.
    list_path = os.path.join(path, filename.rstrip('html') + 'img_list')
    img_files = FBParser.get_content(list_path, encoding='ascii').split('\n')

    images = {}
    for img_file in img_files:
        # An empty name must never get this far: replacing '' inserts the new
        # name between every character of dom, and os.remove would be aimed
        # at the directory itself. A repeated name would point at a file
        # that the first pass already removed.
        if not img_file.strip() or img_file in images:
            continue
        original = os.path.join(path, img_file)
        directory = os.path.split(img_file)[0]
        ext = os.path.splitext(img_file)[1]
        new_file = directory + '/anonym_' + str(random.getrandbits(40)) + ext
        images[img_file] = new_file
        dom = dom.replace(img_file, new_file)
        FBParser.save_resource(original, path, new_file)
        # Only the renamed copy is kept.
        os.remove(original)

    # garble() receives the new files with `path` prepended.
    garbled_paths = [os.path.join(path, new) for new in images.values()]
    FBParser.garble_image.garble(garbled_paths)
    return dom