# Turn a list of candidate source paths into the list of path components
# that get handed to the sync step.
parse_srcs = (
    # :: List[Optional[Path]]
    Pipe()
    # Drop entries for which no source attribute was found.
    # :: List[Path]
    >> partial(filter, is_not_none)
    # Keep only paths that look like media paths.
    >> partial(filter, is_media_path)
    # Split each path; `tryit` wraps the splitter (presumably turning a
    # failed split into None -- confirm against `tryit`).
    # :: List[Optional[PathParts]]
    >> partial(map, tryit(split_path_parts))
    # Discard the paths that could not be split.
    # :: List[PathParts]
    >> partial(filter, is_not_none)
    # Select element 1 of every PathParts.
    # :: List[Path]
    >> partial(map, getitem(1))
)

# Apply `fixup_cdn` to every path.
cleanup_paths = (
    # :: List[Path]
    Pipe()
    >> partial(map, fixup_cdn)
)

# Combine get_img_srcs with get_css_srcs into one de-duplicated path list.
fetch_full_paths = (
    # :: Location
    Pipe()
    # :: (List[Path], List[Path])
    >> cache(branch(get_img_srcs, get_css_srcs))
    # :: List[Path]
    >> combine
    # :: List[Path]
    >> unique
)

# Same as fetch_full_paths, but with prefixes removed and the result parsed.
fetch_paths = (
    fetch_full_paths
    # :: List[Path]
    >> partial(map, remove_prefix)
    # :: List[Path]
    >> parse_srcs
)

# Location -> side effects. This stage is impure: it calls `sync_media` on
# every path.
# NOTE(review): this relies on `map` being eager; with a lazy `map` nothing
# is synced until the returned value is consumed -- confirm the interpreter.
executor = (
    fetch_paths
    >> partial(map, sync_media)
)


def main(location):
    """Pull down all images referenced in a given HTML URL or file.

    Feeds ``location`` through the module-level ``executor`` pipeline and
    hands back whatever that pipeline produces.
    """
    outcome = executor(location)
    return outcome
get_src = maybe(getitem('src'), # normal maybe(getitem('data-src'), # photo pages maybe(getitem('data-srcset')))) # home page BACKGROUND_IMAGE_REGEX = re.compile(r'background-image: url\((.*?\))') read_page = lambda url: urllib2.urlopen(url).read() PATH_SPLIT_REGEX = re.compile(r'^(/media)/(.*)/(.*?\..*?)$') # PATH_SPLIT_REGEX = re.compile(r'/media/(.*)/(') split_path_parts = lambda src: PATH_SPLIT_REGEX.match(src).groups() # :: str -> PathPats MEDIA_PATH_REGEX = re.compile(r'/media/') is_media_path = lambda path: MEDIA_PATH_REGEX.search(path) is not None # Composite work-horse functions # Retreive src-like properties from <img> tags get_img_srcs = ( Composable() # :: Location >> cache(get_html) # :: ElementTree >> img_tags # :: List[Element] >> F(map, get_src) # List[Optional[Path]] ) # Retreive URL-paths from CSS 'background-image:' properties get_css_srcs = ( Composable() # :: Location >> cache(read_page) # :: str >> BACKGROUND_IMAGE_REGEX.findall # :: List[str] ) # Format relative paths for sync-media parse_srcs = ( Composable() # :: List[Optional[Path]] >> F(filter, is_not_none) # :: List[Path] # >> F(filter, lambda phrase: str.startswith(phrase, '/media')) >> F(filter, is_media_path)