# Format relative paths for sync-media
parse_srcs = (
    Composable()                               # :: List[Optional[Path]]
    >> F(filter, is_not_none)                  # :: List[Path]
    >> F(filter, is_media_path)                # keep only /media-style asset paths
    >> F(map, tryit(split_path_parts))         # :: List[Optional[PathParts]]
    >> F(filter, is_not_none)                  # :: List[PathParts]
    >> F(map, getitem(1))                      # :: List[Path]
)

# Combine get_img_srcs with get_css_srcs, and parse resultant paths
fetch_full_paths = (
    Composable()                               # :: Location
    >> branch(get_img_srcs, get_css_srcs)      # :: (List[Path], List[Path])
    >> combine                                 # :: List[Path]
    >> unique                                  # :: List[Path]
)

fetch_paths = fetch_full_paths >> parse_srcs   # :: List[Path]

# Location -> Side Effects!  impure! impure!
executor = fetch_paths >> F(map, sync_media)


def main(location):
    """Pull down all images referenced in a given HTML URL or file.

    `location` may be a URL or a local file path; the composed `executor`
    pipeline fetches it, extracts image/CSS asset paths, and syncs each one.
    """
    return executor(location)


if __name__ == "__main__":
    import sys
    main(sys.argv[1])
def crop(prefix):
    """Return a function that strips *prefix* from the front of a word.

    The returned function yields the word minus the prefix when the word
    starts with it, and None otherwise.  The None result is deliberate:
    it lets the `maybe(...)` chain below fall through to the next
    candidate prefix.
    """
    def wrapped(word):
        if word.startswith(prefix):
            return word[len(prefix):]
        # implicit None on no-match feeds the `maybe` fallback chain
    return wrapped


# Try each known asset-host prefix in turn; fall back to the word unchanged.
remove_prefix = maybe(crop('http://opeterml1297110.njgroup.com:7000'),
                      maybe(crop('http://cdn.theatlantic.com/assets/'),
                            maybe(crop('https://cdn.theatlantic.com/assets/'),
                                  lambda word: word)))

# Read data input, with no processing
data_read = (Pipe()
             >> branch(get_html, read_page)
             >> cache)

# Composite work-horse functions

# Retrieve src-like properties from <img> tags
get_img_srcs = (
    Pipe()                                # :: Location
    >> get_html                           # :: ElementTree
    >> img_tags                           # :: List[Element]
    >> partial(map, get_src)              # :: List[Optional[Path]]
)

# Retrieve URL-paths from CSS 'background-image:' properties
# NOTE(review): the source held two conflicting fragments of this pipeline
# (one with path_to_url, one without) — merged here; confirm read_page
# expects a URLString rather than a raw Location.
get_css_srcs = (
    Chainable()                           # :: Location
    >> path_to_url                        # :: URLString
    >> read_page                          # :: HTMLString
    >> BACKGROUND_IMAGE_REGEX.findall     # :: List[str]
)

# Format relative paths for sync-media
parse_srcs = (
    Chainable()                                 # :: List[Optional[Path]]
    >> partial(filter, is_not_none)             # :: List[Path]
    >> partial(filter, is_media_path)           # keep only media asset paths
    >> partial(map, tryit(split_path_parts))    # :: List[Optional[PathParts]]
    >> partial(filter, is_not_none)             # :: List[PathParts]
    >> partial(map, getitem(1))                 # :: List[Path]
)

# Combine get_img_srcs with get_css_srcs, and parse resultant paths
fetch_full_paths = (
    Chainable()                                     # :: Location
    >> cache(branch(get_img_srcs, get_css_srcs))    # :: (List[Path], List[Path])
    >> combine                                      # :: List[Path]
    >> unique                                       # :: List[Path]
)

fetch_paths = fetch_full_paths >> parse_srcs        # :: List[Path]

# Location -> Side Effects!  impure! impure!
executor = fetch_paths >> partial(map, sync_media)


def main(location):
    """Pull down all images referenced in a given HTML URL or file.

    `location` may be a URL or a local file path; the composed `executor`
    pipeline fetches it, extracts image/CSS asset paths, and syncs each one.
    """
    return executor(location)


if __name__ == "__main__":
    import sys
    main(sys.argv[1])