# Reduce a list of raw src values to media paths.
# Stages, in order: drop None entries, keep entries accepted by is_media_path,
# run split_path_parts (wrapped in tryit) on each, drop None results, then
# apply getitem(1) to every remaining item.
# NOTE(review): presumably tryit converts an exception raised by
# split_path_parts into None, which the second is_not_none filter then
# removes -- confirm against tryit's definition.
parse_srcs = (
    Pipe()  # :: List[Optional[Path]]
    >> partial(filter, is_not_none)  # :: List[Path]
    >> partial(filter, is_media_path)  # :: List[Path]
    >> partial(map, tryit(split_path_parts))  # :: List[Optional[PathParts]]
    >> partial(filter, is_not_none)  # :: List[PathParts]
    >> partial(map, getitem(1))  # :: List[Path]
)
# Map fixup_cdn over every path in the input.
cleanup_paths = (
    Pipe()  # :: List[Path]
    >> partial(map, fixup_cdn)  # one fixup_cdn call per path
)
# Run get_img_srcs and get_css_srcs on the same location, merge the two
# results with `combine`, then pass the merged list through `unique`.
# NOTE(review): `unique` presumably removes duplicate paths -- confirm.
fetch_full_paths = (
    Pipe()  # :: Location
    >> cache(branch(get_img_srcs, get_css_srcs))  # :: (List[Path], List[Path])
    >> combine  # :: List[Path]
    >> unique  # :: List[Path]
)
# Extend fetch_full_paths: apply remove_prefix to each path, then feed the
# result through the parse_srcs pipeline.
fetch_paths = (
    fetch_full_paths
    >> partial(map, remove_prefix)  # :: List[Path]
    >> parse_srcs  # :: List[Path]
)
# Complete pipeline: compute the paths for a location with fetch_paths, then
# map sync_media over them.
executor = fetch_paths >> partial(map, sync_media)  # Location -> Side Effects! impure! impure!


def main(location):
    """Entry point: feed ``location`` (an HTML URL or file) to ``executor``.

    Returns whatever the ``executor`` pipeline returns for that location.
    """
    outcome = executor(location)
    return outcome
Exemplo n.º 2
0
# Look up an image element's source under three attribute keys: 'src',
# 'data-src' and 'data-srcset'.
# NOTE(review): presumably `maybe(f, g)` tries f and falls back to g when f
# yields nothing, making this a first-match-wins chain in the order written --
# confirm against `maybe`'s definition.
get_src = maybe(getitem('src'),     # normal
            maybe(getitem('data-src'),  # photo pages
                maybe(getitem('data-srcset'))))  # home page
# Matches a CSS ``background-image: url(...)`` declaration and captures the
# text between the parentheses.  The closing parenthesis is matched outside
# the capture group so the captured URL does not end with a stray ")".
BACKGROUND_IMAGE_REGEX = re.compile(r'background-image: url\((.*?)\)')
def read_page(url):
    """Fetch ``url`` with ``urllib2.urlopen`` and return the response body.

    The response object is closed as soon as the body has been read instead
    of being left open until it is garbage-collected.
    """
    from contextlib import closing  # imported locally so this definition is self-contained

    with closing(urllib2.urlopen(url)) as response:
        return response.read()
# Three capture groups: the literal "/media" prefix, a greedily matched middle
# part, and a trailing part that must contain a dot.
PATH_SPLIT_REGEX = re.compile(r'^(/media)/(.*)/(.*?\..*?)$')


def split_path_parts(src):  # :: str -> PathParts
    """Return the tuple of groups PATH_SPLIT_REGEX captures from ``src``.

    Raises AttributeError when ``src`` does not match, because ``match``
    returns None in that case.
    """
    found = PATH_SPLIT_REGEX.match(src)
    return found.groups()
MEDIA_PATH_REGEX = re.compile(r'/media/')


def is_media_path(path):
    """Tell whether ``path`` contains the text "/media/" anywhere."""
    hit = MEDIA_PATH_REGEX.search(path)
    return hit is not None

# Composite work-horse functions
# Retrieve src-like properties from <img> tags.
# Stages: get_html (wrapped in cache), img_tags, then get_src mapped over
# every element that img_tags produced.
get_img_srcs = (
    Composable()  # :: Location
    >> cache(get_html)  # :: ElementTree
    >> img_tags  # :: List[Element]
    >> F(map, get_src)  # :: List[Optional[Path]]
)
# Retrieve URL paths from CSS 'background-image:' properties.
# Stages: read_page (wrapped in cache) fetches the page text, then
# BACKGROUND_IMAGE_REGEX.findall returns the pattern's single capture group
# for every match in that text.
get_css_srcs = (
    Composable()  # :: Location
    >> cache(read_page)  # :: str
    >> BACKGROUND_IMAGE_REGEX.findall  # :: List[str]
)
# Format relative paths for sync-media
parse_srcs = (
    Composable()  # :: List[Optional[Path]]
    >> F(filter, is_not_none)  # :: List[Path]
    # >> F(filter, lambda phrase: str.startswith(phrase, '/media'))
    >> F(filter, is_media_path)