Example #1
def write_html_link_index(out_dir, link):
    """write the html link index (index.html) for a single link into out_dir"""
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')

    link = derived_link_info(link)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
            'title': (
                link['title']
                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
            ),
            'archive_url': urlencode(
                wget_output_path(link)
                or (link['domain'] if link['is_archived'] else 'about:blank')
            ),
            'extension': link['extension'] or 'html',
            'tags': link['tags'].strip() or 'untagged',
            'status': 'Archived' if link['is_archived'] else 'Not yet archived',
            'status_color': 'success' if link['is_archived'] else 'danger',
        }))

    chmod_file(path)
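
The `urlencode` call in Example #1 is presumably a local helper rather than `urllib.parse.urlencode` (which expects a mapping, not a string). A minimal sketch of such a helper, assuming it only percent-escapes the archive path and passes empty values through unchanged:

from urllib.parse import quote

# hypothetical helper (not shown in the examples): percent-escape a path for
# use in an href, passing None/'' through unchanged
def urlencode(string):
    return string and quote(string, encoding='utf-8', errors='replace')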
Example #2
def write_html_link_index(out_dir, link):
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')

    print('      √ index.html')

    link = derived_link_info(link)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
            'title': (
                link['title']
                or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
            ),
            'archive_url': (
                wget_output_path(link)
                or (link['domain'] if link['is_archived'] else 'about:blank')
            ),
        }))

    chmod_file(path)
Example #3
def parse_json_link_index(out_dir):
    """load the json link index from a given directory"""
    existing_index = os.path.join(out_dir, 'index.json')
    if os.path.exists(existing_index):
        with open(existing_index, 'r', encoding='utf-8') as f:
            link_json = json.load(f)
            check_link_structure(link_json)
            return link_json
    return {}
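
`check_link_structure` is called throughout these examples but never defined. A minimal guess at its contract, assuming a link is a dict with at least a non-empty `url` string (the real invariants may be stricter):

# hypothetical sketch of the sanity check used above; the actual checks may differ
def check_link_structure(link):
    assert isinstance(link, dict)
    assert isinstance(link.get('url'), str) and link['url']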
Example #4
def write_json_link_index(out_dir, link):
    """write a json file with some info about the link"""
    
    check_link_structure(link)
    path = os.path.join(out_dir, 'index.json')

    with open(path, 'w', encoding='utf-8') as f:
        json.dump(link, f, indent=4, default=str)

    chmod_file(path)
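
`chmod_file` is also referenced but not shown. Assuming it only normalizes permissions on the freshly written index file, a sketch could be as simple as:

import os

# hypothetical helper: make the written file readable; the real permission
# bits are likely taken from project config rather than hard-coded here
def chmod_file(path, permissions=0o644):
    os.chmod(path, permissions)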
Example #5
def parse_json_links_index(out_dir=OUTPUT_DIR):
    """parse an archive index json file and return the list of links"""
    index_path = os.path.join(out_dir, 'index.json')
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f: 
            links = json.load(f)['links']
            check_links_structure(links)
            return links 
    
    return [] 
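
The writer counterpart of this top-level parser is not among the examples. Assuming the index is simply a JSON object with a 'links' list, as the parser above expects, a minimal `write_json_links_index` might look like:

import json
import os

# hypothetical counterpart to parse_json_links_index: dump the list of links
# under a top-level 'links' key so the parser above can read it back
def write_json_links_index(out_dir, links):
    path = os.path.join(out_dir, 'index.json')
    with open(path, 'w', encoding='utf-8') as f:
        json.dump({'links': links}, f, indent=4, default=str)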
Example #6
def archive_link(link_dir, link, overwrite=True):
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

    check_link_structure(link)

    try:
        update_existing = os.path.exists(link_dir)
        if update_existing:
            link = {
                **parse_json_link_index(link_dir),
                **link,
            }
        else:
            os.makedirs(link_dir)

        print_link_status_line(link_dir, link, update_existing)

        if FETCH_FAVICON:
            link = fetch_favicon(link_dir, link, overwrite=overwrite)

        if FETCH_TITLE:
            link = fetch_title(link_dir, link, overwrite=overwrite)

        if FETCH_WGET:
            link = fetch_wget(link_dir, link, overwrite=overwrite)

        if FETCH_PDF:
            link = fetch_pdf(link_dir, link, overwrite=overwrite)

        if FETCH_SCREENSHOT:
            link = fetch_screenshot(link_dir, link, overwrite=overwrite)

        if FETCH_DOM:
            link = fetch_dom(link_dir, link, overwrite=overwrite)

        if SUBMIT_ARCHIVE_DOT_ORG:
            link = archive_dot_org(link_dir, link, overwrite=overwrite)

        if FETCH_GIT:
            link = fetch_git(link_dir, link, overwrite=overwrite)

        if FETCH_MEDIA:
            link = fetch_media(link_dir, link, overwrite=overwrite)

        write_link_index(link_dir, link)

    except Exception as err:
        print('    ! Failed to archive link: {}: {}'.format(
            err.__class__.__name__, err))

    return link
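
`write_link_index`, called at the end of `archive_link`, is not shown either. Assuming it simply delegates to the JSON and HTML writers from Examples #4 and #1, it could be:

# hypothetical glue function: write both index formats for a single link
def write_link_index(out_dir, link):
    write_json_link_index(out_dir, link)
    write_html_link_index(out_dir, link)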
Example #7
def load_json_link_index(out_dir, link):
    """check for an existing link archive in the given directory, 
       and load+merge it into the given link dict
    """
    link = {
        **parse_json_link_index(out_dir),
        **link,
    }
    link.update({
        'history': link.get('history') or {},
    })

    check_link_structure(link)
    return link
Example #8
def write_html_link_index(out_dir, link):
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')

    print('      √ index.html')

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **derived_link_info(link),
            # **link['latest'],
        }))

    chmod_file(path)
Example #9
def load_link_index(link_dir, link):
    """check for an existing link archive in the given directory, 
       and load+merge it into the given link dict
    """
    is_new = not os.path.exists(link_dir)
    if is_new:
        os.makedirs(link_dir)
    else:
        link = {
            **parse_json_link_index(link_dir),
            **link,
        }

    check_link_structure(link)
    print_link_status_line(link_dir, link, is_new)

    return link
Example #10
def validate_links(links):
    check_links_structure(links)
    links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
    links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
    links = sorted_links(links)  # deterministically sort the links based on timestamp, url

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    for link in links:
        link['title'] = unescape(link['title'].strip()) if link['title'] else None
        check_link_structure(link)

    return list(links)
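
`archivable_links` is only described by the inline comment above. A sketch that matches that comment, assuming links with non-fetchable schemes (chrome://, about:, mailto:, etc.) should simply be dropped:

# hypothetical filter matching the comment in validate_links: keep only links
# whose URLs use a scheme the archiver can actually fetch
def archivable_links(links):
    return (
        link for link in links
        if link['url'].startswith(('http://', 'https://', 'ftp://'))
    )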
Example #11
def write_html_link_index(out_dir, link):
    """write the fancy html link index (index.html) for a single link into out_dir"""
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index_fancy.html'), 'r', encoding='utf-8') as f:
        link_html = f.read()

    path = os.path.join(out_dir, 'index.html')

    print('      √ index.html')

    with open(path, 'w', encoding='utf-8') as f:
        f.write(Template(link_html).substitute({
            **link,
            **link['latest'],
            'title': link['title'] or link['url'],
            'type': link['type'] or 'website',
            'tags': link['tags'] or 'untagged',
            'bookmarked': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
            'updated': datetime.fromtimestamp(float(link['updated'])).strftime('%Y-%m-%d %H:%M'),
            'bookmarked_ts': link['timestamp'],
            'updated_ts': link['updated'],
            'archive_org': link['latest'].get('archive_org') or 'https://web.archive.org/save/{}'.format(link['url']),
            'wget': link['latest'].get('wget') or wget_output_path(link),
        }))

    chmod_file(path)
Example #12
def validate_links(links):
    check_links_structure(links)
    links = archivable_links(links)  # remove chrome://, about:, mailto: etc.
    links = uniquefied_links(links)  # merge/dedupe duplicate timestamps & urls
    links = sorted_links(links)  # deterministically sort the links based on timestamp, url

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    for link in links:
        check_link_structure(link)

        link['title'] = unescape(link['title']) if link['title'] else None
        link['latest'] = link.get('latest') or {}

        latest = link['latest']
        if not latest.get('wget'):
            latest['wget'] = wget_output_path(link)

        if not latest.get('pdf'):
            latest['pdf'] = None

        if not latest.get('screenshot'):
            latest['screenshot'] = None

        if not latest.get('dom'):
            latest['dom'] = None

        if not latest.get('favicon'):
            latest['favicon'] = None

        if not latest.get('title'):
            latest['title'] = link['title']

    return list(links)