Example #1
def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
    """get new links from file and optionally append them to links in existing archive"""
    all_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        all_links = validate_links(raw_links)

    # merge existing links in archive_path and new links
    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        all_links = validate_links(existing_links + all_links)

    num_new_links = len(all_links) - len(existing_links)
    if SHOW_PROGRESS:
        print()
    if import_path:
        # only report when an import file was actually parsed; parser_name
        # is undefined otherwise
        print('    > Adding {} new links to index from {} (parsed as {} format)'.format(
            num_new_links,
            pretty_path(import_path),
            parser_name,
        ))

    if only_new:
        return new_links(all_links, existing_links)

    return all_links
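
The num_new_links arithmetic above only works because validate_links deduplicates the merged list, so the merged length minus the existing length counts genuinely new entries. A minimal sketch of that counting logic, with a hypothetical dedupe_by_url standing in for the real validator:

def dedupe_by_url(links):
    # keep the first occurrence of each URL, mirroring the dedup that the
    # counting logic relies on (the real validate_links also cleans fields)
    seen = {}
    for link in links:
        seen.setdefault(link['url'], link)
    return list(seen.values())

existing = [{'url': 'https://example.com/a'}, {'url': 'https://example.com/b'}]
imported = [{'url': 'https://example.com/b'}, {'url': 'https://example.com/c'}]

merged = dedupe_by_url(existing + imported)
print(len(merged) - len(existing))  # prints 1: only /c is genuinely new
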
Example #2
from datetime import datetime  # used for the log timestamp below; other names come from the ArchiveBox module


def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
    """get new links from file and optionally append them to links in existing archive"""
    all_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        all_links = validate_links(raw_links)

    # merge existing links in archive_path and new links
    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        all_links = validate_links(existing_links + all_links)

    num_new_links = len(all_links) - len(existing_links)
    # import_path in the condition guards the parser_name reference below
    if import_path and num_new_links and not only_new:
        print('{green}[+] [{}] Adding {} new links to index from {} ({} format){reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            num_new_links,
            pretty_path(import_path),
            parser_name,
            **ANSI,
        ))
    # else:
    #     print('[*] [{}] No new links added to {}/index.json{}'.format(
    #         datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    #         archive_path,
    #         ' from {}'.format(import_path) if import_path else '',
    #         **ANSI,
    #     ))

    if only_new:
        return new_links(all_links, existing_links)

    return all_links
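
The {green}...{reset} placeholders in the log line resolve because str.format receives the positional arguments and the unpacked ANSI dict together. A self-contained sketch; the two escape codes below are standard, while ArchiveBox's own ANSI mapping has more entries:

# minimal ANSI mapping; ArchiveBox defines a larger one in its config
ANSI = {'green': '\033[32m', 'reset': '\033[0m'}

# positional {} and named {green}/{reset} fields are filled in one call
print('{green}[+] Adding {} new links to index{reset}'.format(3, **ANSI))
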
Example #3
File: archive.py Project: yyniu/ArchiveBox
def load_links(archive_path=OUTPUT_DIR, import_path=None):
    """get new links from file and optionally append them to links in existing archive"""

    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        check_links_structure(existing_links)

    new_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in archive_path and new links
    all_links = validate_links(existing_links + new_links)
    check_links_structure(all_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        print(
            '    > Adding {} new links to index (parsed import as {})'.format(
                num_new_links,
                parser_name,
            ))

    return all_links, new_links
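
check_links_structure is not shown in this excerpt; judging from the call sites, it validates the shape of the parsed data before merging. A hypothetical stand-in under that assumption:

def check_links_structure(links):
    # assumed contract: a list of dicts, each carrying at least a 'url' key
    assert isinstance(links, list), 'expected a list of links'
    for link in links:
        assert isinstance(link, dict), 'each link must be a dict'
        assert 'url' in link, 'each link must have a url'

check_links_structure([{'url': 'https://example.com'}])  # passes silently
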
Example #4
import re
from os.path import exists, join
from shutil import rmtree
from typing import List

# OUTPUT_DIR, ARCHIVE_DIR, and the index helpers (parse_json_links_index,
# write_json_links_index, write_html_links_index) come from the ArchiveBox module


def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
    if not exists(join(OUTPUT_DIR, 'index.json')):
        exit('index.json is missing; nothing to do')

    compiled = [re.compile(r) for r in regexes]
    links = parse_json_links_index(OUTPUT_DIR)
    filtered = []
    remaining = []

    for link in links:
        url = link['url']
        for regex in compiled:
            if regex.search(url):
                filtered.append((link, regex))
                break
        else:
            remaining.append(link)

    if not filtered:
        exit('Search did not match any entries.')

    print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))

    for link, regex in filtered:
        url = link['url']
        print(' {url} via {regex}'.format(url=url, regex=regex.pattern))

    if not proceed:
        answer = input('Remove {} entries from index? [y/n] '.format(
            len(filtered)))
        proceed = answer.strip().lower() in ('y', 'yes')

    if not proceed:
        exit('Aborted')

    write_json_links_index(OUTPUT_DIR, remaining)
    write_html_links_index(OUTPUT_DIR, remaining)

    if delete:
        for link, _ in filtered:
            data_dir = join(ARCHIVE_DIR, link['timestamp'])
            if exists(data_dir):
                rmtree(data_dir)
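
The for ... else in the filtering loop is the key idiom: the else clause runs only when the inner loop finishes without hitting break, i.e. when no regex matched, so the link lands in remaining. A standalone illustration:

import re

patterns = [re.compile(r'ads?\.'), re.compile(r'spam')]
urls = ['https://example.com/ads.js', 'https://example.com/page']

for url in urls:
    for pattern in patterns:
        if pattern.search(url):
            print('filtered:', url, 'via', pattern.pattern)
            break
    else:
        # no pattern matched, so the inner loop ended without break
        print('kept:', url)
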
Example #5
from datetime import datetime  # used for the log timestamp below


def get_links(new_links_file_path, archive_path=HTML_FOLDER):
    """get new links from file and optionally append them to links in existing archive"""
    # parse and validate the new_links_file
    raw_links = parse_links(new_links_file_path)
    valid_links = validate_links(raw_links)

    # merge existing links in archive_path and new links
    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        valid_links = validate_links(existing_links + valid_links)

    num_new_links = len(valid_links) - len(existing_links)
    print('[*] [{}] Adding {} new links from {} to index'.format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        num_new_links,
        new_links_file_path,
    ))

    return valid_links
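
The [*] [timestamp] log prefix here is the same strftime pattern as in example #2. Isolated below; the link count is a placeholder:

from datetime import datetime

print('[*] [{}] Adding {} new links to index'.format(
    datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    2,  # placeholder count
))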