def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
    """get new links from file and optionally append them to links in existing archive"""
    all_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        all_links = validate_links(raw_links)

    # merge existing links in archive_path and new links
    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        all_links = validate_links(existing_links + all_links)

    num_new_links = len(all_links) - len(existing_links)

    if SHOW_PROGRESS:
        print()
    if import_path:
        # only report when an import file was actually parsed, otherwise
        # pretty_path(import_path) and parser_name are not defined/meaningful
        print('    > Adding {} new links to index from {} (parsed as {} format)'.format(
            num_new_links,
            pretty_path(import_path),
            parser_name,
        ))

    if only_new:
        # return only the links that were not already present in the existing index
        return new_links(all_links, existing_links)

    return all_links
def merge_links(archive_path=OUTPUT_DIR, import_path=None, only_new=False):
    """get new links from file and optionally append them to links in existing archive"""
    all_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        all_links = validate_links(raw_links)

    # merge existing links in archive_path and new links
    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        all_links = validate_links(existing_links + all_links)

    num_new_links = len(all_links) - len(existing_links)
    if num_new_links and not only_new:
        print('{green}[+] [{}] Adding {} new links to index from {} ({} format){reset}'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            num_new_links,
            pretty_path(import_path),
            parser_name,
            **ANSI,
        ))
    # else:
    #     print('[*] [{}] No new links added to {}/index.json{}'.format(
    #         datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    #         archive_path,
    #         ' from {}'.format(import_path) if import_path else '',
    #         **ANSI,
    #     ))

    if only_new:
        return new_links(all_links, existing_links)

    return all_links
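# Minimal usage sketch for merge_links, assuming OUTPUT_DIR already contains a readable
# index.json and that 'bookmarks_export.html' (a hypothetical path) is a parseable import
# file. example_incremental_import is an illustrative helper, not part of the original code.
def example_incremental_import(import_file='bookmarks_export.html'):
    # full, deduplicated index (existing links + imported links)
    all_links = merge_links(archive_path=OUTPUT_DIR, import_path=import_file)
    # only the links that were not already in the existing index,
    # which is what an incremental archiving pass would operate on
    newly_added = merge_links(archive_path=OUTPUT_DIR, import_path=import_file, only_new=True)
    return all_links, newly_added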
def load_links(archive_path=OUTPUT_DIR, import_path=None):
    """get new links from file and optionally append them to links in existing archive"""

    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        check_links_structure(existing_links)

    new_links = []
    if import_path:
        # parse and validate the import file
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in archive_path and new links
    all_links = validate_links(existing_links + new_links)
    check_links_structure(all_links)
    num_new_links = len(all_links) - len(existing_links)

    if import_path and parser_name:
        print('    > Adding {} new links to index (parsed import as {})'.format(
            num_new_links,
            parser_name,
        ))

    return all_links, new_links
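# Minimal usage sketch for load_links: unlike merge_links it always returns a
# (all_links, new_links) tuple, so the caller decides what to persist and what to archive.
# example_update_index and the commented write call are illustrative assumptions.
def example_update_index(import_file=None):
    all_links, new_links = load_links(archive_path=OUTPUT_DIR, import_path=import_file)
    # e.g. write the merged index back to disk, then archive only new_links:
    # write_json_links_index(OUTPUT_DIR, all_links)
    return all_links, new_links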
def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
    """remove links matching any of the given regexes from the index, optionally deleting their data folders"""
    if not exists(join(OUTPUT_DIR, 'index.json')):
        exit('index.json is missing; nothing to do')

    compiled = [re.compile(r) for r in regexes]
    links = parse_json_links_index(OUTPUT_DIR)
    filtered = []
    remaining = []

    for l in links:
        url = l['url']
        for r in compiled:
            if r.search(url):
                filtered.append((l, r))
                break
        else:
            # no regex matched this url, so keep it in the index
            remaining.append(l)

    if not filtered:
        exit('Search did not match any entries.')

    print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))
    for link, regex in filtered:
        url = link['url']
        print('  {url} via {regex}'.format(url=url, regex=regex.pattern))

    if not proceed:
        # ask for confirmation unless the caller already opted in
        answer = input('Remove {} entries from index? [y/n] '.format(len(filtered)))
        proceed = answer.strip().lower() in ('y', 'yes')

    if not proceed:
        exit('Aborted')

    write_json_links_index(OUTPUT_DIR, remaining)
    write_html_links_index(OUTPUT_DIR, remaining)

    if delete:
        # also remove each filtered link's snapshot folder from disk
        for link, _ in filtered:
            data_dir = join(ARCHIVE_DIR, link['timestamp'])
            if exists(data_dir):
                rmtree(data_dir)
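# Illustrative invocation of cleanup_index: drop every pinterest.com entry from the index
# without prompting, and delete the matching snapshot folders. The regex and flag values
# are examples chosen for this sketch, not taken from the original code.
def example_purge_pinterest():
    cleanup_index(
        regexes=[r'pinterest\.com'],
        proceed=True,   # skip the interactive [y/n] confirmation
        delete=True,    # also remove the per-link folders under ARCHIVE_DIR
    )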
def get_links(new_links_file_path, archive_path=HTML_FOLDER):
    """get new links from file and optionally append them to links in existing archive"""

    # parse and validate the new_links_file
    raw_links = parse_links(new_links_file_path)
    valid_links = validate_links(raw_links)

    # merge existing links in archive_path and new links
    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
        valid_links = validate_links(existing_links + valid_links)

    num_new_links = len(valid_links) - len(existing_links)
    print('[*] [{}] Adding {} new links from {} to index'.format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        num_new_links,
        new_links_file_path,
    ))

    return valid_links
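# Usage sketch for the earliest get_links variant: the import file is a required positional
# argument and the full merged list is always returned (there is no only_new flag yet).
# 'pocket_export.html' is a hypothetical path used only for illustration.
def example_get_links():
    return get_links('pocket_export.html', archive_path=HTML_FOLDER)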