def main():
    """ Main program.

    Command-line driver (optparse era): parses options/arguments, validates
    the Ancestry.com Gedcom file, spins up two worker processes (Gedcom parse
    and cache-table load), launches a headless Firefox session, logs in, then
    walks the Gedcom line-by-line downloading user media (FILE/guid lines)
    and Ancestry source media (_APID lines), with checkpoint/resume support.
    Exits via sys.exit(1) on bad arguments or a non-Ancestry Gedcom.
    """
    # Ensure a clean shutdown (browser teardown etc.) on termination signals.
    for signal_type in [SIGTERM, SIGABRT]:
        signal(signal_type, clean_exit)
    parser = OptionParser("usage: %prog [options] accountName password gedcomFile")
    # -c is a hidden limiter (SUPPRESS_HELP); default effectively "unlimited".
    parser.add_option("-c", "--count", dest="count", default="999999", help=SUPPRESS_HELP, metavar="NUMBER")
    parser.add_option("-i", "--ignore", action="store_true", dest="ignore", default=False, help="Ignore previously identified unavailable APID entries")
    parser.add_option("-l", "--logfile", dest="logfile", default="ancestry_extract.log", help="Optional log file location", metavar="FILE")
    parser.add_option("-o", "--output", dest="output", default=".", help="Output directory", metavar="DIR")
    parser.add_option("-r", "--resume", action="store_true", dest="resume", default=False, help="Resume if prior state found")
    parser.add_option("-s", "--screenshot", action="store_true", dest="screenshot", default=False, help="Generate source record screenshots")
    parser.add_option("-u", "--url", dest="ancestry", default="https://www.ancestry.com", help="Override default https://www.ancestry.com URL")
    (options, args) = parser.parse_args()
    # Exactly three positional arguments: account name, password, gedcom path.
    if len(args) != 3:
        print('Account name, password, and gedcom file are required arguments')
        sys.exit(1)
    if not os.path.isfile(args[2]):
        print('Gedcom file not found')
        sys.exit(1)
    with open(args[2], "r") as gedcom:
        gedcom_data = gedcom.read()
    # Sanity check: the file must carry Ancestry.com's own SOUR/CORP markers.
    if '1 SOUR Ancestry.com Family Trees' not in gedcom_data \
            or '2 CORP Ancestry.com' not in gedcom_data:
        print('Gedcom file does not appear to be from Ancestry.com')
        sys.exit(1)
    # Stash the positional arguments on the options object so everything
    # downstream (including the browser session) sees a single config object.
    options.username = args[0]
    options.password = args[1]
    options.gedcom = args[2]
    # File log plus a simpler-format echo of INFO messages to the console.
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(name)-8s %(levelname)-8s %(message)s',
                        filename=options.logfile, filemode='a')
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
    # Create the output tree for downloaded media and metadata caches.
    for check_dir in ['/media/dbid', '/media/apid',
                      '/metadata/guid', '/metadata/apid', '/metadata/dbid']:
        if not os.path.isdir(options.output + check_dir):
            os.makedirs(options.output + check_dir)
    # Parse the Gedcom and load the cache tables in parallel worker processes;
    # results come back over the queues while the browser is being set up.
    gedcom_queue = Queue()
    gedcom_process = Process(target=load_gedcom, args=(gedcom_queue, gedcom_data))
    gedcom_process.start()
    cache_queue = Queue()
    cache_process = Process(target=load_tables, args=(cache_queue, options.output))
    cache_process.start()
    logging.info('Launching browser')
    # Headless Firefox profile tuned for silent downloads into the output dir.
    firefox_profile = FirefoxProfile()
    firefox_profile.set_preference("browser.startup.homepage", "about:blank")
    firefox_profile.set_preference("browser.download.folderList", 2)
    firefox_profile.set_preference("browser.download.panel.shown", False)
    firefox_profile.set_preference("browser.download.manager.showWhenStarting", False)
    firefox_profile.set_preference("browser.download.dir", options.output)
    firefox_profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
    firefox_profile.set_preference("places.history.enabled", False)
    firefox_options = Options()
    firefox_options.headless = True
    session = Firefox(options=firefox_options, firefox_profile=firefox_profile)
    atexit.register(session_cleanup, session)
    session.implicitly_wait(15)
    session.fullscreen_window()
    session.options = options
    login(session)
    # Cached state: resume checkpoint, tree identity, prior download caches.
    result = cache_queue.get()
    session.checkpoint = result['checkpoint']
    session.tree_id = result['tree_id']
    session.tree_name = result['tree_name']
    session.unavailable = []
    # Known-unavailable APIDs only matter when resuming or ignoring.
    if options.resume or options.ignore:
        session.unavailable = result['unavailable']
    session.hash_map = result['hash_map']
    session.images = result['image_cache']
    cache_process.join()
    # First-pass Gedcom statistics from the parser process.
    result = gedcom_queue.get()
    people = result['people']
    people_total = len(people)
    family_total = result['families']
    apid_total = result['apid_total']
    apid_unique = result['apid_unique']
    guid_total = result['guid_total']
    guid_unique = result['guid_unique']
    gedcom_process.join()
    logging.info('Found %d people and %d families to process', people_total, family_total)
    logging.info('Found %d unique and %d total ancestry media items to process', apid_unique, apid_total)
    logging.info('Found %d unique and %d total user media items to process', guid_unique, guid_total)
    # Second pass: walk the raw Gedcom text tracking current person/family
    # context, fetching media for FILE and _APID records as they appear.
    print_flag = False
    session.line_number = 0
    success = unavailable = duplicate = skip = timeouts = total = count = 0
    person_number = family_number = 0
    apid_number = guid_number = 0
    person = husband = wife = ''
    url_note = ''
    logging.info('Starting second pass processing Gedcom media items')
    for line in gedcom_data.split('\n'):
        session.line_number = session.line_number + 1
        # When resuming, fast-forward past lines already processed.
        if options.resume and session.line_number < session.checkpoint:
            continue
        options.resume = False
        if len(line) < 5:
            continue
        tag = line.split(' ')[1]
        # SOUR lines advance the checkpoint (record boundary for resume).
        if tag == 'SOUR':
            if session.line_number > session.checkpoint:
                session.checkpoint = session.line_number
            continue
        # @P.../@F... xref tags switch the current person/family context.
        if '@P' in tag:
            person_number = person_number + 1
            husband = wife = ''
            person = people[tag]
            print_flag = False
            continue
        if '@F' in tag:
            family_number = family_number + 1
            husband = wife = person = ''
            print_flag = False
            continue
        if tag == 'HUSB':
            husband = people[line[7:]]
            continue
        if tag == 'WIFE':
            wife = people[line[7:]]
            continue
        if tag == 'NOTE':
            if 'http' in line:
                url_note = line[7:]
            continue
        if tag in ['FILE', '_APID']:
            total = total + 1
            # Log the person/family header once per record (past checkpoint).
            if not print_flag:
                if session.line_number > session.checkpoint:
                    if person:
                        logging.info('Processing records for person %s (%d of %d)', person, person_number, people_total)
                    else:
                        who = join = ''
                        if husband != '':
                            who = husband
                            join = ' and '
                        if wife != '':
                            who = who + join + wife
                        logging.info('Processing records for family of %s (%d of %d)', who, family_number, family_total)
                print_flag = True
            # User-uploaded media referenced by guid in a FILE URL.
            if ' FILE ' in line and 'f=image&guid=' in line:
                guid_number = guid_number + 1
                logging.debug('User media item %d of %d with %d unique', guid_number, guid_total, guid_unique)
                result = user_media(session, line)
            # Ancestry source media identified by APID.
            if ' _APID ' in line:
                process_apid = True
                if options.ignore:
                    apid = line.split(' ').pop(2).strip()
                    if apid in session.unavailable:
                        process_apid = False
                        result = 'unavailable'
                if process_apid:
                    apid_number = apid_number + 1
                    # APIDs ending '::0' have no attached media image.
                    if '::0' not in line:
                        logging.debug('Ancestry media item %d of %d with %d unique', apid_number, apid_total, apid_unique)
                        result = ancestry_media(session, line)
            # NOTE(review): if neither fetch above ran on this line, `result`
            # still holds its value from a previous iteration (or the queue
            # dict before the loop) — tallies below may then re-count the
            # prior outcome. Looks intentional/tolerated; confirm upstream.
            if result == 'success':
                count = count + 1
                success = success + 1
            elif result == 'duplicate':
                duplicate = duplicate + 1
            elif result == 'unavailable':
                if person:
                    logging.info('Unavailable item for %s', person)
                else:
                    logging.info('Unavailable item for %s / %s', husband, wife)
                unavailable = unavailable + 1
            elif result == 'timeout':
                timeouts = timeouts + 1
            elif result == 'skip':
                skip = skip + 1
            # Hidden -c limiter: stop after this many successful downloads.
            if count == int(options.count):
                logging.info('Reached limit of %d records processed', count)
                break
    logging.info('Total overall records: %d', total)
    logging.info('Total processed records: %d', success)
    logging.info('Total duplicate records: %d', duplicate)
    logging.info('Total unavailable records: %d', unavailable)
    logging.info('Total skipped due to unavailable: %d', skip)
    logging.info('Total skipped due to timeouts: %d', timeouts)
def main():
    """ Main program.

    Command-line driver (argparse era): merges CLI options with an optional
    TOML config file, validates the Ancestry.com Gedcom, spins up two worker
    processes (Gedcom parse and cache-table load), launches a headless
    Firefox session, logs in, then walks the Gedcom line-by-line downloading
    user media (FILE/guid lines) and citation media (_APID lines), with
    checkpoint/resume support. Exits via sys.exit(1) on bad arguments or a
    non-Ancestry Gedcom.
    """
    # Ensure a clean shutdown (browser teardown etc.) on termination signals.
    for signal_type in [SIGTERM, SIGABRT]:
        signal(signal_type, clean_exit)
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--account", help="Account name")
    parser.add_argument("-c", "--config", help="Configuration file")
    # NOTE(review): store_true with default=True means -C can never turn the
    # option off — the flag is a no-op unless overridden via the config file.
    # Consider argparse.BooleanOptionalAction; left as-is to keep the
    # interface unchanged.
    parser.add_argument(
        "-C",
        "--citations",
        default=True,
        action="store_true",
        help="Save source images for citations",
    )
    parser.add_argument(
        "-g",
        "--gedcom",
        help="Gedcom file",
    )
    parser.add_argument(
        "-i",
        "--ignore",
        default=False,
        action="store_true",
        help="Ignore previously unavailable APID entries",
    )
    # NOTE(review): same store_true/default=True no-op pattern as -C.
    parser.add_argument(
        "-M",
        "--media",
        default=True,
        action="store_true",
        help="Save user media images",
    )
    parser.add_argument(
        "-N",
        "--newspapers",
        default=False,
        action="store_true",
        help="Save clipped newspaper images",
    )
    parser.add_argument("-o", "--output", help="Root of output directory structure")
    parser.add_argument("-p", "--password", help="Password")
    parser.add_argument(
        "-r",
        "--resume",
        default=False,
        action="store_true",
        help="Resume if prior state found",
    )
    parser.add_argument(
        "-S",
        "--screenshots",
        default=False,
        action="store_true",
        help="Save source citation screenshots",
    )
    parser.add_argument(
        "-u",
        "--url",
        dest="ancestry",
        default="https://www.ancestry.com",
        help="Override default https://www.ancestry.com",
    )
    args = parser.parse_args()
    # If required credentials/gedcom were not given on the command line, fall
    # back to the config file (default name if none was specified); any keys
    # found there are merged onto the args namespace.
    if not args.account or not args.password or not args.gedcom:
        if not args.config:
            args.config = "ancestry_extract.toml"
    if args.config:
        if os.path.isfile(args.config):
            with open(args.config, "r") as config_file:
                config_data = toml.load(config_file)
            for key in config_data:
                setattr(args, key, config_data[key])
    if not args.account or not args.password or not args.gedcom:
        print("Account name, password, and gedcom file are required arguments")
        sys.exit(1)
    if not os.path.isfile(args.gedcom):
        print("Gedcom file not found")
        sys.exit(1)
    with open(args.gedcom, "r") as gedcom:
        gedcom_data = gedcom.read()
    # Sanity check: the file must carry Ancestry.com's own SOUR/CORP markers.
    if ("1 SOUR Ancestry.com Family Trees" not in gedcom_data
            or "2 CORP Ancestry.com" not in gedcom_data):
        print("Gedcom file does not appear to be from Ancestry.com")
        sys.exit(1)
    # Create the output tree for logs, downloaded media and metadata caches.
    for check_dir in [
        "/logs",
        "/media/dbid",
        "/media/apid",
        "/metadata/guid",
        "/metadata/apid",
        "/metadata/dbid",
    ]:
        if not os.path.isdir(args.output + check_dir):
            os.makedirs(args.output + check_dir)
    # Timestamped log file per run, plus a simpler-format console echo.
    log_file = (args.output + "/logs/"
                + pendulum.now().format("YYYY-MM-DD-HH-MM")
                + "-ancestry-extract.log")
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)-8s %(levelname)-8s %(message)s",
        filename=log_file,
        filemode="a",
    )
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s %(message)s")
    console.setFormatter(formatter)
    logging.getLogger("").addHandler(console)
    # Record the effective configuration at the top of the log.
    if args.config:
        logging.info("Config File: " + args.config)
    logging.info("Gedcom File: " + args.gedcom)
    logging.info("Output Tree: " + args.output)
    logging.info("Save Citation Images: " + str(args.citations))
    logging.info("Save Citation Screenshots: " + str(args.screenshots))
    logging.info("Save User Media: " + str(args.media))
    logging.info("Save News Clippings: " + str(args.newspapers))
    # Parse the Gedcom and load the cache tables in parallel worker processes;
    # results come back over the queues while the browser is being set up.
    gedcom_queue = Queue()
    gedcom_process = Process(target=load_gedcom, args=(gedcom_queue, gedcom_data))
    gedcom_process.start()
    cache_queue = Queue()
    cache_process = Process(target=load_tables, args=(cache_queue, args.output))
    cache_process.start()
    logging.info("Launching browser")
    # Headless Firefox profile tuned for silent downloads into the output dir.
    firefox_profile = FirefoxProfile()
    firefox_profile.set_preference("browser.startup.homepage", "about:blank")
    firefox_profile.set_preference("browser.download.folderList", 2)
    firefox_profile.set_preference("browser.download.panel.shown", False)
    firefox_profile.set_preference("browser.download.manager.showWhenStarting", False)
    firefox_profile.set_preference("browser.download.dir", args.output)
    firefox_profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
    firefox_profile.set_preference("places.history.enabled", False)
    firefox_options = Options()
    firefox_options.headless = True
    session = Firefox(options=firefox_options, firefox_profile=firefox_profile)
    atexit.register(session_cleanup, session)
    session.implicitly_wait(15)
    session.fullscreen_window()
    session.options = args
    login(session)
    # Cached state: resume checkpoint, tree identity, prior download caches.
    result = cache_queue.get()
    session.checkpoint = result["checkpoint"]
    session.tree_id = result["tree_id"]
    session.tree_name = result["tree_name"]
    session.unavailable = []
    # Known-unavailable APIDs only matter when resuming or ignoring.
    if args.resume or args.ignore:
        session.unavailable = result["unavailable"]
    session.hash_map = result["hash_map"]
    session.images = result["image_cache"]
    cache_process.join()
    # First-pass Gedcom statistics from the parser process.
    result = gedcom_queue.get()
    people = result["people"]
    people_total = len(people)
    family_total = result["families"]
    apid_total = result["apid_total"]
    apid_unique = result["apid_unique"]
    guid_total = result["guid_total"]
    guid_unique = result["guid_unique"]
    gedcom_process.join()
    logging.info("Found %d people and %d families to process", people_total, family_total)
    logging.info(
        "Found %d unique and %d total ancestry citations to process",
        apid_unique,
        apid_total,
    )
    logging.info(
        "Found %d unique and %d total user media items to process",
        guid_unique,
        guid_total,
    )
    # Second pass: walk the raw Gedcom text tracking current person/family
    # context, fetching media for FILE and _APID records as they appear.
    print_flag = False
    session.line_number = 0
    success = unavailable = duplicate = skip = timeouts = total = count = 0
    person_number = family_number = 0
    apid_number = guid_number = 0
    person = husband = wife = ""
    url_note = ""
    logging.info("Starting second pass Gedcom processing")
    for line in gedcom_data.split("\n"):
        session.line_number = session.line_number + 1
        # When resuming, fast-forward past lines already processed.
        if args.resume and session.line_number < session.checkpoint:
            continue
        args.resume = False
        if len(line) < 5:
            continue
        # BUG FIX: was `line[0] == 1`, comparing a character to the int 1,
        # which is always False — so a stale NOTE URL was never cleared and
        # could leak into get_user_media for a later record.
        if line[0] == "1":
            # reset the url note for new records
            url_note = ""
        tag = line.split(" ")[1]
        # SOUR lines advance the checkpoint (record boundary for resume).
        if tag == "SOUR":
            if session.line_number > session.checkpoint:
                session.checkpoint = session.line_number
            continue
        # @P.../@F... xref tags switch the current person/family context.
        if "@P" in tag:
            person_number = person_number + 1
            husband = wife = ""
            person = people[tag]
            print_flag = False
            continue
        if "@F" in tag:
            family_number = family_number + 1
            husband = wife = person = ""
            print_flag = False
            continue
        if tag == "HUSB":
            husband = people[line[7:]]
            continue
        if tag == "WIFE":
            wife = people[line[7:]]
            continue
        if tag == "NOTE":
            if "http" in line:
                url_note = line[7:]
            continue
        if tag in ["FILE", "_APID"]:
            total = total + 1
            # Log the person/family header once per record (past checkpoint).
            if not print_flag:
                if session.line_number > session.checkpoint:
                    if person:
                        logging.info(
                            "Processing records for person %s (%d of %d)",
                            person,
                            person_number,
                            people_total,
                        )
                    else:
                        who = join = ""
                        if husband != "":
                            who = husband
                            join = " and "
                        if wife != "":
                            who = who + join + wife
                        logging.info(
                            "Processing records for family of %s (%d of %d)",
                            who,
                            family_number,
                            family_total,
                        )
                print_flag = True
            # User-uploaded media referenced by guid in a FILE URL.
            if args.media and " FILE " in line and "f=image&guid=" in line:
                guid_number = guid_number + 1
                logging.debug(
                    "User media item %d of %d with %d unique",
                    guid_number,
                    guid_total,
                    guid_unique,
                )
                result = get_user_media(session, line, url_note)
                url_note = ""
            # Ancestry citation media identified by APID.
            if args.citations and " _APID " in line:
                process_apid = True
                if args.ignore:
                    apid = line.split(" ").pop(2).strip()
                    if apid in session.unavailable:
                        process_apid = False
                        result = "unavailable"
                if process_apid:
                    apid_number = apid_number + 1
                    # APIDs ending '::0' have no attached media image.
                    if "::0" not in line:
                        logging.debug(
                            "Source citation media item %d of %d with %d unique",
                            apid_number,
                            apid_total,
                            apid_unique,
                        )
                        result = get_citation_media(session, line)
            # NOTE(review): if neither fetch above ran on this line, `result`
            # still holds its value from a previous iteration (or the queue
            # dict before the loop) — tallies below may then re-count the
            # prior outcome. Pre-existing behavior; confirm before changing.
            if result == "success":
                count = count + 1
                success = success + 1
            elif result == "duplicate":
                duplicate = duplicate + 1
            elif result == "unavailable":
                if person:
                    logging.info("Unavailable item for %s", person)
                else:
                    logging.info("Unavailable item for %s / %s", husband, wife)
                unavailable = unavailable + 1
            elif result == "timeout":
                timeouts = timeouts + 1
            elif result == "skip":
                skip = skip + 1
    logging.info("Total overall records: %d", total)
    logging.info("Total processed records: %d", success)
    logging.info("Total duplicate records: %d", duplicate)
    logging.info("Total unavailable records: %d", unavailable)
    logging.info("Total skipped due to unavailable: %d", skip)
    logging.info("Total skipped due to timeouts: %d", timeouts)