def fetch_missing_posts(filename, max_conf=None, min_conf=1, post_ids=None):
    with open(filename, 'r', encoding='utf-8') as file:
        confessions = post.make_id_map(
            [post.deserialize(item) for item in json.loads(file.read())])
    if post_ids:
        with open(post_ids, 'r', encoding='utf-8') as file:
            post_id_list = json.loads(file.read())
    # Drop the entry keyed by None if it exists; the second argument to pop
    # avoids a KeyError when it doesn't.
    confessions.pop(None, None)
    if max_conf is None:
        max_conf = max(confessions.keys())
    temp_file = open('./output/_posts_less_missing.json', 'w', encoding='utf-8')
    since_last = 0
    for i in range(max_conf, min_conf - 1, -1):
        if i not in confessions:
            if post_ids is None:
                found = post.make_id_map(parse_search(str(i)))
                confessions.update(found)
                print('Was missing confession #%d; found %s' %
                      (i, ' '.join(map(str, found.keys()))
                       or '[presumably deleted]'))
            else:
                post_id = post_id_list.get(str(i))
                if post_id:
                    # Key by the integer confession number so it matches the
                    # membership check above.
                    confessions[i] = scraper.fetch_post(post_id)
                    print('Found missing confession %d' % i)
                else:
                    print('Could not find missing confession %d' % i)
            # Checkpoint every 20 iterations, overwriting the temp file so it
            # stays a single valid JSON document.
            since_last += 1
            if since_last > 20:
                temp_file.seek(0)
                temp_file.truncate()
                temp_file.write(
                    json.dumps(
                        [conf.serialize() for conf in confessions.values()],
                        indent=2))
                since_last = 0
                print('Saved')
    temp_file.close()
    filename = './output/posts_%s_less_missing.json' % datetime.now().strftime(
        '%Y-%m-%d_%H.%M.%S')
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(
            json.dumps([conf.serialize() for conf in confessions.values()],
                       indent=2))
    print('Fetched missing posts in %s' % filename)
    return filename
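
# Hedged usage sketch for fetch_missing_posts: the file paths below are
# assumptions for illustration, not files shipped with the repo. Without
# post_ids the function searches for each missing number via parse_search;
# with post_ids it looks the number up in a confession-number -> post-ID map.
#
#   fetch_missing_posts('./output/posts_2020-01-01_00.00.00.json')
#   fetch_missing_posts('./output/posts_2020-01-01_00.00.00.json',
#                       post_ids='./output/searched_post_ids.json')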
def fetch_missing_from_json(already_found, to_fetch):
    with open(already_found, 'r', encoding='utf-8') as file:
        confessions = post.make_id_map(
            [post.deserialize(item) for item in json.loads(file.read())])
    with open(to_fetch, 'r', encoding='utf-8') as file:
        post_id_list = json.loads(file.read())
    temp_file = open('./output/_posts_fetch_missing.json', 'w', encoding='utf-8')
    since_last = 0
    for conf_num, post_id in post_id_list.items():
        if conf_num == 'null':
            continue
        if conf_num.startswith('worrying') or int(conf_num) not in confessions:
            try:
                fetched = scraper.fetch_post(post_id)
            except Exception:
                print('There was a problem getting #%s. Retrying in five seconds.'
                      % conf_num)
                # Checkpoint before retrying, overwriting the temp file so it
                # stays a single valid JSON document.
                temp_file.seek(0)
                temp_file.truncate()
                temp_file.write(
                    json.dumps(
                        [conf.serialize() for conf in confessions.values()],
                        indent=2))
                since_last = 0
                print('Saved')
                time.sleep(5)
                fetched = scraper.fetch_post(post_id)
            real_num = fetched.conf_num()
            if str(real_num) != conf_num:
                print('Was fetching confession %s but got #%s?? (post ID %s)' %
                      (conf_num, real_num, post_id))
            confessions[real_num] = fetched
            print('Found missing confession %s' % real_num)
            # Checkpoint every 20 fetches
            since_last += 1
            if since_last > 20:
                temp_file.seek(0)
                temp_file.truncate()
                temp_file.write(
                    json.dumps(
                        [conf.serialize() for conf in confessions.values()],
                        indent=2))
                since_last = 0
                print('Saved')
    temp_file.close()
    filename = './output/fetched_missing_posts_%s.json' % datetime.now(
    ).strftime('%Y-%m-%d_%H.%M.%S')
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(
            json.dumps([conf.serialize() for conf in confessions.values()],
                       indent=2))
    print('Fetched missing posts in %s' % filename)
    return filename
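
# Hedged usage sketch for fetch_missing_from_json: the paths are illustrative
# assumptions. already_found is a serialized posts dump; to_fetch maps
# confession numbers (or 'worrying...' keys) to post IDs, as produced by the
# search step above.
#
#   fetch_missing_from_json('./output/posts_2020-01-01_00.00.00.json',
#                           './output/searched_post_ids.json')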
def create_missing_visual(found=None, searched=None, spreadsheet=None):
    if found:
        with open(found, 'r', encoding='utf-8') as file:
            confessions = post.make_id_map(
                [post.deserialize(item) for item in json.loads(file.read())])
        max_conf = max(key for key in confessions.keys()
                       if isinstance(key, int))
    else:
        confessions = {}
        max_conf = 1
    if searched:
        with open(searched, 'r', encoding='utf-8') as file:
            has_post_id = set(
                int(num) for num in json.loads(file.read()).keys()
                if num != 'null' and not num.startswith('worrying'))
    else:
        has_post_id = set()
    if spreadsheet:
        # Blank lines in the spreadsheet export mark confessions known to be gone;
        # line numbers are 1-based, hence the i + 1.
        with open(spreadsheet, 'r', encoding='utf-8') as file:
            truly_missing = set(i + 1 for i, line in enumerate(file)
                                if line.strip() == '')
    else:
        truly_missing = set()
    data = [colours['empty']] * max_conf
    for num in range(1, max_conf):
        colour = ('found' if num in confessions else
                  'has_post_id' if num in has_post_id else
                  'truly_missing' if num in truly_missing else 'missing')
        # Alternate between the normal and '_alt' palette every THOUSAND
        # confessions so the bands are easy to count by eye.
        thousands = math.floor(num / THOUSAND)
        data[num] = colours[colour if thousands % 2 == 0 else colour + '_alt']
    # https://stackoverflow.com/a/435215
    image = Image.new('RGB', (WIDTH, math.ceil(max_conf / WIDTH)),
                      colours['empty'])
    image.putdata(data)
    save_target = './output/missing_%s.png' % datetime.now().strftime(
        '%Y-%m-%d_%H.%M.%S')
    image.save(save_target)
    print('Saved missing confession visualization in %s' % save_target)
    return save_target
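
# Hedged usage sketch for create_missing_visual: all three inputs are optional
# and the paths are illustrative assumptions. Each confession number becomes
# one pixel, coloured by whether it was found, has a known post ID, is known
# to be deleted, or is still unaccounted for.
#
#   create_missing_visual(found='./output/posts_2020-01-01_00.00.00.json',
#                         searched='./output/searched_post_ids.json',
#                         spreadsheet='./output/confessions_export.csv')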
def get_missing_numbers(filename, max_conf=None, min_conf=1):
    with open(filename, 'r', encoding='utf-8') as file:
        confessions = post.make_id_map(
            [post.deserialize(item) for item in json.loads(file.read())])
    if max_conf is None:
        max_conf = max(confessions.keys())
    missing = '\n'.join([
        str(num) for num in range(min_conf, max_conf + 1)
        if num not in confessions
    ])
    filename = './output/missing_nums_%s.txt' % datetime.now().strftime(
        '%Y-%m-%d_%H.%M.%S')
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(missing)
    print('Missing confession numbers in %s' % filename)
    return filename
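
# Hedged usage sketch (path is an assumption): writes one missing confession
# number per line and returns the output path.
#
#   get_missing_numbers('./output/posts_2020-01-01_00.00.00.json')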
def load_confessions(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        return [post.deserialize(item) for item in json.loads(file.read())]
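
# Hedged usage sketch (path is an assumption):
#
#   confessions = load_confessions('./output/posts_2020-01-01_00.00.00.json')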