def attach_all_skills(upto=100):
    """Run ``update`` against the skills DB for the top-priority usernames.

    Both ``repos.json`` and ``skills.json`` are opened partitioned by
    ``username``. The number of usernames processed is the larger of
    ``upto`` and ``len(repos db) - len(skills db)``.

    Args:
        upto: Lower bound on how many usernames to process.
    """
    repos_db = MicroDB(jsons_dir + 'repos.json', partition_keys=['username'])
    skills_db = MicroDB(jsons_dir + 'skills.json', partition_keys=['username'])

    # Process at least as many usernames as the size gap between the two DBs.
    backlog = len(repos_db) - len(skills_db)
    limit = max(upto, backlog)

    prioritized_usernames = sort_by_priotity(repos_db, skills_db)
    for username in tqdm(prioritized_usernames[:limit]):
        update(username, skills_db)
def scrap_repos():
    """Run the update wrapper over every repo that needs it, 10 threads wide.

    Results are consumed as they complete. A window of the 10 most recent
    results is kept; once the window is full and at least 8 of them report
    ``'chromeless'`` as the error place, the run is aborted.

    Raises:
        Exception: when chromeless failed in 8 or more of the last 10 results.
    """
    repos_db = MicroDB(jsons_dir + 'repos.json', partition_keys=['full_name'])
    gifs_db = MicroDB(jsons_dir + 'gifs.json', partition_keys=['full_name'])

    pending_repos = exact_update_required(repos_db, gifs_db)
    jobs = [(gifs_db, repo) for repo in pending_repos]

    # Newest result first; 1 marks a chromeless failure, 0 anything else.
    recent_failures = []
    with ThreadPool(processes=10) as pool:
        results = pool.imap_unordered(update_mutlitherading_wrapper, jobs)
        for _success, error_place in tqdm(results, total=len(pending_repos)):
            failed = int(error_place == 'chromeless')
            # Prepend and keep only the 10 most recent entries.
            recent_failures = [failed] + recent_failures[:9]
            if len(recent_failures) >= 10 and sum(recent_failures) >= 8:
                raise Exception('chromeless failed too many times')
def save_all_repos():
    """Fetch all repos, filter them, and upsert each one into ``repos.json``.

    The fetched list is passed through ``trim_repos`` and then
    ``exclude_no_thanks`` before being written. The DB is partitioned by
    ``full_name`` and saved once after all upserts.
    """
    repos = exclude_no_thanks(trim_repos(get_all_repos()))

    repos_db = MicroDB(jsons_dir + 'repos.json', partition_keys=['full_name'])
    for repo in repos:
        repos_db.upsert(dictionary=repo)
    repos_db.save()
def attach_all_geotag(count=100):
    """Run ``update`` against the geotag DB for the top-priority usernames.

    After updating the first ``count`` usernames, every repos entry that has
    no geotag entry is printed together with a running counter.

    Args:
        count: Maximum number of usernames to update.
    """
    repos_db = MicroDB(jsons_dir + 'repos.json', partition_keys=['username'])
    geotag_db = MicroDB(jsons_dir + 'geotag.json', partition_keys=['username'])

    prioritized_usernames = sort_by_priotity(repos_db, geotag_db)
    for username in tqdm(prioritized_usernames[:count]):
        update(username, geotag_db)

    # Report the repos entries that still lack a geotag.
    missing = 0
    for entry in repos_db.all():
        if geotag_db.get(entry) is not None:
            continue
        missing += 1
        print(missing, entry)
def exact_yet_stared_succeed_repos():
    """Return repos entries with a successful gif entry and zero stargazers.

    An entry from ``repos.json`` is selected when its matching ``gifs.json``
    entry has a truthy ``'success'`` value and the repo's
    ``'stargazers_count'`` equals 0. Both DBs are partitioned by
    ``full_name``.

    Returns:
        list: The selected repos entries, in the order ``all()`` yields them.
    """
    mdb_repos = MicroDB(jsons_dir + 'repos.json', partition_keys=['full_name'])
    mdb_gifs = MicroDB(jsons_dir + 'gifs.json', partition_keys=['full_name'])

    yet_stared_succeed_repos = []
    for repo in mdb_repos.all():
        gifjson = mdb_gifs.get(repo)
        # A repo may have no gif entry yet; subscripting None would raise
        # TypeError, so such repos are simply not selected.
        if gifjson is None:
            continue
        if gifjson['success'] and repo['stargazers_count'] == 0:
            yet_stared_succeed_repos.append(repo)
    return yet_stared_succeed_repos
def del_wrong_data():
    """Delete every ``gifs.json`` entry whose ``'success'`` value is a string.

    Each matching entry is printed as it is found, followed by the number of
    matches and the full list. Every match is then removed by its generated
    key (which is also printed), and the DB is saved.
    """
    gifs_db = MicroDB(jsons_dir + 'gifs.json', partition_keys=['full_name'])

    # Collect first, delete afterwards, so the DB is not mutated while iterating.
    broken_entries = []
    for entry in gifs_db.all():
        if not isinstance(entry['success'], str):
            continue
        broken_entries.append(entry)
        print(entry)

    print(len(broken_entries))
    print(broken_entries)

    for entry in broken_entries:
        key = gifs_db.gen_key(entry)
        print(key)
        del gifs_db[key]
    gifs_db.save()
def chunks(list_, chunk_len):
    '''
    Split list_ into consecutive slices of at most chunk_len items.

    chunks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 3)
    >> [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    '''
    starts = range(0, len(list_), chunk_len)
    return [list_[start:start + chunk_len] for start in starts]


def numberize(string):
    '''Strip every non-digit character from string and return the rest as an int.'''
    digits_only = re.sub(r'\D', '', string)
    return int(digits_only)


# Open each JSON-backed DB and print its size.
mdb_repos = MicroDB(jsons_dir + 'repos.json', partition_keys=['full_name'])
print('mdb_repos', len(mdb_repos))

mdb_geotags = MicroDB(jsons_dir + 'geotag.json', partition_keys=['username'])
print('mdb_geotags', len(mdb_geotags))

mdb_gifs = MicroDB(jsons_dir + 'gifs.json', partition_keys=['full_name'])
print('mdb_gifs', len(mdb_gifs))

mdb_skills = MicroDB(jsons_dir + 'skills.json', partition_keys=['username'])
print('mdb_skills', len(mdb_skills))

merged_db = []
def test(): if os.path.exists(filename): os.remove(filename) mdb = MicroDB(filename, testpartition_keys) mdb.erase_all() mdb = MicroDB(filename, testpartition_keys) for d in test_data: mdb.upsert(d) mdb.save() mdb.pprint_all() mdb2 = MicroDB(filename, testpartition_keys) for d in mdb2.all(): print(d) mdb2.save_as_grid() mdb3 = MicroDB(filename, testpartition_keys) for d in mdb3.all(): print(d) mdb4 = MicroDB(filename, testpartition_keys) mdb4.upsert({ 'job': 'study', 'name': 'Bob', 'status': 'undone', 'extra-info': 'hogehoge' }) try: mdb4.save_as_grid() except Exception as e: print(e) mdb4.save() mdb4