def download(course, item):
    """
    Download announcement JSON.

    :param course: A Course object.
    :param item: {
        "close_time": 2147483647,
        "user_id": 1069689,
        "open_time": 1411654451,
        "title": "Coursera",
        "deleted": 0,
        "email_announcements": "email_sent",
        "section_id": "14",
        "order": "6",
        "item_type": "announcement",
        "__type": "announcement",
        "published": 1,
        "item_id": "39",
        "message": "Hello, everyone.",
        "uid": "announcement39",
        "id": 39,
        "icon": ""
    }
    :return: None.
    """
    path = '{}/announcement/{}.json'
    path = path.format(course.get_folder(), item['item_id'])

    util.make_folder(path, True)
    util.write_json(path, item)

    content = util.read_file(path)
    content = util.remove_coursera_bad_formats(content)
    util.write_file(path, content)
def predict(num=1):  # num is the number of answer sentences to extract
    system(r'.\svm_rank_windows\svm_rank_classify.exe %s %s %s' %
           (test_feature_path, model_path, test_predict_path))
    with open(test_feature_path, 'r', encoding='utf-8') as f1, \
            open(test_predict_path, 'r', encoding='utf-8') as f2:
        labels = {}
        for line1, line2 in zip(f1, f2):
            if len(line1) == 1:
                break
            qid = int(line1.split()[1].split(':')[1])
            if qid not in labels:
                labels[qid] = []
            labels[qid].append((float(line2.strip()), len(labels[qid])))
    seg_passages, res_lst = load_seg_passages(), read_json(test_path)
    for item in res_lst:  # iterate over each query record in the file
        qid, pid, q_words = item['qid'], item['pid'], item['question']
        rank_lst, seg_passage = sorted(
            labels[qid], key=lambda val: val[0],
            reverse=True), seg_passages[str(pid)]
        item['answer_sentence'] = [
            seg_passage[rank[1]] for rank in rank_lst[:num]
        ]  # extract the top-ranked answer sentences
    write_json(test_ans_path, res_lst)
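# A minimal sketch of the qid grouping in predict() above (illustrative data,
# not the real SVM-rank files): each feature line looks like
# "<label> qid:<n> <feature>:<value> ...", and the prediction file holds one
# score per line, so zipping the two files pairs each sentence with its score.
feature_line = '1 qid:7 1:0.5 2:0.1'
score_line = '0.8342\n'
qid = int(feature_line.split()[1].split(':')[1])
score = float(score_line.strip())
print(qid, score)  # 7 0.8342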
def make_examples_simple(data_dir, n_users, negative_examples_per_user=10):
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)

    new_edges = defaultdict(dict)
    with open(data_dir + 'new_edges.txt') as f:
        for line in f:
            u, b = map(int, line.split())
            new_edges[u][b] = 1

    businesses = map(int, util.load_json(data_dir + 'business.json').keys())

    examples = defaultdict(dict)
    users = random.sample([NI.GetId() for NI in G.Nodes()], n_users)
    for u in users:
        examples[u] = new_edges[u]
        for i in range(negative_examples_per_user):
            b = random.choice(businesses)
            examples[u][b] = 0

    p, n = 0, 0
    for u in examples:
        for b in examples[u]:
            p += examples[u][b]
            n += 1 - examples[u][b]
    print "Positive:", p
    print "Negative:", n
    print "Data skew:", p / float(p + n)
    print "Sampling rate:", negative_examples_per_user / float(len(businesses))

    print "Writing examples..."
    util.write_json(examples, data_dir + 'examples_simple.json')
def write_pr_matches(outfile):
    data = {}
    entries = get_names_from_elastic()
    if entries:
        for elem in entries:
            sdn_id = elem['_id']
            name = elem['_source']['primary_display_name']
            data[sdn_id] = []
            result = query_pr_content(name)
            for entry in result['hits']['hits']:
                pr_elem = {
                    'pr_id': entry['_id'],
                    'link': entry['_source']['link'],
                    'date': entry['_source']['date'],
                    'title': entry['_source']['title'],
                }
                data[sdn_id].append(pr_elem)
                other_dates = query_pr_date(entry['_source']['date'])
                for date_entry in other_dates['hits']['hits']:
                    if (date_entry['_id'] != entry['_id']
                            and date_entry['_source']['link'] in entry['_source']['related']):
                        new_elem = {
                            'pr_id': date_entry['_id'],
                            'link': date_entry['_source']['link'],
                            'date': date_entry['_source']['date'],
                            'title': date_entry['_source']['title'],
                        }
                        data[sdn_id].append(new_elem)
    util.write_json(outfile, data)
def main():
    train_messages, train_labels = util.load_spam_dataset('../data/ds6_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('../data/ds6_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('../data/ds6_test.tsv')

    dictionary = create_dictionary(train_messages)
    util.write_json('./output/p06_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)
    np.savetxt('./output/p06_sample_train_matrix', train_matrix[:100, :])
    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(
        naive_bayes_model, test_matrix)
    np.savetxt('./output/p06_naive_bayes_predictions', naive_bayes_predictions)

    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)
    print('Naive Bayes had an accuracy of {} on the testing set'.format(
        naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)
    util.write_json('./output/p06_top_indicative_words', top_5_words)
def main():
    train_tweets, val_tweets, test_tweets, train_labels, val_labels, test_labels = \
        load_dataset("final_data/compiled_data.csv")

    dictionary = create_dictionary(train_tweets)
    util.write_json('./output/dictionary', dictionary)

    train_matrix = transform_text(train_tweets, dictionary)
    val_matrix = transform_text(val_tweets, dictionary)
    test_matrix = transform_text(test_tweets, dictionary)

    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(naive_bayes_model, test_matrix)
    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)

    print("naive_bayes_results: ")
    unique, counts = np.unique(naive_bayes_predictions, return_counts=True)
    print(dict(zip(unique, counts)))
    print("test_labels: ")
    unique, counts = np.unique(test_labels, return_counts=True)
    print(dict(zip(unique, counts)))
    print('Naive Bayes had an accuracy of {} on the testing set'.format(naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)

    optimal_radius = compute_best_svm_radius(train_matrix, train_labels,
                                             val_matrix, val_labels,
                                             [0.01, 0.1, 1, 10])
    util.write_json('./output/p06_optimal_radius', optimal_radius)
    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels,
                                                test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    print('The SVM model had an accuracy of {} on the testing set'.format(svm_accuracy))
def run():
    results = {}
    index = get_index()
    datetimes = list(index.keys())
    for date in datetimes:
        hn_html_soup = BeautifulSoup(open(index[date][HN_KEY], "r"))
        dt_html_soup = BeautifulSoup(open(index[date][DT_KEY], "r"))

        hn_titles = extract_titles(hn_html_soup)
        hn_subtexts = extract_subtexts(hn_html_soup)
        dt_titles = extract_titles(dt_html_soup)
        dt_subtexts = extract_subtexts(dt_html_soup)

        results[date] = {
            "HackerNews": {
                "links": extract_links(hn_titles),
                "scores": extract_scores(hn_subtexts),
                "users": extract_users(hn_subtexts),
                "post_ages": extract_post_ages(hn_subtexts),
                "nr_of_comments": extract_nr_of_comments(hn_subtexts)
            },
            "DataTau": {
                "links": extract_links(dt_titles),
                "scores": extract_scores(dt_subtexts),
                "users": extract_users(dt_subtexts),
                "post_ages": extract_post_ages(dt_subtexts),
                "nr_of_comments": extract_nr_of_comments(dt_subtexts)
            }
        }
    write_json(results, "data/aggregation_results.json")
def post(self):
    """POST handler for gallery albums.

    URL pattern: /albums

    POST data must contain album metadata: 'name'.

    Returns 201 CREATED with JSON data structure describing new album.
    Returns Content-type: application/json.
    Also returns Location header pointing to API URL for album details.
    Include 'wrapjson' parameter in POST to wrap returned JSON in a
    <textarea>. This also changes the returned Content-type to text/html.
    If request is poorly formatted returns 400 BAD REQUEST.
    Returns 401 UNAUTHORIZED to all calls if authorization fails.
    """
    try:
        data = dict(((str(k), v) for k, v in self.request.POST.items()))
        album = Album(album_id=config.ALBUM_ID_GENERATOR(), **data)
    except:
        data = {}
        self.error(400)
    else:
        if not config.DEMO_MODE:
            album.put()
        data = album.to_dict()
        self.response.headers['Location'] = data['url']
        self.response.set_status(201)
    write_json(self, data, wrapjson='wrapjson' in self.request.POST)
def export_sequence_json(T, path, prefix):
    from Bio import SeqIO
    plain_export = 0.99
    indent = None

    elems = {'root': {}}
    for node in T.find_clades():
        elems[node.clade] = {}

    for gene, aln_fname in get_genes_and_alignments(path, tree=True):
        seqs = {}
        for seq in SeqIO.parse(aln_fname, 'fasta'):
            seqs[seq.name] = seq
        root_seq = seqs[T.root.name]
        elems['root'][gene] = "".join(root_seq)
        for node in T.find_clades():
            nseq = seqs[node.name]
            if hasattr(node, "clade"):
                differences = {
                    pos: state
                    for pos, (state, ancstate) in enumerate(
                        zip(nseq, elems['root'][gene]))
                    if state != ancstate
                }
                # Store per-node diffs against the root; fall back to the full
                # sequence if nearly every position differs.
                # (Fixed: the original referenced the stale loop variable `seq`
                # from the SeqIO loop instead of `nseq`.)
                if len(differences) <= plain_export * len(nseq):
                    elems[node.clade][gene] = differences
                else:
                    elems[node.clade][gene] = nseq

    fname = sequence_json(path, prefix)
    write_json(elems, fname, indent=indent)
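# Illustrative sketch of the diff encoding above: only positions where a node's
# sequence differs from the root sequence are stored, keyed by position.
root_seq = "ACGT"
node_seq = "ACTT"
differences = {pos: state
               for pos, (state, ancstate) in enumerate(zip(node_seq, root_seq))
               if state != ancstate}
print(differences)  # {2: 'T'}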
def calc_vsm_perform(similarity_func=calc_inner_product):
    if similarity_func.__name__ not in [
            calc_cosine.__name__, calc_inner_product.__name__,
            calc_jaccard.__name__
    ]:
        print('Invalid similarity function...')
        return
    print('Loading the preprocessed training set...')
    if file_exists(preprocess_path):
        res_lst = read_json(preprocess_path)  # load the preprocessed training file
    else:
        res_lst = read_json(train_path)  # load the raw training file
        for question in res_lst:
            question['question'] = seg_line(question['question'])
        write_json(preprocess_path, res_lst)
    print('Computing similarity...')
    res = {}
    for item in res_lst:
        q_words, pid = {}, item['pid']
        for word in item['question']:
            q_words[word] = q_words.get(word, 0) + 1
        # tf-idf weight for each query term: idf * (1 + log10(tf))
        query_dic = {
            word: idf.get(word, 0) * (1 + log(tf, 10))
            for word, tf in q_words.items()
        }
        pred_pid = similarity_func(query_dic)[0][0]
        res[item['qid']] = int(pred_pid) == pid
        print('Progress: %.2f%%' % (len(res) / len(res_lst) * 100))
    # fraction of questions whose passage was predicted correctly
    return len(list(filter(lambda val: res[val], res))) / len(res)
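# A small, self-contained sketch of the query weighting above (toy idf table,
# not the project's): each query term gets weight idf(term) * (1 + log10(tf)).
from math import log
idf = {'hello': 2.0, 'world': 1.5}
q_words = {'hello': 2, 'world': 1}  # term frequencies in the query
query_dic = {w: idf.get(w, 0) * (1 + log(tf, 10)) for w, tf in q_words.items()}
print(query_dic)  # {'hello': 2.602..., 'world': 1.5}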
def comment(repo_dir, report):
    good = []  # All commits on all branches
    bad = []  # No commits
    ugly = []  # Partial uplift
    good, bad, ugly = classify_gbu(report)
    failed_bugs = []

    def remove_from_report(bug_id):
        del report[bug_id]
        util.write_json(uplift.uplift_report_file, report)

    for bug_ids, comment_func in (good, good_bug_comment), (bad, bad_bug_comment), (ugly, ugly_bug_comment):
        for bug_id in bug_ids:
            print "Commenting on bug %s" % bug_id
            try:
                comment_func(repo_dir, bug_id, report[bug_id])
                remove_from_report(bug_id)
            except FailedToComment:
                failed_bugs.append(bug_id)

    if len(failed_bugs) > 0:
        filename = os.path.abspath('failed_comments_%s.json' % util.time_str())
        print "The following bugs had commenting failures"
        print util.e_join(failed_bugs)
        print "Creating a file to use with the 'uplift comments' command to retry just these."
        print "Fix the issue then run: uplift comments %s" % filename
        util.write_json(filename, report)
def run():
    npcs = {}
    npc_pages = api.query_category("Monsters")
    for name, page in npc_pages.items():
        if name.startswith("Category:"):
            continue

        try:
            code = mw.parse(page, skip_style_tags=True)

            for (vid, version) in util.each_version("Infobox Monster", code):
                doc = util.get_doc_for_id_string(name + str(vid), version, npcs)
                if doc is None:
                    continue
                util.copy("name", doc, version, lambda x: x)
                for key in ["hitpoints", "combat"]:
                    try:
                        util.copy(key, doc, version, lambda x: int(x))
                    except ValueError:
                        print("NPC {} has a non-integer {}".format(name, key))
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            print("NPC {} failed:".format(name))
            traceback.print_exc()

    for npcId in copy.copy(npcs):
        npc = npcs[npcId]
        if 'combat' not in npc:
            del npcs[npcId]

    util.write_json("npcs.json", "npcs.min.json", npcs)
def download_thread(course, threads_folder, thread_id, page=1, post_id=None):
    """
    Download a thread.
    """
    # Download the 1st page
    url = '{}/api/forum/threads/{}'.format(course.get_url(), thread_id)
    if post_id:
        url = '{}?post_id={}&position=after'.format(url, post_id)
    path = '{}/{}/{}.json'.format(threads_folder, thread_id, page)
    util.download(url, path, course.get_cookie_file())

    thread = util.read_json(path)
    download_images(course, threads_folder, thread)
    util.write_json(path, thread)

    # Download the remaining pages recursively
    page = thread['start_page']
    num_page = thread['num_pages']
    if page < num_page:
        page += 1
        print 'thread page {}/{}'.format(page, num_page)
        post_id = get_next_post_id(thread['posts'])
        if post_id:
            download_thread(course, threads_folder, thread_id, page, post_id)
def build_uplift_requirements(repo_dir):
    if os.path.exists(requirements_file) and util.ask_yn("Found existing requirements. Should they be used?"):
        bug_info = util.read_json(requirements_file)
    else:
        bug_info = {}
        enabled_branches = c.read_value('repository.enabled_branches')
        all_queries = c.read_value('queries')
        queries = []
        for branch in enabled_branches:
            queries.extend(all_queries[branch])
        bugs = [x for x in find_bugs(queries) if not is_skipable(x)]
        print "Fetching bug data"
        for bug_id in bugs:
            if is_skipable(bug_id):
                continue
            bug = bzapi.fetch_complete_bug(bug_id)
            print "+",
            needed_on = branch_logic.needed_on_branches(bug)
            if len(needed_on) == 0:
                continue
            b = bug_info[bug_id] = {}
            b['needed_on'] = needed_on
            b['already_fixed_on'] = branch_logic.fixed_on_branches(bug)
            b['summary'] = bug['summary']
        print "\nFinished fetching bug data"
        util.write_json(requirements_file, bug_info)
    return bug_info
def find_threads(course, forum_folder, forum_id):
    """
    Find all threads in current forum.
    Note: forum 0 has every thread!
    """
    # download the 1st page of given forum
    query = 'sort=firstposted&page=1'
    url = '{}/api/forum/forums/{}/threads?{}'
    url = url.format(course.get_url(), forum_id, query)
    path = forum_folder + '/temp.json'
    util.download(url, path, course.get_cookie_file())

    # download a huge page with all threads
    forum = util.read_json(path)
    num_threads = forum['total_threads']
    url += '&page_size={}'.format(num_threads)
    util.download(url, path, course.get_cookie_file())

    # add each thread's id to forum info
    threads = util.read_json(path)['threads']
    util.remove(path)

    path = forum_folder + '/info.json'
    forum = util.read_json(path)
    forum_threads = []
    for thread in reversed(threads):
        forum_threads.append({'id': thread['id']})
    forum['num_threads'] = num_threads
    forum['threads'] = forum_threads
    util.write_json(path, forum)
def run_random_walks(data_dir, weight_edges=False):
    print "Loading data and building transition matrix..."
    examples = util.load_json('./data/' + data_dir + '/examples.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)
    if weight_edges:
        reviews = util.load_json('./data/' + data_dir + '/review.json')
        end_date = datetime.date(2012, 1, 1) if data_dir == 'train' else datetime.date(2013, 1, 1)
        edges = G.edges()
        for e in util.logged_loop(edges, util.LoopLogger(20000, len(edges), True)):
            n1, n2 = str(e[0]), str(e[1])
            if n1 not in reviews or n2 not in reviews[n1]:
                n1, n2 = n2, n1
            # weight edges by recency: more recent reviews get higher weight
            G[e[0]][e[1]]['weight'] = 1.0 / ((end_date - get_date(reviews[n1][n2][0])).days + 90)
        del reviews  # save some memory

    adjacency_matrix = nx.adjacency_matrix(G)
    inverse_degree_matrix = sparse.diags([[1.0 / adjacency_matrix.getrow(i).sum()
                                           for i in range(adjacency_matrix.shape[0])]], [0])
    transition_matrix = inverse_degree_matrix.dot(adjacency_matrix)

    print "Running random walks..."
    for u in util.logged_loop(examples, util.LoopLogger(10, len(examples), True)):
        p = run_random_walk(transition_matrix, int(u), 10).todense()
        for b in examples[u]:
            examples[u][b] = p[0, int(b)]

    util.write_json(examples, './data/' + data_dir +
                    ('/weighted_random_walks.json' if weight_edges else '/random_walks.json'))
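# Toy version of the transition-matrix construction above: row-normalizing the
# adjacency matrix so each row sums to 1 (illustrative; sparse-matrix API
# details vary across networkx/scipy versions).
import networkx as nx
from scipy import sparse
G = nx.Graph([(0, 1), (1, 2), (2, 0), (2, 3)])
A = nx.adjacency_matrix(G)
D_inv = sparse.diags([[1.0 / A.getrow(i).sum() for i in range(A.shape[0])]], [0])
T = D_inv.dot(A)
print(T.sum(axis=1))  # every row sums to 1.0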
def set_cli_defaults(code_root, cli_config, install_config):
    """Write install-time configuration options to the cli.jsonc file used
    to set run-time default values.
    """
    def _set_cli_default(template, name, default):
        template[name] = default

    in_path = os.path.join(code_root, cli_config['config_in'])
    out_path = os.path.join(code_root, cli_config['config_out'])
    print("Writing default settings to {}".format(out_path))
    try:
        cli_template = util.read_json(in_path)
    except Exception as exc:
        fatal_exception_handler(exc, "ERROR: Couldn't read {}.".format(in_path))
    for key in cli_config['default_keys']:
        try:
            _set_cli_default(cli_template, key, install_config[key])
        except Exception as exc:
            fatal_exception_handler(exc, "ERROR: {} not set".format(key))
    if os.path.exists(out_path):
        print("{} exists; overwriting".format(out_path))
        os.remove(out_path)
    try:
        util.write_json(cli_template, out_path, sort_keys=False)
    except Exception as exc:
        fatal_exception_handler(exc, "ERROR: Couldn't write {}.".format(out_path))
def svd_user_business(data_dir, k=50):
    print "Loading data and building user-business matrix..."
    users = util.load_json('./data/' + data_dir + '/user.json').keys()
    businesses = util.load_json('./data/' + data_dir + '/business.json').keys()
    examples = util.load_json('./data/' + data_dir + '/examples.json')

    user_to_row = dict(zip(users, range(len(users))))
    business_to_column = dict(zip(businesses, range(len(businesses))))
    user_business_matrix = sparse.lil_matrix((len(users), len(businesses)), dtype=float)
    with open('./data/' + data_dir + '/graph.txt') as f:
        for line in f:
            u, b = line.split()
            user_business_matrix[user_to_row[u], business_to_column[b]] = 1
    user_business_matrix = sparse.csr_matrix(user_business_matrix)

    print "Computing singular value decomposition..."
    u, s, vt = sparse.linalg.svds(user_business_matrix, k=k)
    us = u * s

    print "Writing results..."
    for u in examples:
        for b in examples[u]:
            examples[u][b] = np.dot(us[user_to_row[u], :], vt[:, business_to_column[b]])
    util.write_json(examples, './data/' + data_dir + '/svd.json')
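# Minimal illustration of the scoring above: after a truncated SVD
# A ~ U S V^T, the (u, b) score is the dot product of row u of U*S with
# column b of V^T; with k equal to the rank, the product recovers A exactly.
import numpy as np
from scipy import sparse
A = sparse.csr_matrix(np.array([[1., 0., 1.],
                                [0., 1., 1.]]))
u, s, vt = sparse.linalg.svds(A, k=1)
score = np.dot((u * s)[0, :], vt[:, 2])  # rank-1 approximation of A[0, 2]
print(score)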
def save_current_version(modules_iter):
    current_version = {}
    for folder, conf in modules_iter:
        HEAD = git_command(folder, ["rev-parse", "HEAD"])
        HEAD = HEAD.strip()
        current_version[conf["name"]] = HEAD
    write_json(".version.snapshot.json", current_version)
def main(args):
    data_file, result_file, csv_file = args[0], args[1], args[2]
    data = util.load_json(data_file)
    result = count_to_freq(data)
    util.write_json(result_file, result)
    # save .csv file
    text = make_csv(result['data'])
    util.write_replace(csv_file, text.encode('utf-8'))
def _download_old_quizzes(course, item, path):
    """
    Download old version in-video quizzes.
    """
    url = '{}/admin/quiz/quiz_load?quiz_id={}'
    url = url.format(course.get_url(), item['quiz']['parent_id'])
    util.download(url, path, course.get_cookie_file())
    # Parse and re-serialize the downloaded file to normalize its JSON formatting.
    util.write_json(path, util.read_json(path))
def uplift(repo_dir, gaia_url, requirements):
    # Setup stuff
    t = util.time_start()
    print "Updating Gaia"
    git.create_gaia(repo_dir, gaia_url)  # This is sadly broken
    print "Created Gaia in %0.2f seconds" % util.time_end(t)

    # Determining what needs to be uplifted
    with_commits = {}
    for bug_id in requirements.keys():
        if requirements[bug_id].has_key('commits'):
            with_commits[bug_id] = requirements[bug_id]

    ordered_commits = order_commits(repo_dir, with_commits)

    uplift = dict([(x, {}) for x in ordered_commits])

    # Uplifting
    for commit in ordered_commits:
        needed_on = []
        for bug_id in with_commits.keys():
            if commit in with_commits[bug_id]['commits']:
                for i in with_commits[bug_id]['needed_on']:
                    if not i in needed_on:
                        needed_on.append(i)
        print "\n", "=" * 80
        print "Attempting to uplift %s commit to %s" % (commit, util.e_join(needed_on))
        uplift[commit]['needed_on'] = needed_on
        result = uplift_commit(repo_dir, commit, needed_on)
        print "Success on %s" % util.e_join(result['success'].keys())
        print "Failure on %s" % util.e_join(result['failure'])
        uplift[commit]['uplift_status'] = result

    uplift_report = copy.deepcopy(with_commits)

    # Determining which commits belong to which bugs
    for bug_id in uplift_report.keys():
        successful_branches = []
        failed_branches = []
        for commit in git.sort_commits(repo_dir, uplift_report[bug_id]['commits'], 'master'):
            if commit in uplift.keys():
                if not uplift_report[bug_id].has_key('uplift_status'):
                    uplift_report[bug_id]['uplift_status'] = {}
                u = uplift_report[bug_id]['uplift_status']
                u[commit] = copy.deepcopy(uplift[commit]['uplift_status'])
                failed_branches.extend([x for x in u[commit]['failure'] if x not in failed_branches])
                successful_branches.extend([x for x in u[commit]['success'].keys() if x not in successful_branches])
        # Because we might have multiple commits, we want to make sure that the
        # list of successful branches includes only those with *no* failing uplifts
        for i in range(len(successful_branches) - 1, -1, -1):
            if successful_branches[i] in failed_branches:
                del successful_branches[i]
        uplift_report[bug_id]['flags_to_set'] = branch_logic.flags_to_set(successful_branches)

    util.write_json(uplift_dated_file, uplift_report)
    util.write_json(uplift_report_file, uplift_report)
    return uplift_report
def save_result(filename, data):
    o = {
        '_last_update': last_update(),
        '_meta': {
            'char': len(data),
            'count': sum(data.values()),
        },
        'data': data,
    }
    util.write_json(filename, o)
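# Quick illustration of the envelope written above (the timestamp value is a
# stand-in; last_update() and util.write_json are assumed, not shown here):
import json
data = {'a': 2, 'b': 3}
o = {'_last_update': '2014-01-01',
     '_meta': {'char': len(data), 'count': sum(data.values())},
     'data': data}
print(json.dumps(o))  # {"_last_update": "2014-01-01", "_meta": {"char": 2, "count": 5}, "data": ...}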
def _backup_config_file(self, config):
    """Record settings in file variab_dir/config_save.json for rerunning."""
    out_file = os.path.join(self.MODEL_WK_DIR, 'config_save.json')
    if not self.file_overwrite:
        out_file, _ = util_mdtf.bump_version(out_file)
    elif os.path.exists(out_file):
        print('Overwriting {}.'.format(out_file))
    util.write_json(config.config.toDict(), out_file)
    return out_file
def create_package(folder, config, table, tag=None, github=True):
    """
    Creates 'package.json' based on 'module.json'.
    """
    replace_deps(config, table, "dependencies", tag, github)
    replace_deps(config, table, "devDependencies", tag, github)
    filename = os.path.join(folder, "package.json")
    print("Writing %s" % (filename))
    write_json(filename, config)
def get(self):
    """GET handler for gallery albums.

    URL pattern: /albums

    Returns 200 OK with JSON data structure containing list of albums.
    Returns Content-type: application/json.
    Returns 401 UNAUTHORIZED to all calls if authorization fails.
    """
    write_json(self, [album.to_dict() for album in Album.all()])
def train_test(X_train, y_train, X_test, e_test, vectorizer):
    print "Training..."
    clf = GradientBoostingClassifier(n_estimators=2000, max_depth=4)
    clf.fit(X_train, y_train)

    print "Testing..."
    probas = clf.predict_proba(X_test)[:, 1]
    scores = defaultdict(dict)
    for (u, b), p in zip(e_test, probas):
        scores[u][b] = p
    util.write_json(scores, './data/test/supervised_classifier.json')
def create_branch(root_dir, config, branch_name):
    for m in config["modules"]:
        folder = os.path.join(root_dir, m["folder"])
        argv = ["branch", branch_name]
        git_command(folder, argv)
        argv = ["checkout", branch_name]
        git_command(folder, argv)
    for m in config["modules"]:
        m["branch"] = branch_name
    write_json("project.json", config)
def supervised_methods(methods):
    num_features = 14
    # train dates for make dataset: 2012-01-01 to 2012-07-01;
    # for examples it is 2011-07-01
    train = build_features(
        "./data/train/user.json", "./data/train/business.json",
        "./data/train/examples.json", "./data/train/graph.txt",
        "./data/train/user_adamic_adar.json", "./data/train/biz_adamic_adar.json",
        "./data/train/user_cn.json", "./data/train/biz_cn.json",
        "./data/train/user_jaccard.json", "./data/train/biz_jaccard.json",
        num_features, datetime.date(2011, 7, 1))
    # test dates for the make dataset: 2013-01-01 to 2013-07-01;
    # for examples it is 2012-07-01
    test = build_features(
        "./data/test/user.json", "./data/test/business.json",
        "./data/test/examples.json", "./data/test/graph.txt",
        "./data/test/user_adamic_adar.json", "./data/test/biz_adamic_adar.json",
        "./data/test/user_cn.json", "./data/test/biz_cn.json",
        "./data/test/user_jaccard.json", "./data/test/biz_jaccard.json",
        num_features, datetime.date(2012, 7, 1), True)

    for method in methods:
        if method == "RandomForest":
            clf = RandomForestClassifier(n_estimators=100,
                                         max_features=num_features,
                                         oob_score=True)
        elif method == "GBM":
            clf = GradientBoostingClassifier(n_estimators=100,
                                             max_features=num_features)
        else:
            continue
        clf = clf.fit(train["features"], train["target"])
        probs = clf.predict_proba(test["features"])
        prob_json = test["probs"]
        for u in prob_json:
            for b in prob_json[u]:
                prob_json[u][b] = float(probs[prob_json[u][b]][1])
        util.write_json(prob_json, "./data/results/" + method + ".json")
        with open("./data/results/" + method + "_scores.txt", "w") as f:
            f.write("===feat. importance===" + str(clf.feature_importances_) + "\n")
def export_metadata_json(T, path, prefix, indent):
    print("Writing out metaprocess")
    meta_json = {}

    meta_json["virus_count"] = T.count_terminals()
    from datetime import date
    meta_json["updated"] = date.today().strftime('%Y-%m-%d')
    meta_json["author_info"] = {}
    meta_json["seq_author_map"] = {}

    # join up config color options with those in the input JSONs.
    col_opts = process.config["auspice"]["color_options"]
    if process.colors:
        for trait, col in process.colors.iteritems():
            if trait in col_opts:
                col_opts[trait]["color_map"] = col
            else:
                process.log.warn(
                    "{} in colors (input JSON) but not auspice/color_options. Ignoring"
                    .format(trait))

    meta_json["color_options"] = col_opts
    if "date_range" in process.config["auspice"]:
        meta_json["date_range"] = process.config["auspice"]["date_range"]
    if "analysisSlider" in process.config["auspice"]:
        meta_json["analysisSlider"] = process.config["auspice"]["analysisSlider"]
    meta_json["panels"] = process.config["auspice"]["panels"]
    # e.g. "3 Apr 2018": the X-prefix trick strips the leading zero from the
    # day, since strftime has no portable flag for that.
    meta_json["updated"] = time.strftime("X%d %b %Y").replace('X0', 'X').replace('X', '')
    meta_json["title"] = process.info["title"]
    meta_json["maintainer"] = process.info["maintainer"]
    meta_json["filters"] = process.info["auspice_filters"]

    if "defaults" in process.config["auspice"]:
        meta_json["defaults"] = process.config["auspice"]["defaults"]

    try:
        from pygit2 import Repository, discover_repository
        current_working_directory = os.getcwd()
        repository_path = discover_repository(current_working_directory)
        repo = Repository(repository_path)
        commit_id = repo[repo.head.target].id
        meta_json["commit"] = str(commit_id)
    except ImportError:
        meta_json["commit"] = "unknown"

    if len(process.config["auspice"]["controls"]):
        meta_json["controls"] = process.make_control_json(
            process.config["auspice"]["controls"])

    meta_json["geo"] = process.lat_longs
    write_json(meta_json, prefix + '_meta.json')
def run_random_walks(data_dir, weight_edges=False):
    print("Loading data and building transition matrix...")
    examples = util.load_json('./data/' + data_dir + '/oag_examples_simple.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)

    # Get all nodes, but not the edges (those need to be predicted)
    with open('./data/nid_to_id.txt', 'r') as file:
        for line in file:
            keys = line.split()
            nid = int(keys[0])  # graph nodes are ints (nodetype=int above)
            if nid not in G:
                G.add_node(nid)

    # Real id to substitute id
    #id_map = {}
    #count = 0
    #for n in G:
    #    id_map[n] = count
    #    count += 1

    #if weight_edges:
    #    reviews = util.load_json('./data/' + data_dir + '/review.json')
    #    end_date = datetime.date(2012, 1, 1) if data_dir == 'train' else datetime.date(2013, 1, 1)
    #    edges = G.edges()
    #    for e in util.logged_loop(edges, util.LoopLogger(20000, len(edges), True)):
    #        n1, n2 = str(e[0]), str(e[1])
    #        if n1 not in reviews or n2 not in reviews[n1]:
    #            n1, n2 = n2, n1
    #        G[e[0]][e[1]]['weight'] = 1.0 / ((end_date - get_date(reviews[n1][n2][0])).days + 90)
    #    del reviews  # save some memory

    adjacency_matrix = nx.adjacency_matrix(G)
    inverse_degree_matrix = sparse.diags([[
        1.0 / adjacency_matrix.getrow(i).sum()
        for i in range(adjacency_matrix.shape[0])
    ]], [0])
    transition_matrix = inverse_degree_matrix.dot(adjacency_matrix)

    print("Running random walks...")
    for u in util.logged_loop(examples, util.LoopLogger(10, len(examples), True)):
        p = run_random_walk(transition_matrix, int(u), 10).todense()  # row for adj matrix
        for b in examples[u]:
            examples[u][b] = p[0, int(b)]

    util.write_json(
        examples, './data/' + data_dir +
        ('/oag_weighted_random_walks.json' if weight_edges else '/oag_random_walks.json'))
def train():
    phi = get_phi(True)

    print "Loading examples..."
    Ds, Ls = {}, {}
    examples = util.load_json('./data/train/examples.json')
    us = list(examples.keys())
    random.seed(0)
    random.shuffle(us)
    for u in us:
        D, L = set(), set()
        for b in examples[u]:
            (D if examples[u][b] == 1 else L).add(int(b))
        if len(D) > MAX_POSITIVE_EDGES_PER_USER:
            D = random.sample(D, MAX_POSITIVE_EDGES_PER_USER)
        if len(L) > MAX_NEGATIVE_EDGES_PER_USER:
            # Fixed: the original sampled with MAX_POSITIVE_EDGES_PER_USER here.
            L = random.sample(L, MAX_NEGATIVE_EDGES_PER_USER)
        if len(D) > 1 and len(L) > 10:
            Ds[int(u)] = list(D)
            Ls[int(u)] = list(L)
        if len(Ds) > NUM_TRAIN_USERS:
            break

    print "Setting initial conditions..."
    ps = {}
    for u in Ds:
        p = np.zeros(phi['bias'].shape[0])
        p[u] = 1.0
        ps[u] = sparse.csr_matrix(p)

    print "Training..."
    w = INITIAL_WEIGHTS
    best_loss = 100000
    for i in range(100):
        print "ITERATION " + str(i + 1) + ": base"
        base_loss, ps = run(phi, w, Ds, Ls, ps)
        if base_loss < best_loss:
            best_loss = base_loss
            util.write_json(w, './data/supervised_random_walks_weights.json')
        partials = {}
        for k in w:
            print "ITERATION " + str(i + 1) + ": " + k
            new_w = w.copy()
            new_w[k] += H
            # one-sided finite-difference estimate of dL/dw_k
            new_loss, _ = run(phi, new_w, Ds, Ls, ps)
            partials[k] = (new_loss - base_loss) / H
            print partials[k] * LEARNING_RATE
        for (k, dwk) in partials.iteritems():
            w[k] -= LEARNING_RATE * dwk
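# The gradient above is a one-sided finite difference:
# dL/dw_k ~ (L(w + H*e_k) - L(w)) / H. Tiny self-contained check
# (illustrative function, not the walk loss):
H = 1e-6
def f(w): return w * w
fd = (f(3.0 + H) - f(3.0)) / H
print(fd)  # ~ 6.0, the true derivative of w**2 at w = 3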
def download_info(self):
    url = self.url
    temp = self.info_folder + '/temp.html'
    util.download(url, temp, self.cookie_file)
    page_html = util.read_file(temp)
    util.remove(temp)

    info_files = ['user.json', 'course.json', 'sidebar.json']
    matches = re.findall(r'JSON\.parse\("(.*?)"\);', page_html)
    for match, info_file in zip(matches, info_files)[1:]:
        info = util.unicode_unescape(match).replace('\\\\', '')
        path = '{}/{}'.format(self.info_folder, info_file)
        util.write_json(path, util.read_json(info, True))
def predict():
    res_lst = read_json(test_select_path)
    for item in res_lst:
        item['answer'] = get_ans(item['question'], item['label'],
                                 item['answer_sentence'][0])
    res_lst.sort(key=lambda val: val['qid'])  # sort results by qid ascending; this line can be commented out
    write_json(test_span_path, res_lst)
    write_json(test_answer_path, [{
        'qid': item['qid'],
        'question': ''.join(item['question']),
        'answer_pid': [item['pid']],
        'answer': item['answer']
    } for item in res_lst])
def write_ofac_id_matches(infile, outfile):
    table = str.maketrans(dict.fromkeys(string.punctuation + '\n'))
    data = {}  # { sdn_id : ofac_website_id }
    ofac_name_to_id = {}  # { ofac_name : ofac_website_id }
    with open(infile) as f:
        for line in f:
            ofac_id, name = line.split('|')
            name = name.lower().translate(table).strip()
            ofac_name_to_id[name] = ofac_id
    ofac_names = ofac_name_to_id.keys()

    entries = get_names_from_elastic()
    num_not_found = 0
    for entry in entries:
        sdn_id = entry['_id']
        name = entry['_source']['primary_display_name'].lower().translate(table).strip()
        try:
            best_match = get_close_matches(name, ofac_names, n=1, cutoff=1.0)[0]
            ofac_website_id = ofac_name_to_id[best_match]
            data[sdn_id] = ofac_website_id
        except:
            # Try to transpose the words in a name and search for them
            found = False
            new = name
            for _ in range(name.count(' ')):
                first, space, last = new.partition(' ')
                new = last + ' ' + first
                try:
                    best_match = get_close_matches(new, ofac_names, n=1, cutoff=1.0)[0]
                    ofac_website_id = ofac_name_to_id[best_match]
                    data[sdn_id] = ofac_website_id
                    found = True
                    break
                except:
                    pass
            if not found:
                num_not_found += 1
    log(f'{num_not_found} IDs were unable to be matched to their OFAC website counterpart',
        'warning')
    util.write_json(outfile, data)
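# How the transposition retry above behaves (self-contained; toy names, not
# the OFAC list): with cutoff=1.0, difflib.get_close_matches effectively
# requires an exact match, so rotating "doe john" back to "john doe" is what
# recovers the hit.
from difflib import get_close_matches
names = ['john doe', 'jane roe']
print(get_close_matches('doe john', names, n=1, cutoff=1.0))  # []
first, _, last = 'doe john'.partition(' ')
print(get_close_matches(last + ' ' + first, names, n=1, cutoff=1.0))  # ['john doe']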
def post(self, album_id):
    """POST handler for a gallery image.

    URL pattern: /albums/${album_id}/images

    POST data must be of type multipart/form and contain image as 'file'.
    POST data must also contain image metadata: 'name'.
    Image filename must include an extension.

    Returns 201 CREATED with JSON data structure describing new image.
    Returns Content-type: application/json.
    Also returns Location header pointing to API URL for image details.
    Include 'wrapjson' parameter in POST to wrap returned JSON in a
    <textarea>. This also changes the returned Content-type to text/html.
    If album doesn't exist, returns 404 NOT FOUND.
    If request is poorly formatted returns 400 BAD REQUEST.
    Returns 401 UNAUTHORIZED to all calls if authorization fails.
    """
    q = Album.all().filter('album_id =', album_id)
    album = q.get()
    if not album:
        return self.error(404)
    try:
        data = dict(((str(k), v) for k, v in self.request.POST.items()))
        if 'file' in data:
            data['extension'] = data['file'].filename.split('.')[-1].lower()
            if data['extension'] == data['file'].filename:
                data['extension'] = ''
            else:
                data['extension'] = '.' + data['extension']
            data['image_data'] = data['file'].file.read()
        image = Image(image_id=config.IMAGE_ID_GENERATOR(), album=album, **data)
    except:
        data = {}
        self.error(400)
    else:
        if not config.DEMO_MODE:
            image.put()
        data = image.to_dict()
        self.response.headers['Location'] = data['url']
        self.response.set_status(201)
    write_json(self, data, wrapjson='wrapjson' in self.request.POST)
def svd(data_dir, k=50):
    print "Loading data and building adjacency matrix..."
    examples = util.load_json('./data/' + data_dir + '/examples.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)
    adjacency_matrix = sparse.csr_matrix(nx.adjacency_matrix(G), dtype=float)

    print "Computing singular value decomposition..."
    u, s, vt = sparse.linalg.svds(adjacency_matrix, k=k)
    us = u * s

    print "Writing results..."
    for u in examples:
        for b in examples[u]:
            # JSON keys are strings; convert to ints before indexing the arrays
            examples[u][b] = np.dot(us[int(u), :], vt[:, int(b)])
    util.write_json(examples, './data/' + data_dir + '/svd.json')
def run_minnpm(self, pkg_info):
    start_time = time.time()
    # Compute paths before the try block so the exception handlers can always
    # write error.json (they previously raised NameError when the unpack failed).
    (tgz, pkg_target, mode_configuration) = pkg_info
    pkg_path = f'{pkg_target}/package'
    output_path = f'{pkg_path}/experiment.out'
    output_status_path = f'{pkg_path}/experiment.json'
    error_status_path = f'{pkg_path}/error.json'
    try:
        self.unpack_tarball_if_needed(tgz, pkg_target)
        with open(output_path, 'wt') as out:
            exit_code = subprocess.Popen(solve_command(mode_configuration),
                                         cwd=pkg_path,
                                         stdout=out,
                                         stderr=out).wait(self.timeout)
        duration = time.time() - start_time
        if exit_code == 0:
            write_json(output_status_path, {
                'status': 'success',
                'time': duration
            })
            return None
        status = self.get_npmstatus(output_path)
        if status in ['ERESOLVE', 'ETARGET', 'EUNSUPPORTEDPROTOCOL', 'EBADPLATFORM']:
            # TODO(arjun): This is for compatibility with older data. If
            # we do a totally fresh run, can refactor to stick reason into
            # status and remove the 'cannot_install' status.
            write_json(output_status_path, {
                'status': 'cannot_install',
                'reason': status
            })
            return None
        write_json(error_status_path, {
            'status': 'unexpected',
            'detail': output_path
        })
        return f'Failed: {pkg_path}'
    except subprocess.TimeoutExpired:
        write_json(error_status_path, {'status': 'timeout'})
        return f'Timeout: {pkg_path}'
    except BaseException as e:
        write_json(error_status_path, {
            'status': 'unexpected',
            'detail': e.__str__()
        })
        return f'Exception: {pkg_path} {e}'
def test():
    phi = get_phi(False)
    examples = util.load_json("./data/test/examples.json")
    w = util.load_json("./data/supervised_random_walks_weights.json")

    print "Computing Q and initializing..."
    Q = get_Q(phi, w)
    ps = {}
    for u in examples:
        p = np.zeros(phi["bias"].shape[0])
        p[int(u)] = 1.0
        ps[int(u)] = sparse.csr_matrix(p)
    get_ps(Q, ps, max_iter=20, convergence_criteria=0, log=True, examples=examples)

    print "Writing..."
    util.write_json(examples, "./data/test/supervised_random_walks.json")
def get(self, album_id, image_id, extension=None):
    """GET handler for GGB image metadata and files.

    URL pattern: /albums/${album_id}/images/${image_id}(${extension})

    If called without a file extension:
    If image exists, returns 200 OK with JSON image data structure.
    Returns Content-type: application/json.
    If image doesn't exist, returns 404 NOT FOUND.

    If called with a file extension:
    If image exists and has the matching extension, returns the image.
    Returned Content-type matches the image format.
    Otherwise returns 404 NOT FOUND.

    Returns 401 UNAUTHORIZED to all calls if authorization fails.
    """
    q = Album.all().filter('album_id =', album_id)
    album = q.get()
    if not album:
        return self.error(404)
    q = Image.all().filter('album =', album).filter('image_id =', image_id)
    image = q.get()
    if not image:
        return self.error(404)
    if not extension:
        return write_json(self, image.to_dict())
    if extension != image.extension:
        return self.error(404)
    write_image(self, image.image_data, image.extension)
def predict(similarity_func=calc_inner_product):
    # Predict on the test set; vsm_init() must have been run before calling
    # this function.
    if similarity_func.__name__ not in [
            calc_cosine.__name__, calc_inner_product.__name__,
            calc_jaccard.__name__
    ]:
        print('Invalid similarity function...')
        return
    test_lst = read_json(test_path)
    for q_item in test_lst:
        q_item['question'] = seg_line(q_item['question'])  # word segmentation
        q_item['pid'] = int(
            similarity_func(
                {word: idf.get(word, 0) for word in q_item['question']})[0][0])
    write_json(test_predict_path, test_lst)
def get_nutrient_df(index):
    filename = "ingredient_{}.csv".format(str(index))
    ing_lst = pd.read_csv(filename).key.to_list()
    appid, key = ID[index % 5]
    headers = {
        "Content-Type": "application/json",
        "x-app-id": appid,
        "x-app-key": key,
        "x-remote-user-id": '0'
    }
    ing_dict = dict()
    for ing in ing_lst:
        ing_dict[ing] = get_nutrient(ing, headers)
    output = str(index) + ".json"
    util.write_json(ing_dict, output)
def read_pokemonid():
    # data -> http://www.pokemon.com/us/pokedex/
    list_pokemons = []
    pokemons = {}
    with open("id.txt", 'r', encoding='utf-8') as datafile:
        for line in datafile:
            if line != '\n':
                temp = line.replace('\n', '').split(' ')
                list_pokemons.append([temp[0], int(temp[1])])
    for i, pokemon in enumerate(list_pokemons):
        pokemons[i + 1] = pokemon[0]
    util.write_json(pokemons, 'id.json')
def push(repo_dir):
    branches = c.read_value('repository.enabled_branches')
    preview_push_info = git.push(repo_dir, remote="origin", branches=branches, dry_run=True)
    print "If you push, you'd be pushing: "
    _display_push_info(preview_push_info)
    if not util.ask_yn('Do you wish to push?'):
        return None
    for i in range(5):
        try:
            rv = git.push(repo_dir, remote="origin", branches=branches, dry_run=False)
            util.write_json(push_info_file, rv)
            print "Push attempt %d worked" % int(i + 1)
            return rv
        except:
            print "Push attempt %d failed" % int(i + 1)
    # Raise only after all retries are exhausted; the original referenced an
    # undefined `remote` variable here.
    raise git.PushFailure("remote %s branches %s" % ("origin", util.e_join(branches)))
def for_all_bugs(repo_dir, requirements, upstream="master"):
    # Let's see if we have any commits in the req file.
    any_bug_has_commits = False
    bugs_without_commits = []
    for bug_id in requirements:
        if requirements[bug_id].has_key('commits'):
            if len(requirements[bug_id]['commits']) > 0:
                any_bug_has_commits = True
            else:
                bugs_without_commits.append(bug_id)

    if any_bug_has_commits:
        print "Some bugs in this requirements file already have commits."
        # 'reuse': use the existing commits, don't ask for more.
        # 'add': use the existing commits for bugs that have none, ignore the others.
        # 'delete': remove the commits from the requirements dictionary.
        prompt = "Enter 'reuse', 'add' or 'delete': "
        user_input = raw_input(prompt).strip()
        while user_input not in ('reuse', 'add', 'delete'):
            user_input = raw_input(prompt).strip()
        if user_input == 'reuse':
            bugs_to_find = []  # just use what's in the file
        elif user_input == 'add':
            bugs_to_find = bugs_without_commits  # Only ask for commits for commit-less bugs
        elif user_input == 'delete':
            # Delete the commits that are in the requirements file
            for bug_id in requirements.keys():
                if requirements[bug_id].has_key('commits'):
                    del requirements[bug_id]['commits']
            util.write_json(uplift.requirements_file, requirements)
            bugs_to_find = requirements.keys()
        else:
            raise Exception("Huh?")
    else:
        bugs_to_find = requirements.keys()

    pruned_bugs_to_find = [x for x in bugs_to_find if not uplift.is_skipable(x)]

    j = 0
    for bug_id in sorted(pruned_bugs_to_find):
        j += 1
        print "=" * 80
        print "Bug %d of %d" % (j, len(pruned_bugs_to_find))
        bug = bzapi.fetch_complete_bug(bug_id, cache_ok=True)
        requirements[bug_id]['commits'] = for_one_bug(repo_dir, bug_id, bug, upstream)
        util.write_json(uplift.requirements_file, requirements)
    return requirements
def get(self, album_id):
    """GET handler for a particular gallery album.

    URL pattern: /albums/${album_id}

    If album exists, returns 200 OK with JSON album data structure.
    Returns Content-type: application/json.
    If album doesn't exist, returns 404 NOT FOUND.
    Returns 401 UNAUTHORIZED to all calls if authorization fails.
    """
    q = Album.all().filter('album_id =', album_id)
    album = q.get()
    if not album:
        return self.error(404)
    write_json(self, album.to_dict())
def download(course, item):
    """
    Download peer-grading JSON.

    :param course: A Course object.
    :param item: This JSON item is written directly to the saved file.
    :return: None.
    """
    path = "{}/peer_assessment/{}.json"
    path = path.format(course.get_folder(), item["item_id"])

    util.make_folder(path, True)
    util.write_json(path, item)

    content = util.read_file(path)
    content = util.remove_coursera_bad_formats(content)
    util.write_file(path, content)
def increment_version(folder, config, level):
    # - iterate through all projects
    # - read the VERSION file and compare to previous version (given)
    # - if equal, automatically increase the version (on the given level)
    # - overwrite the VERSION
    if "version" not in config:
        print("Could not find 'version' in config of %s" % folder)
        return None
    version = SemanticVersion(config["version"])
    version.increment(level)
    config["version"] = version.str()
    print("Writing new version: %s" % (version.str()))
    write_json(module_file(folder), config)
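# SemanticVersion is project-specific; a hypothetical minimal stand-in with the
# same interface (constructor from a string, increment(level), str()) might
# behave like this -- the 'major'/'minor'/'patch' level names are assumptions:
class SemanticVersionSketch(object):
    def __init__(self, s):
        self.parts = [int(x) for x in s.split('.')]

    def increment(self, level):
        i = {'major': 0, 'minor': 1, 'patch': 2}[level]
        self.parts[i] += 1
        # reset every component below the incremented level
        self.parts[i + 1:] = [0] * (len(self.parts) - i - 1)

    def str(self):
        return '.'.join(str(x) for x in self.parts)

v = SemanticVersionSketch('1.2.3')
v.increment('minor')
print(v.str())  # 1.3.0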
def get_save_map_info():
    min_lat = min_lon = MAX_NUM
    max_lat = max_lon = -MAX_NUM
    for item in Ways_dict.values():
        for node in item[u"nodes"]:
            tmp_lat = float(node[u"lat"])
            tmp_lon = float(node[u"lon"])
            min_lat = min(min_lat, tmp_lat)
            min_lon = min(min_lon, tmp_lon)
            max_lat = max(max_lat, tmp_lat)
            max_lon = max(max_lon, tmp_lon)
    num_lat = int(math.ceil((max_lat - min_lat) / STEP) + 0.1)
    num_lon = int(math.ceil((max_lon - min_lon) / STEP) + 0.1)
    num_grids = num_lat * num_lon
    map_info = min_lat, max_lat, min_lon, max_lon, num_lat, num_lon, num_grids
    util.write_json(MAP_INFO, INTER_DATA_DIR, map_info)
    return min_lat, max_lat, min_lon, max_lon, num_lat, num_lon, num_grids
def shrinkwrap(self, args=None):
    print("Creating npm-shrinkwrap.json...")
    npm_shrinkwrap(self.root_dir)
    shrinkwrap_file = os.path.join(self.root_dir, "npm-shrinkwrap.json")
    shrinkwrap_conf = read_json(shrinkwrap_file)
    project_config = self.get_project_config()
    deps = shrinkwrap_conf['dependencies']
    devDeps = shrinkwrap_conf['devDependencies'] if 'devDependencies' in shrinkwrap_conf else {}
    for m, __, conf in iterate_modules(self.root_dir, project_config):
        name = conf['name']
        repo = "git+%s#%s" % (m['repository'], m['branch'])
        if name in deps:
            deps[name]['from'] = repo
        if name in devDeps:
            # Fixed: the original updated deps[name] here instead of devDeps[name].
            devDeps[name]['from'] = repo
    write_json(shrinkwrap_file, shrinkwrap_conf)
def make_dataset(t1, t2, out_dir):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # we need to map the ids in the yelp data to ints since snap only allows
    # ints as node ids
    id_to_nid = KeyToInt()

    print "Building set of nodes..."
    nids = set()
    for review in reviews_iterator():
        if get_date(review) < t1:
            nids.add(id_to_nid['u' + review['user_id']])
            nids.add(id_to_nid['b' + review['business_id']])

    print "Building user data..."
    write_node_data(lambda user_data: id_to_nid['u' + user_data['user_id']],
                    nids, './data/provided/yelp_academic_dataset_user.json',
                    out_dir + 'user.json')
    print "Building business data..."
    write_node_data(lambda business_data: id_to_nid['b' + business_data['business_id']],
                    nids, './data/provided/yelp_academic_dataset_business.json',
                    out_dir + 'business.json')

    print "Building graph..."
    with open(out_dir + 'graph.txt', 'w') as graph, \
            open(out_dir + 'new_edges.txt', 'w') as new_edges:
        review_data = defaultdict(lambda: defaultdict(list))
        for review in reviews_iterator():
            user_key = id_to_nid['u' + review['user_id']]
            business_key = id_to_nid['b' + review['business_id']]
            if user_key in nids and business_key in nids:
                date = get_date(review)
                if date < t1:
                    review_data[user_key][business_key].append(review)
                    graph.write("{:} {:}\n".format(user_key, business_key))
                elif date < t2:
                    new_edges.write("{:} {:}\n".format(user_key, business_key))
        for u in review_data:
            for b in review_data[u]:
                review_data[u][b] = sorted(review_data[u][b], key=get_date, reverse=True)
        util.write_json(review_data, out_dir + "review.json")
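# KeyToInt is project-specific; a hypothetical minimal equivalent assigns the
# next integer the first time a key is seen (Python 2, matching the code above):
import itertools
from collections import defaultdict

id_to_nid = defaultdict(itertools.count().next)
print id_to_nid['u123'], id_to_nid['b456'], id_to_nid['u123']  # 0 1 0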