def docs(dataset_name):
    p = util.Progbar(target=util.lines_in_file(directories.RAW + dataset_name))
    for i, d in enumerate(util.load_json_lines(directories.RAW + dataset_name)):
        p.update(i + 1)
        yield d
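# Usage sketch (illustrative, not part of the original code): stream the raw
# "train" split one document at a time; docs() updates its own progress bar.
def _example_first_doc():
    for document in docs("train"):
        return document  # each item is one parsed JSON object from the raw file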
def write_feature_names():
    # Map each pair feature name (read from the first training document) to its
    # index and save the mapping as a pickle.
    util.write_pickle(
        {f: i for i, f in enumerate(
            next(util.load_json_lines(directories.RAW + 'train'))["pair_feature_names"])},
        directories.MISC + 'pair_feature_names.pkl')
def main():
    '''
    The integration test runs against the input file and compares against the
    expected output file, while writing its own results.txt output file.
    '''
    def process_item(data):
        db = CustomerDatabase()
        success, code = db.deposit(data["customer_id"], data["id"],
                                   data["time"], data["load_amount"])
        if code != 403:
            data = write_output(data["customer_id"], data["id"], success)
        else:
            data = dict(id=data["id"], customer_id=data["customer_id"], accepted=False)
        return data, code

    def write_output(customer_id, txn_id, status):
        data = dict(id=txn_id, customer_id=customer_id, accepted=status)
        logger.info(json.dumps(data))
        return data

    with open(RESULTS_FILE, 'w') as out:
        todos = load_json_lines(INPUT_FILE)
        answers = load_json_lines(OUTPUT_FILE)
        counter = 0
        bad_counter = 0
        assert len(todos) == 1000
        for to_do in todos:
            result, code = process_item(to_do)
            if code != 403:
                result["original_accepted"] = answers[counter]["accepted"]
                if result["accepted"] != answers[counter]["accepted"]:
                    logger.debug("FAIL")
                    bad_counter += 1
                else:
                    print("original =", result["original_accepted"],
                          "found=", result["accepted"])
                counter += 1
        print("MISSED ITEMS: ", bad_counter)
def load_gold(dataset_name):
    # Each line of the gold file maps a document id to its list of gold clusters.
    # Returns (doc id -> gold clusters, doc id -> {mention -> its gold cluster}).
    gold = {}
    mention_to_gold = {}
    for doc_gold in util.load_json_lines(directories.GOLD + dataset_name):
        did = int(doc_gold.keys()[0])
        gold[did] = doc_gold[str(did)]
        mention_to_gold[did] = {}
        for gold_cluster in doc_gold[str(did)]:
            for m in gold_cluster:
                mention_to_gold[did][m] = tuple(gold_cluster)
    return gold, mention_to_gold
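# Usage sketch (illustrative, not part of the original code): load the gold
# clusters for a split and look up the full cluster containing one mention.
# The "dev" split name is an assumption about which gold files exist.
def _example_lookup_gold():
    gold, mention_to_gold = load_gold("dev")
    for did in gold:
        for mention, cluster in mention_to_gold[did].items():
            return did, mention, cluster  # the gold cluster containing this mention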
def explore_pairwise_features():
    pos_sum, neg_sum = np.zeros(9), np.zeros(9)
    pos_count, neg_count = 0, 0
    for i, d in enumerate(util.load_json_lines(directories.RAW + "train")):
        for key in d["labels"].keys():
            if d["labels"][key] == 1:
                pos_sum += d["pair_features"][key]
                pos_count += 1
            else:
                neg_sum += d["pair_features"][key]
                neg_count += 1
        # running statistics after each document
        print "positive counts", list(pos_sum)
        print "negative counts", list(neg_sum)
        print "feature odds", list(
            np.divide(pos_sum / pos_count, (pos_sum / pos_count + neg_sum / neg_count)))
        print
def write_node_data(nid_f, nids, infile, outfile):
    # Keep only the JSON-lines records whose node id (extracted by nid_f) is in
    # nids, and write them to outfile keyed by that id.
    return util.write_json(
        {nid_f(datum): datum
         for datum in util.load_json_lines(infile) if nid_f(datum) in nids},
        outfile)
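# Usage sketch (illustrative): filter a node file down to a wanted id set, keyed
# by id. The "user_id" field and both paths are placeholders, not names taken
# from the original code.
def _example_filter_users(keep_ids):
    write_node_data(lambda datum: datum["user_id"], keep_ids,
                    './data/provided/yelp_academic_dataset_user.json',
                    './data/filtered_users.json')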
def reviews_iterator(path='./data/provided/yelp_academic_dataset_review.json'):
    return util.logged_loop(
        util.load_json_lines(path),
        util.LoopLogger(100000, util.lines_in_file(path), True))
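# Usage sketch (illustrative): iterate every review while the LoopLogger reports
# progress every 100,000 lines; assumes the default Yelp review path above.
def _example_count_reviews():
    return sum(1 for _ in reviews_iterator())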
def build_dataset(vectors, name, tune_fraction=0.0, reduced=False, columns=None):
    doc_vectors = util.load_pickle(directories.MISC + name.replace("_reduced", "") +
                                   "_document_vectors.pkl")

    main_pairs = PairDataBuilder(columns)
    tune_pairs = PairDataBuilder(columns)
    main_mentions = MentionDataBuilder(columns)
    tune_mentions = MentionDataBuilder(columns)
    main_docs = DocumentDataBuilder(columns)
    tune_docs = DocumentDataBuilder(columns)

    print "Building dataset", name
    p = util.Progbar(
        target=(2 if reduced else util.lines_in_file(directories.RAW + name)))
    for i, d in enumerate(util.load_json_lines(directories.RAW + name)):
        if reduced and i > 2:
            break
        p.update(i + 1)

        if reduced and tune_fraction != 0:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if i == 0 else (tune_pairs, tune_mentions, tune_docs)
        else:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if random.random() > tune_fraction else \
                (tune_pairs, tune_mentions, tune_docs)

        ms, ps = mentions.size(), pairs.size()
        mention_positions = {}
        for mention_num in sorted(d["mentions"].keys(), key=int):
            mention_positions[mention_num] = mentions.size()
            mentions.add_mention(d["mentions"][mention_num], vectors,
                                 doc_vectors[d["mentions"][mention_num]["doc_id"]])

        for key in sorted(d["labels"].keys(),
                          key=lambda k: (int(k.split()[1]), int(k.split()[0]))):
            k1, k2 = key.split()
            pairs.add_pair(d["labels"][key], mention_positions[k1], mention_positions[k2],
                           int(d["mentions"][k1]["doc_id"]),
                           int(d["mentions"][k1]["mention_id"]),
                           int(d["mentions"][k2]["mention_id"]),
                           d["pair_features"][key])

        me, pe = mentions.size(), pairs.size()
        docs.add_doc(ms, me, ps, pe, d["document_features"])

    suffix = ("_reduced" if reduced else "")
    if tune_mentions.size() > 0:
        tune_mentions.write(name + "_tune" + suffix)
        tune_pairs.write(name + "_tune" + suffix)
        tune_docs.write(name + "_tune" + suffix)
        main_mentions.write(name + "_train" + suffix)
        main_pairs.write(name + "_train" + suffix)
        main_docs.write(name + "_train" + suffix)
    else:
        main_mentions.write(name + suffix)
        main_pairs.write(name + suffix)
        main_docs.write(name + suffix)
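# Usage sketch (illustrative): `vectors` is assumed to be the word-vector store
# that MentionDataBuilder expects (built elsewhere in the pipeline), and 0.15 is
# just an example tune fraction.
# build_dataset(vectors, "train", tune_fraction=0.15)
# build_dataset(vectors, "dev")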