Example #1
def docs(dataset_name):
    p = util.Progbar(target=(util.lines_in_file(directories.RAW +
                                                dataset_name)))
    for i, d in enumerate(util.load_json_lines(directories.RAW +
                                               dataset_name)):
        p.update(i + 1)
        yield d
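All of these examples rely on helpers from a project-local util module whose
source is not shown here. A minimal sketch of what lines_in_file and
load_json_lines presumably do, inferred only from their names and call sites
(the real implementations may differ):

import json

def lines_in_file(path):
    """Count the number of lines in a text file."""
    with open(path) as f:
        return sum(1 for _ in f)

def load_json_lines(path):
    """Yield one parsed JSON object per line of a JSON-lines file."""
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)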
Example #2
    def __init__(self,
                 load=False,
                 vectors_file=directories.PRETRAINED_WORD_VECTORS,
                 keep_all_words=False):
        if load:
            self.vocabulary = util.load_pickle(directories.RELEVANT_VECTORS +
                                               'vocabulary.pkl')
            self.vectors = np.load(directories.RELEVANT_VECTORS +
                                   'word_vectors.npy')
            self.d = self.vectors.shape[1]
        else:
            self.vocabulary = {}
            self.vectors = []
            word_counts = util.load_pickle(directories.MISC +
                                           'word_counts.pkl')
            with open(vectors_file) as f:
                for line in f:
                    split = line.decode('utf8').split()
                    w = normalize(split[0])
                    if w not in self.vocabulary and (w == UNKNOWN_TOKEN
                                                     or w in word_counts
                                                     or keep_all_words):
                        vec = np.array(map(float, split[1:]), dtype='float32')
                        if not self.vectors:
                            self.d = vec.size
                            self.vectors.append(np.zeros(
                                self.d))  # reserve 0 for mask
                        self.vocabulary[w] = len(self.vectors)
                        self.vectors.append(vec)

            n_unknowns = len(
                [w for w in word_counts if w not in self.vocabulary])
            unknown_mass = sum(
                c for w, c in word_counts.iteritems()
                if c < ADD_WORD_THRESHOLD and w not in self.vocabulary)
            total_mass = sum(word_counts.values())
            print "Pretrained embedding size:", util.lines_in_file(
                vectors_file)
            print "Unknowns by mass: {:}/{:} = {:.2f}%%"\
                .format(unknown_mass, total_mass, 100 * unknown_mass / float(total_mass))
            print "Unknowns by count: {:}/{:} = {:.2f}%%"\
                .format(n_unkowns, len(word_counts), 100 * n_unkowns / float(len(word_counts)))

            # word_counts maps word -> count; add frequent corpus words that
            # are missing from the pretrained vectors, most frequent first
            for c, w in sorted([(c, w) for w, c in word_counts.iteritems()],
                               reverse=True):
                if w not in self.vocabulary and c > ADD_WORD_THRESHOLD:
                    print "Adding", w, "count =", c
                    self.add_vector(w)
            if UNKNOWN_TOKEN not in self.vocabulary:
                print "No presupplied unknown token"
                self.add_vector(UNKNOWN_TOKEN)
            self.add_vector(MISSING_TOKEN)
        self.unknown = self.vocabulary[UNKNOWN_TOKEN]
        self.missing = self.vocabulary[MISSING_TOKEN]
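The snippet above is the constructor of a word-embedding class whose name is
not shown; a hypothetical usage (the class name "Vectors" and the GloVe file
path are assumptions made here for illustration) could look like:

# Build the filtered vocabulary and embedding matrix from a pretrained file...
vectors = Vectors(vectors_file='glove.840B.300d.txt')  # hypothetical path
# ...or reload a previously saved vocabulary.pkl / word_vectors.npy pair.
vectors = Vectors(load=True)
unk_id = vectors.unknown  # row index reserved for UNKNOWN_TOKEN
dim = vectors.d           # embedding dimensionality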
Example #3
def print_dataset_stats(data_dir):
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)
    print "Num nodes:", G.GetNodes()
    print "Num edges:", G.GetEdges()

    n_users = len(util.load_json(data_dir + "user.json"))
    n_businesses = len(util.load_json(data_dir + "business.json"))
    n_edges = util.lines_in_file(data_dir + "new_edges.txt")
    print "({:} users) * ({:} businesses) = {:.3e} candidate edges".format(
        n_users, n_businesses, n_users * n_businesses)
    print "{:} edges, {:0.5f}% of candidate edges".format(n_edges, 100 * n_edges /
                                                          float(n_users * n_businesses))
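snap.LoadEdgeList(snap.PUNGraph, path, 0, 1) reads an undirected graph from a
plain whitespace-separated edge list, taking source and destination node ids
from columns 0 and 1. A hedged sketch of writing a compatible graph.txt, where
edges is a stand-in for whatever produces the (user, business) node id pairs:

with open(data_dir + 'graph.txt', 'w') as f:
    for src, dst in edges:  # edges: iterable of (node_id, node_id) int pairs
        f.write('{} {}\n'.format(src, dst))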
Example #4
    def do_post_update(self):
        # Remove duplicate lines from the destination file, preserving order.
        if util.lines_in_file(self.fn_dest) > 0:
            lines_seen = set()  # holds lines already seen
            unique_lines = []
            with open(self.fn_dest, "r") as infile:
                for line in infile:
                    if line not in lines_seen:  # not a duplicate
                        unique_lines.append(line)
                        lines_seen.add(line)

            with open(self.fn_dest, "w") as outfile:
                outfile.writelines(unique_lines)
Example #5
def reviews_iterator(path='./data/provided/yelp_academic_dataset_review.json'):
    return util.logged_loop(util.load_json_lines(path),
                            util.LoopLogger(100000, util.lines_in_file(path), True))
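A hypothetical caller of reviews_iterator; LoopLogger appears to log progress
every 100000 items against the file's total line count, though its exact
behavior is an assumption, as is the 'stars' field from the Yelp review schema:

positive = 0
for review in reviews_iterator():
    if review.get('stars', 0) >= 4:  # 'stars' assumed present in each review dict
        positive += 1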
Example #6
def build_dataset(vectors,
                  name,
                  tune_fraction=0.0,
                  reduced=False,
                  columns=None):
    doc_vectors = util.load_pickle(directories.MISC +
                                   name.replace("_reduced", "") +
                                   "_document_vectors.pkl")

    main_pairs = PairDataBuilder(columns)
    tune_pairs = PairDataBuilder(columns)
    main_mentions = MentionDataBuilder(columns)
    tune_mentions = MentionDataBuilder(columns)
    main_docs = DocumentDataBuilder(columns)
    tune_docs = DocumentDataBuilder(columns)

    print "Building dataset", name
    p = util.Progbar(
        target=(2 if reduced else util.lines_in_file(directories.RAW + name)))
    for i, d in enumerate(util.load_json_lines(directories.RAW + name)):
        if reduced and i > 2:
            break
        p.update(i + 1)

        if reduced and tune_fraction != 0:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if i == 0 else (tune_pairs, tune_mentions, tune_docs)
        else:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if random.random() > tune_fraction else (tune_pairs, tune_mentions, tune_docs)

        ms, ps = mentions.size(), pairs.size()
        mention_positions = {}
        for mention_num in sorted(d["mentions"].keys(), key=int):
            mention_positions[mention_num] = mentions.size()
            mentions.add_mention(
                d["mentions"][mention_num], vectors,
                doc_vectors[d["mentions"][mention_num]["doc_id"]])

        for key in sorted(d["labels"].keys(),
                          key=lambda k:
                          (int(k.split()[1]), int(k.split()[0]))):
            k1, k2 = key.split()
            pairs.add_pair(d["labels"][key], mention_positions[k1],
                           mention_positions[k2],
                           int(d["mentions"][k1]["doc_id"]),
                           int(d["mentions"][k1]["mention_id"]),
                           int(d["mentions"][k2]["mention_id"]),
                           d["pair_features"][key])

        me, pe = mentions.size(), pairs.size()
        docs.add_doc(ms, me, ps, pe, d["document_features"])

    suffix = ("_reduced" if reduced else "")
    if tune_mentions.size() > 0:
        tune_mentions.write(name + "_tune" + suffix)
        tune_pairs.write(name + "_tune" + suffix)
        tune_docs.write(name + "_tune" + suffix)
        main_mentions.write(name + "_train" + suffix)
        main_pairs.write(name + "_train" + suffix)
        main_docs.write(name + "_train" + suffix)
    else:
        main_mentions.write(name + suffix)
        main_pairs.write(name + suffix)
        main_docs.write(name + suffix)
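A hypothetical invocation of build_dataset, mirroring the signature above (the
"train" and "dev" dataset names and the Vectors class from Example #2 are
assumptions made here for illustration):

vectors = Vectors(load=True)
build_dataset(vectors, "train", tune_fraction=0.15)  # hold out a tuning split
build_dataset(vectors, "dev", reduced=True)          # tiny debug build (a few documents)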