import json

from keras.preprocessing.text import Tokenizer


def tokenizer_from_json(json_string):
    """Parses a JSON tokenizer configuration file and
    returns a tokenizer instance.

    # Arguments
        json_string: JSON string encoding a tokenizer configuration.

    # Returns
        A Keras Tokenizer instance
    """
    tokenizer_config = json.loads(json_string)
    config = tokenizer_config.get('config')

    word_counts = json.loads(config.pop('word_counts'))
    word_docs = json.loads(config.pop('word_docs'))
    index_docs = json.loads(config.pop('index_docs'))
    # Integer indexing gets converted to strings with json.dumps()
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop('index_word'))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop('word_index'))

    tokenizer = Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word

    return tokenizer
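# Illustrative round-trip (not part of the original snippet): Tokenizer.to_json()
# produces the JSON string that tokenizer_from_json() consumes; the sample texts
# below are made up for demonstration.
if __name__ == '__main__':
    texts = ["the cat sat on the mat", "the dog ate my homework"]
    tok = Tokenizer(num_words=1000)
    tok.fit_on_texts(texts)
    restored = tokenizer_from_json(tok.to_json())
    assert restored.texts_to_sequences(texts) == tok.texts_to_sequences(texts)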
import json
from collections import OrderedDict

from keras.preprocessing.text import Tokenizer


def load_tokenizer_from_file(filename):
    tokenizer = Tokenizer()
    with open(filename, 'r') as infile:
        tokenizer_data = json.load(infile)
        tokenizer.word_counts = OrderedDict(tokenizer_data['word_counts'])
        tokenizer.word_docs = tokenizer_data['word_docs']
        tokenizer.word_index = tokenizer_data['word_index']
        tokenizer.document_count = tokenizer_data['document_count']
        tokenizer.index_docs = tokenizer_data['index_docs']
    return tokenizer
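# Hypothetical counterpart (not in the original): one way the JSON file read by
# load_tokenizer_from_file() could have been written. The function name and the
# exact attribute set are assumptions inferred from the keys read above. Note
# that json.dump() turns the integer keys of index_docs into strings, the same
# caveat tokenizer_from_json() handles explicitly.
def save_tokenizer_to_file(tokenizer, filename):
    tokenizer_data = {
        'word_counts': list(tokenizer.word_counts.items()),  # keeps insertion order
        'word_docs': dict(tokenizer.word_docs),
        'word_index': tokenizer.word_index,
        'document_count': tokenizer.document_count,
        'index_docs': dict(tokenizer.index_docs),
    }
    with open(filename, 'w') as outfile:
        json.dump(tokenizer_data, outfile)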
import gzip
import json
import os
import pickle
import random

import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text_to_word_sequence


def datagen(max_posts, max_length, stype='training', batch_size=32,
            force_full=False, randposts=False, mintf=1, mindf=2,
            noempty=False, prep=None, returntok=False, balbatch=True):
    """Builds a batch generator over the RSDD posts.

    Returns (nb_words, gen), or (nb_words, gen, tok) when returntok is True.
    """
    assert stype in ['training', 'validation', 'testing']
    looponce = force_full or stype != 'training'
    fn = 'rsdd_posts/%s.gz' % stype

    # load the label and posts for each user
    print("loading %s posts" % stype)
    f = gzip.open(fn, 'rt')
    labels = {}
    allposts = {}
    for i, line in enumerate(f):
        user = str(i)
        d = json.loads(line)[0]
        if d['label'] == 'control':
            labels[user] = np.array([1, 0], dtype=np.float32)
        elif d['label'] == 'depression':
            labels[user] = np.array([0, 1], dtype=np.float32)
        elif d['label'] is None:
            continue
        else:
            raise RuntimeError("unknown label: %s" % d['label'])
        allposts[user] = [post for dt, post in d['posts']]
    f.close()

    # load a cached tokenizer, or fit one on the training posts
    tokfn = "tok_tf%s_df%s.p" % (mintf, mindf)
    load_tokenizer = looponce or os.path.exists(tokfn)
    if load_tokenizer:
        print("loading tokenizer")
        tok = pickle.load(open(tokfn, 'rb'))
    else:
        assert stype == 'training', "cannot fit tokenizer on validation or testing data"
        print("tokenizing %s users" % len(allposts))
        tok = Tokenizer(nb_words=None)  # `num_words` in newer Keras versions
        tok.fit_on_texts(post for uposts in allposts.values() for post in uposts)

        # remove all tokens with a low DF or TF
        removed = 0
        for term in list(tok.word_index.keys()):
            if tok.word_docs[term] < mindf or tok.word_counts[term] < mintf:
                removed += 1
                del tok.word_docs[term]
                del tok.word_counts[term]
                del tok.word_index[term]
        tok.index_docs = None

        # reassign contiguous indices to the remaining terms
        idxs = {}
        nexti = 1
        for term, oldi in sorted(tok.word_index.items()):
            idxs[term] = nexti
            nexti += 1
        assert len(tok.word_index) == len(idxs)
        tok.word_index = idxs
        print("terms removed: %s; remaining: %s" % (removed, len(tok.word_index)))
        pickle.dump(tok, open(tokfn, 'wb'), protocol=-1)
    nb_words = len(tok.word_index) + 1

    # remove posts that contain no in-vocabulary terms
    if noempty:
        noempty_cache = "noempty_tf%s_df%s_%s_mp%s_ml%s.p" % (
            mintf, mindf, max_posts, max_length, stype)
        if os.path.exists(noempty_cache):
            print("loading cached noempty posts")
            allposts, before, after = pickle.load(open(noempty_cache, 'rb'))
        else:
            print("removing empty posts")
            before, after = [], []
            for user in list(allposts.keys()):
                before.append(len(allposts[user]))
                kept = []
                for upost in allposts[user]:
                    skip = True
                    for term in text_to_word_sequence(upost):
                        if term in tok.word_index:
                            skip = False
                            break
                    if not skip:
                        kept.append(upost)
                if len(kept) > 0:
                    allposts[user] = kept
                    after.append(len(allposts[user]))
                else:
                    del allposts[user]
            import scipy.stats
            print("posts before noempty:", scipy.stats.describe(before))
            print("posts after noempty:", scipy.stats.describe(after))
            print("#users before vs. after: %s vs. %s" % (len(before), len(after)))
            pickle.dump((allposts, before, after), open(noempty_cache, 'wb'),
                        protocol=-1)

    print("found %s words; generator ready" % nb_words)

    def vecify(uposts):
        # choose up to max_posts posts per user, vectorize them, and pad to a
        # fixed (max_posts, max_length) matrix
        assert prep is None or not randposts, "incompatible"
        if randposts or prep == 'bran':
            idxs = np.random.permutation(min(max_posts, len(uposts)))
            chosen = [uposts[idx] for idx in idxs]
        elif prep == 'dist':
            if max_posts >= len(uposts):
                chosen = uposts[:max_posts]
            else:
                idxs = np.linspace(0, len(uposts) - 1, num=max_posts, dtype=np.int)
                chosen = [uposts[idx] for idx in idxs]
        elif prep == 'rev':
            chosen = uposts[-max_posts:]
        else:
            chosen = uposts[:max_posts]
        seqs = pad_sequences(tok.texts_to_sequences(chosen), maxlen=max_length)
        if len(seqs) < max_posts:
            seqs = np.pad(seqs, ((0, max_posts - len(seqs)), (0, 0)), mode='constant')
        return seqs

    if looponce:
        # single pass over all users (validation/testing or force_full)
        def gen(meta=False):
            X, y = [], []
            extra = []
            while True:
                for user, uposts in allposts.items():
                    X.append(vecify(uposts))
                    y.append(labels[user])
                    if meta:
                        extra.append((user, len(uposts)))
                    if len(X) == batch_size:
                        X, y = np.array(X), np.array(y)
                        print("...shouldn't happen")
                        yield (X, y)
                        X, y = [], []
                if looponce and len(X) > 0:
                    X, y = np.array(X), np.array(y)
                    if meta:
                        yield (X, y, extra)
                        X, y, extra = [], [], []
                    else:
                        yield (X, y)
                        X, y = [], []
                if looponce:
                    break
    else:
        def gen_nbb():
            # epoch-level balancing: downsample control users to the number of
            # depression users, then shuffle (batches are not individually balanced)
            bylabel = {}
            for user, uposts in allposts.items():
                label = np.argmax(labels[user])
                bylabel.setdefault(label, []).append(uposts)
            print([(k, len(v)) for k, v in bylabel.items()])
            X, y = [], []
            neglabel = np.array([1, 0], dtype=np.float32)
            poslabel = np.array([0, 1], dtype=np.float32)
            poscount = len(bylabel[1])
            while True:
                idxs = ([(1, i) for i in np.random.permutation(poscount)] +
                        [(0, i) for i in np.random.permutation(len(bylabel[0]))[:poscount]])
                idxs = [idxs[i] for i in np.random.permutation(len(idxs))]
                for label, idx in idxs:
                    X.append(vecify(bylabel[label][idx]))
                    if label == 0:
                        y.append(neglabel)
                    elif label == 1:
                        y.append(poslabel)
                    else:
                        raise RuntimeError("invalid label: %s" % label)
                    if len(X) == batch_size:
                        X, y = np.array(X), np.array(y)
                        yield (X, y)
                        X, y = [], []

        def gen_bal():
            # per-batch balancing: alternate one positive and one negative user
            bylabel = {}
            for user, uposts in allposts.items():
                label = np.argmax(labels[user])
                bylabel.setdefault(label, []).append(uposts)
            print([(k, len(v)) for k, v in bylabel.items()])
            assert batch_size % len(bylabel) == 0
            idxs = {}
            for label in bylabel:
                idxs[label] = list(range(len(bylabel[label])))
            X, y = [], []
            neglabel = np.array([1, 0], dtype=np.float32)
            poslabel = np.array([0, 1], dtype=np.float32)
            while True:
                for label in bylabel:
                    random.shuffle(idxs[label])
                for posidx, negidx in zip(idxs[1], idxs[0]):
                    X.append(vecify(bylabel[1][posidx]))
                    y.append(poslabel)
                    X.append(vecify(bylabel[0][negidx]))
                    y.append(neglabel)
                    if len(X) == batch_size:
                        X, y = np.array(X), np.array(y)
                        yield (X, y)
                        X, y = [], []

        if balbatch:
            gen = gen_bal
        else:
            gen = gen_nbb

    if returntok:
        return nb_words, gen, tok
    else:
        return nb_words, gen
def create_tf_example_row(input_row):
    # convert to string
    password = str(input_row[0])
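# A minimal sketch (not the original continuation, which is cut off above) of
# how a row might be packed into a tf.train.Example; the feature names and the
# (password, strength) row layout are assumptions for illustration.
import tensorflow as tf


def password_row_to_tf_example(password, strength):
    feature = {
        'password': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[password.encode('utf-8')])),
        'strength': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[int(strength)])),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))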