def __init__(self, expdir):
  self.expdir = expdir
  self.params = helper.GetParams(
      os.path.join(expdir, 'char_vocab.pickle'), 'eval', expdir)

  self.char_vocab = Vocab.Load(os.path.join(expdir, 'char_vocab.pickle'))
  self.user_vocab = Vocab.Load(os.path.join(expdir, 'user_vocab.pickle'))
  self.params.vocab_size = len(self.char_vocab)
  self.params.user_vocab_size = len(self.user_vocab)

  # construct the tensorflow graph
  self.graph = tf.Graph()
  with self.graph.as_default():
    self.model = Model(self.params, training_mode=False)
    # map the beam search's selected character ids back to readable characters
    self.char_tensor = tf.constant(self.char_vocab.GetWords(), name='char_tensor')
    self.beam_chars = tf.nn.embedding_lookup(self.char_tensor, self.model.selected)
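# --- Illustrative sketch (not from the original file) -------------------------
# A tiny standalone example of the lookup trick above: tf.nn.embedding_lookup on
# a string constant simply gathers rows, so integer ids selected by the beam
# search can be mapped back to characters. The vocabulary and ids below are
# invented for illustration; assumes TensorFlow 1.x, as used elsewhere here.
import tensorflow as tf

chars = tf.constant(['<pad>', 'a', 'b', 'c'], name='char_tensor')
selected_ids = tf.constant([[1, 2], [3, 1]])  # e.g. two beams of character ids
beam_chars = tf.nn.embedding_lookup(chars, selected_ids)

with tf.Session() as sess:
  print(sess.run(beam_chars))  # characters 'a b' and 'c a' for the two beams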
if args.mode in ('train', 'eval', 'classify'):
  mode = args.mode
  if args.partition_override:
    mode = 'all'

  dataset = Dataset(max_len=params.max_len + 1,
                    preshuffle=(args.mode == 'train'),
                    batch_size=params.batch_size)
  print 'reading data'
  dataset.ReadData(args.data, params.context_vars + ['text'],
                   mode=mode, splitter=params.splitter)

if args.mode == 'train':
  if args.vocab is not None:
    vocab = Vocab.Load(args.vocab)
  else:
    min_count = 20
    if hasattr(params, 'min_vocab_count'):
      min_count = params.min_vocab_count
    vocab = Vocab.MakeFromData(dataset.GetColumn('text'), min_count=min_count)

  context_vocabs = {}
  for context_var in params.context_vars:
    v = Vocab.MakeFromData([[u] for u in dataset.GetColumn(context_var)],
                           min_count=50, no_special_syms=True)
    context_vocabs[context_var] = v
    print 'num {0}: {1}'.format(context_var, len(v))

  vocab.Save(os.path.join(args.expdir, 'word_vocab.pickle'))
  print 'vocab size {0}'.format(len(vocab))
  with open(os.path.join(args.expdir, 'context_vocab.pickle'), 'wb') as f:
""" parser = argparse.ArgumentParser() parser.add_argument('--expdir', type=str, help='experiment directory', default='../models/w2v_init') parser.add_argument('--datadir', type=str, help='where to find the non-community members') parser.add_argument('--communities', type=str, default='../data/communities.csv.gz', help='csv file to load the community tweets from') args = parser.parse_args() vocab = Vocab.Load('../data/vocab.txt') # load the communities print 'loading communities' df = pandas.read_csv(args.communities, dtype={'user': str}) # load all the random people print 'loading randos' randos = [] filenames = glob.glob(os.path.join(args.datadir, '*.csv')) for name in filenames: randos.append( pandas.read_csv(name, dtype={'user': str}, usecols=['text', 'user', 'timestamp'])) randos = pandas.concat(randos)
import argparse
import json
import os
import tensorflow as tf

from batcher import Dataset
from char2vec import CharCNN as Char2Vec
from vocab import Vocab

parser = argparse.ArgumentParser()
parser.add_argument('expdir')
args = parser.parse_args()

config = tf.ConfigProto(inter_op_parallelism_threads=10,
                        intra_op_parallelism_threads=10)

dataset = Dataset(10, preshuffle=False)
dataset.ReadData('../data/tweetlid/training.tsv.gz', 'all', 'tweet')
input_vocab = Vocab.MakeFromData(dataset.GetSentences(), min_count=1)
char_vocab = Vocab.Load(os.path.join(args.expdir, 'char_vocab.pickle'))

# longest input word, with room for two extra symbols (e.g. word boundary markers)
max_word_len = max([len(x) for x in input_vocab.GetWords()]) + 2
print('max word len {0}'.format(max_word_len))

with open(os.path.join(args.expdir, 'model_params.json'), 'r') as f:
  model_params = json.load(f)

c2v = Char2Vec(char_vocab, model_params, max_sequence_len=max_word_len)
the_words, word_lengths = c2v.MakeMat(input_vocab, pad_len=max_word_len)

# restore the trained model weights
saver = tf.train.Saver(tf.all_variables())
session = tf.Session(config=config)
saver.restore(session, os.path.join(args.expdir, 'model.bin'))
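# --- Illustrative sketch (not from the original file) -------------------------
# A minimal, self-contained example of the checkpoint restore pattern used above
# (TF 1.x style Saver). The variable and checkpoint path are invented; the real
# script restores the trained Char2Vec weights from <expdir>/model.bin.
import tensorflow as tf

v = tf.get_variable('v', shape=[3], initializer=tf.zeros_initializer())
saver = tf.train.Saver(tf.global_variables())

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  saver.save(sess, '/tmp/example_model.bin')     # write a checkpoint
  saver.restore(sess, '/tmp/example_model.bin')  # load it back, as done above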
batch_size = 25
dataset = Dataset(batch_size, preshuffle=(mode == 'train'))
und_symbol = 'und'
dataset.ReadData(args.data, mode, args.model)

# Make the input vocabulary (words that appear in data)
if baseline:
  # The baseline is to use fixed word embeddings.
  if mode == 'train':
    # The input vocab is fixed during training.
    input_vocab = Vocab.MakeFromData(dataset.GetSentences(), min_count=2)
    input_vocab.Save(os.path.join(args.expdir, 'input_vocab.pickle'))
  else:
    # During testing we need to load the saved input vocab.
    input_vocab = Vocab.Load(
        os.path.join(args.expdir, 'input_vocab.pickle'))
else:
  # The open vocabulary can be regenerated with each run.
  min_count = 1
  if mode == 'debug':
    min_count = 10  # When visualizing word embeddings hide rare words
  maxlens = {'word': 40, 'char': 150, 'tweet': 40}
  input_vocab = Vocab.MakeFromData(dataset.GetSentences(), min_count=min_count,
                                   max_length=maxlens[args.model])

if mode == 'train':
  # Make the character vocabulary
  if args.start:
    shutil.copyfile(os.path.join(args.start, 'char_vocab.pickle'),
                    os.path.join(args.expdir, 'char_vocab.pickle'))