Example #1
    def __init__(self, expdir):
        self.expdir = expdir
        self.params = helper.GetParams(
            os.path.join(expdir, 'char_vocab.pickle'), 'eval', expdir)
        self.char_vocab = Vocab.Load(os.path.join(expdir, 'char_vocab.pickle'))
        self.user_vocab = Vocab.Load(os.path.join(expdir, 'user_vocab.pickle'))
        self.params.vocab_size = len(self.char_vocab)
        self.params.user_vocab_size = len(self.user_vocab)

        # construct the tensorflow graph
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.model = Model(self.params, training_mode=False)
            self.char_tensor = tf.constant(self.char_vocab.GetWords(),
                                           name='char_tensor')
            self.beam_chars = tf.nn.embedding_lookup(self.char_tensor,
                                                     self.model.selected)
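
Example #1 builds the evaluation graph for a trained model: it reloads the saved character and user vocabularies, rebuilds the model with training_mode=False, and adds a tf.nn.embedding_lookup so the beam-search selections in self.model.selected can be mapped back to characters. Below is a minimal sketch of driving such a loader at inference time; the enclosing class name (BeamSearchLoader) is a hypothetical stand-in, and the checkpoint filename 'model.bin' is borrowed from Example #4 rather than from this snippet.

import os
import tensorflow as tf

# Hedged sketch: the class name and the checkpoint filename are assumptions.
loader = BeamSearchLoader('../models/my_experiment')
with loader.graph.as_default():
    saver = tf.train.Saver(tf.all_variables())
session = tf.Session(graph=loader.graph)
saver.restore(session, os.path.join(loader.expdir, 'model.bin'))
# After restoring, loader.beam_chars maps beam-search ids back to characters.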
Example #2
if args.mode in ('train', 'eval', 'classify'):
  mode = args.mode
  if args.partition_override:
    mode = 'all'

  dataset = Dataset(max_len=params.max_len + 1, 
                    preshuffle=args.mode=='train',
                    batch_size=params.batch_size)
  print('reading data')
  dataset.ReadData(args.data, params.context_vars + ['text'],
                   mode=mode, splitter=params.splitter)

if args.mode == 'train':
  if args.vocab is not None:
    vocab = Vocab.Load(args.vocab)
  else:
    min_count = 20
    if hasattr(params, 'min_vocab_count'):
      min_count = params.min_vocab_count
    vocab = Vocab.MakeFromData(dataset.GetColumn('text'), min_count=min_count)
  context_vocabs = {}
  for context_var in params.context_vars:
    v = Vocab.MakeFromData([[u] for u in dataset.GetColumn(context_var)],
                           min_count=50, no_special_syms=True)
    context_vocabs[context_var] = v
    print('num {0}: {1}'.format(context_var, len(v)))
    
  vocab.Save(os.path.join(args.expdir, 'word_vocab.pickle'))
  print('vocab size {0}'.format(len(vocab)))
  with open(os.path.join(args.expdir, 'context_vocab.pickle'), 'wb') as f:
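
Example #2 builds the word vocabulary and one vocabulary per context variable at training time and writes them into the experiment directory (the excerpt is cut off at the point where the context vocabularies are written). A hedged sketch of the eval-time counterpart is shown below; it assumes the filenames written above and that context_vocabs was pickled as a plain dict, which the excerpt does not show.

import os
import pickle

from vocab import Vocab

# Hedged sketch: assumes context_vocabs was written with pickle.dump.
vocab = Vocab.Load(os.path.join(args.expdir, 'word_vocab.pickle'))
with open(os.path.join(args.expdir, 'context_vocab.pickle'), 'rb') as f:
    context_vocabs = pickle.load(f)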
Example #3

import argparse
import glob
import os

import pandas

from vocab import Vocab

parser = argparse.ArgumentParser()
parser.add_argument('--expdir',
                    type=str,
                    help='experiment directory',
                    default='../models/w2v_init')
parser.add_argument('--datadir',
                    type=str,
                    help='where to find the non-community members')
parser.add_argument('--communities',
                    type=str,
                    default='../data/communities.csv.gz',
                    help='csv file to load the community tweets from')
args = parser.parse_args()
vocab = Vocab.Load('../data/vocab.txt')

# load the communities
print('loading communities')
df = pandas.read_csv(args.communities, dtype={'user': str})

# load all the random people
print('loading randos')
randos = []
filenames = glob.glob(os.path.join(args.datadir, '*.csv'))
for name in filenames:
    randos.append(
        pandas.read_csv(name,
                        dtype={'user': str},
                        usecols=['text', 'user', 'timestamp']))
randos = pandas.concat(randos)
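
Example #3 loads the community tweets and a separate pool of non-community users into two pandas DataFrames with matching 'user' dtypes. One way to combine them for downstream filtering is sketched below; the is_community column is an illustrative assumption and does not appear in the snippet.

# Hedged sketch: the is_community flag is an assumption for illustration.
df['is_community'] = True
randos['is_community'] = False
all_tweets = pandas.concat([df, randos], ignore_index=True)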
Example #4
import argparse
import json
import os

import tensorflow as tf

from batcher import Dataset
from char2vec import CharCNN as Char2Vec
from vocab import Vocab

parser = argparse.ArgumentParser()
parser.add_argument('expdir')
args = parser.parse_args()

config = tf.ConfigProto(inter_op_parallelism_threads=10,
                        intra_op_parallelism_threads=10)

dataset = Dataset(10, preshuffle=False)
dataset.ReadData('../data/tweetlid/training.tsv.gz', 'all', 'tweet')

input_vocab = Vocab.MakeFromData(dataset.GetSentences(), min_count=1)
char_vocab = Vocab.Load(os.path.join(args.expdir, 'char_vocab.pickle'))

max_word_len = max([len(x) for x in input_vocab.GetWords()]) + 2
print('max word len {0}'.format(max_word_len))

with open(os.path.join(args.expdir, 'model_params.json'), 'r') as f:
    model_params = json.load(f)

c2v = Char2Vec(char_vocab, model_params, max_sequence_len=max_word_len)
the_words, word_lengths = c2v.MakeMat(input_vocab, pad_len=max_word_len)

saver = tf.train.Saver(tf.all_variables())
session = tf.Session(config=config)

saver.restore(session, os.path.join(args.expdir, 'model.bin'))
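
Example #4 rebuilds the padded character matrix for every word in the data with c2v.MakeMat and then restores the trained character-CNN weights from the experiment directory. A hedged sketch of pushing that matrix through the restored graph follows; the placeholder and output attribute names on c2v (words_as_chars, seq_lens, word_embeddings) are assumptions about the CharCNN class, not taken from the snippet.

# Hedged sketch: the attribute names on c2v are assumptions.
word_embeddings = session.run(
    c2v.word_embeddings,
    feed_dict={c2v.words_as_chars: the_words, c2v.seq_lens: word_lengths})
print('embedding matrix shape {0}'.format(word_embeddings.shape))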
Example #5
batch_size = 25
dataset = Dataset(batch_size, preshuffle=mode == 'train')
und_symbol = 'und'

dataset.ReadData(args.data, mode, args.model)

# Make the input vocabulary (words that appear in data)
if baseline:
    # The baseline is to use fixed word embeddings.
    if mode == 'train':
        # The input vocab is fixed during training.
        input_vocab = Vocab.MakeFromData(dataset.GetSentences(), min_count=2)
        input_vocab.Save(os.path.join(args.expdir, 'input_vocab.pickle'))
    else:
        # During testing we need to load the saved input vocab.
        input_vocab = Vocab.Load(
            os.path.join(args.expdir, 'input_vocab.pickle'))
else:
    # The open vocabulary can be regenerated with each run.
    min_count = 1
    if mode == 'debug':
        min_count = 10  # When visualizing word embeddings hide rare words
    maxlens = {'word': 40, 'char': 150, 'tweet': 40}
    input_vocab = Vocab.MakeFromData(dataset.GetSentences(),
                                     min_count=min_count,
                                     max_length=maxlens[args.model])

if mode == 'train':
    # Make the character vocabulary
    if args.start:
        shutil.copyfile(os.path.join(args.start, 'char_vocab.pickle'),
                        os.path.join(args.expdir, 'char_vocab.pickle'))
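
Example #5 separates a baseline setup with a fixed word vocabulary (saved at training time and reloaded at test time) from an open-vocabulary setup whose input vocab is rebuilt on every run; at training time it also copies the character vocabulary over from a warm-start directory when --start is given. Whichever char_vocab.pickle ends up in the experiment directory is what the evaluation code later reads back, exactly as Examples #1 and #4 do:

# Grounded in Examples #1 and #4: evaluation reloads the saved char vocab.
char_vocab = Vocab.Load(os.path.join(args.expdir, 'char_vocab.pickle'))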