def __init__(self, x_max_lengths=None, y_max_length=8):
    self.name = 'wikipedia'
    self.x_max_length = max(x_max_lengths)
    self.x_max_lengths = x_max_lengths
    self.y_max_length = y_max_length

    # Vocabulary as defined in two dictionaries
    # self.vocabulary          | token -> idx
    # self.reversed_vocabulary | idx -> token
    self.vocabulary = self.read_vocabulary()
    self.vocab_size = len(self.vocabulary)
    self.reversed_vocabulary = dict(
        zip(self.vocabulary.values(), self.vocabulary.keys()))

    self.test = WikipediaTestData(
        self,
        path=directory('/data/wikipedia_data/') + 'test.txt',
        x_max_length=self.x_max_length,
        y_max_length=self.y_max_length)

    self.train = WikipediaBucketedTrainData(
        self,
        paths=[
            directory('/data/wikipedia_data/') + 'train-33.txt',
            directory('/data/wikipedia_data/') + 'train-62.txt',
            directory('/data/wikipedia_data/') + 'train-118.txt',
            directory('/data/wikipedia_data/') + 'train-504.txt',
        ],
        x_max_lengths=self.x_max_lengths,
        y_max_length=self.y_max_length)
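# The two-dictionary vocabulary pattern used above (token -> idx, plus a
# reversed idx -> token mapping built with dict(zip(values, keys))) can be
# tried in isolation. A minimal, self-contained sketch with made-up tokens,
# not the real read_vocabulary() output:

toy_vocabulary = {'PAD': 0, 'UNK': 1, 'the': 2, 'wikipedia': 3}
toy_reversed = dict(zip(toy_vocabulary.values(), toy_vocabulary.keys()))

assert toy_reversed[3] == 'wikipedia'        # idx -> token
assert toy_vocabulary[toy_reversed[2]] == 2  # round trip: idx -> token -> idx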
def setupMainWindow(self):
    self.setWindowTitle("Classifier")

    self.documents_directory = utils.directory(config[KEY_DOCUMENTS])
    self.documents = self.documents_directory.get_all_files()

    self.categories_directory = utils.directory(config[KEY_CATEGORIES])
    self.categories = self.categories_directory.get_all_directories()

    QtGui.QShortcut(QtGui.QKeySequence("Ctrl+Z"), self, self.undoAction)
def wikipedia(n, should_filter_pretrained=False):
    d = WikipediaDataset(x_max_lengths=[33, 62, 118, 504], y_max_length=8)
    ep = EmbeddingProcessor(d.vocabulary,
                            path=directory('/data/compvec_wikipedia/'))

    d.store_dataset(path=directory('/data/compvec_wikipedia/'))

    if should_filter_pretrained:
        filter_pretrained(ep)

    runs(n, d, ep, neural=True, random=False)
def single_wordnet(n, should_filter_pretrained=False):
    d = WordnetDataset(x_max_length=32, y_max_length=1)
    ep = EmbeddingProcessor(d.vocabulary,
                            path=directory('/data/compvec_wordnet_single/'))

    d.store_dataset(path=directory('/data/compvec_wordnet_single/'))

    if should_filter_pretrained:
        filter_pretrained(ep)

    runs(n, d, ep, yc=False, neural=False, random=False)
    runs(n, d, ep, yc=True, neural=False, random=False)
def multi_wordnet(n, should_filter_pretrained=False):
    d = WordnetDataset(
        test_data_path=directory('/data/compvec_wordnet_single/') + 'test_data.gz',
        x_max_length=32,
        y_max_length=6)
    ep = EmbeddingProcessor(d.vocabulary,
                            path=directory('/data/compvec_wordnet_multi/'))

    d.store_dataset(path=directory('/data/compvec_wordnet_multi/'))

    if should_filter_pretrained:
        filter_pretrained(ep)

    runs(n, d, ep, neural=True, random=True)
def __init__(self, run_group_name, dataset, embedding_processor, yc=True,
             pretraining=None, learning_rate=1e-3, batch_size=512,
             embedding_size=300, stop_gradients_y_n=False, dropout_keep_p=0.75,
             margin=0.25, composition='sum', loss='mse',
             refine_after_x_steps=0, no=None):
    self.dataset = dataset
    self.embedding_processor = embedding_processor
    self.y_composition = yc
    self.no = no
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.dropout_keep_p = dropout_keep_p
    self.pretraining = pretraining
    self.refine_after_x_steps = refine_after_x_steps

    self.run_dir = directory('/out/run-%s' % run_group_name,
                             ['logs', 'tsne', 'embeddings', 'output'])
    self.data_dir = embedding_processor.path

    self.x_max_buckets = None
    if hasattr(self.dataset, 'x_max_lengths'):
        self.x_max_buckets = self.dataset.x_max_lengths

    self.graph = tf.Graph()
    with self.graph.as_default():

        # Model
        self.model = Model(embedding_size=embedding_size,
                           x_max_buckets=self.x_max_buckets,
                           x_max_length=self.dataset.x_max_length,
                           y_max_length=self.dataset.y_max_length,
                           y_composition=self.y_composition,
                           vocab_size=self.dataset.vocab_size,
                           margin=margin,
                           composition=composition,
                           loss=loss,
                           stop_gradients_y_n=stop_gradients_y_n)

        # Evaluator
        self.evaluation = Evaluation(self.model, self.dataset)

        # Assign Tensorflow Operations
        self.assign_ops()

        # Setup Writers for Tensorboard
        self.setup_writers()
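# The constructor above builds everything inside its own tf.Graph so that
# several runs can coexist in one process. A minimal, self-contained sketch
# of that graph-scoping pattern, assuming TensorFlow 1.x (the API style used
# above); the placeholder/reduce_mean ops stand in for the project's Model,
# Evaluation and writer setup, they are not the real thing:

import tensorflow as tf

graph = tf.Graph()
with graph.as_default():
    # Ops created here attach to `graph`, not the process-wide default graph.
    x = tf.placeholder(tf.float32, shape=[None, 300], name='x')
    y = tf.reduce_mean(x, axis=1, name='y')
    writer = tf.summary.FileWriter('/tmp/example_logs', graph)  # for TensorBoard

with tf.Session(graph=graph) as sess:
    print(sess.run(y, feed_dict={x: [[1.0] * 300]}))
writer.close()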
def deploy_file(path, kwargs, config):
    with utils.directory(os.path.dirname(path)):
        config.update(kwargs)
        if 'FunctionName' not in config:
            clip.exit('You must provide a function name', err=True)
        # Zip up directory
        utils.make_zip(config['FunctionName'])
        # Upload!
        upload(config['FunctionName'], config)
def deploy_dir(path, kwargs):
    with utils.directory(path):
        config = LambdaConfig().load_from_cwd().update_config(kwargs)
        config.verify()
        # Remove ignore paths
        for e in config.get('ignore', []) + ['.git/', '.gitignore']:
            utils.delete_resource(e)
        # Run install command
        if 'install' in config:
            utils.shell(config.get('install'))
        upload(config.get_config())
def favicon():
    '''
    Serves GET favicon requests.

    Favicons are the icons that show up in your browser's tab. For the
    purposes of this project a favicon is not needed, so a blank one is used.

    Note: this function is required because browsers will try to access
    '{base_url}/favicon.ico'; without it, the request would fall through to
    the date_page() function, where things would get messed up.
    '''
    return send_from_directory(utils.directory(APP.root_path), 'favicon.ico')
def __init__(self, model, dataset):
    self.m = model
    self.d = dataset

    path = directory('/data/wordnetsingle/') + 'test_data.gz'
    self.compveceval_test = WordnetData.from_path(
        path, self.d.vocabulary, self.d.x_max_length, self.d.y_max_length)

    self.compveceval = CompVecEvalEvaluation(self.m, self.compveceval_test)
    self.senteval = SentEvalEvaluation(self.m, self.d)
    self.wordsim = WordSimEvaluation(self.m, self.d)
def __init__(self, vocabulary=None, path=None):
    if vocabulary is None:
        self.vocabulary = dict()
    else:
        self.vocabulary = vocabulary

    if path is None:
        self.path = directory('/data/compositional_wordnet')
    else:
        self.path = path

    self.reversed_vocabulary = dict(
        zip(self.vocabulary.values(), self.vocabulary.keys()))
def run(tasks, dir=None, silent=False):
    if dir is None:
        dir = os.getcwd()
    okapi.silent = silent
    with utils.directory(dir):
        config = utils.load_config()
        okapi.log('Running tasks on {}'.format(
            config.project if hasattr(config, 'project') else os.path.basename(dir)))
        setattr(config, 'ok', okapi)
        for task in tasks:
            if not hasattr(config, task):
                okapi.log('"{}" is not a valid task, skipping!'.format(task))
                continue
            okapi.run(getattr(config, task))
        okapi.log('All tasks complete!')
def deploy_dir(path, kwargs):
    with utils.directory(path):
        config = utils.load_config()
        config['config'].update(kwargs)
        if 'FunctionName' not in config['config']:
            clip.exit('You must provide a function name', err=True)
        # Remove ignore paths
        for e in config['ignore'] + ['.git/', '.gitignore']:
            utils.delete_resource(e)
        # Run install command
        if 'install' in config:
            utils.shell(config['install'])
        # Zip up directory
        utils.make_zip(config['config']['FunctionName'])
        # Upload!
        params = config['config']
        upload(params['FunctionName'], params)
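# The upload() helper called by these deploy functions is not shown here.
# A hedged sketch of what such a helper might do with boto3, assuming the
# archive produced by utils.make_zip() is named '<FunctionName>.zip' and the
# Lambda function already exists (upload_sketch and the 'Region' key are
# hypothetical names, not this project's API):

import boto3

def upload_sketch(function_name, params):
    """Hypothetical stand-in for upload(): push <function_name>.zip to AWS Lambda."""
    client = boto3.client('lambda', region_name=params.get('Region', 'us-east-1'))
    with open('{}.zip'.format(function_name), 'rb') as f:
        client.update_function_code(FunctionName=function_name, ZipFile=f.read())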
def read_fallback_embeddings(vocabulary, pretrain=None, evaluation='MR'):
    original_embedding_path = original_embedding_file(pretrain)

    if original_embedding_path is None:
        return dict()

    ped = EmbeddingProcessor(vocabulary)
    processed_embeddings_path = '%s/%s-%s.vec.gz' % (
        directory('/data/senteval_embeddings'), pretrain, evaluation)

    if not os.path.isfile(processed_embeddings_path):
        ped.process_pretrained_embeddings(
            input_filename=original_embedding_path,
            output_filename=processed_embeddings_path)

    embeddings = ped.read_embeddings(processed_embeddings_path)

    return embeddings
def read_vocabulary(self, path=None, vocab_min_frequency=18, vocab_size_limit=5171164):
    """
    Read vocabulary from vocab file.
    """
    vocab_ns = dict()

    if path is None:
        path = directory('/data/wikipedia_data/') + 'vocab.txt'

    with open(path) as f:
        for i, line in enumerate(f):

            if i % 10000 == 0:
                sys.stdout.write("\rReading vocabulary… %6.2f%%" %
                                 ((100 * i) / float(VOCAB_SIZE), ))

            s = line.split()
            if len(s) != 2:
                continue
            t, n = s[0], int(s[1])

            if n >= vocab_min_frequency and i < vocab_size_limit - 2:
                vocab_ns[t] = n
            else:
                break  # vocabulary file is already sorted

    print("\rVocabulary read %d" % (len(vocab_ns) + 2))

    # Sort the vocabulary by frequency, frequent words on top
    vocab_ns = sorted(vocab_ns.items(), key=operator.itemgetter(1), reverse=True)
    vocabulary = {token: i for i, (token, count) in enumerate(vocab_ns, 2)}

    vocabulary['PAD'] = PAD_SYMBOL
    vocabulary['UNK'] = UNK_SYMBOL

    return vocabulary
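# read_vocabulary() expects one "token count" pair per line, already sorted by
# frequency, and reserves two ids for PAD and UNK before real tokens start at
# index 2. A self-contained mini version of that contract (the counts are made
# up, and PAD_SYMBOL = 0 / UNK_SYMBOL = 1 is an assumption about the module
# constants used above):

import operator

PAD_SYMBOL, UNK_SYMBOL = 0, 1
lines = ['the 1061396', 'of 593677', 'and 416629', 'rare 17']

vocab_ns = {}
for line in lines:
    token, count = line.split()
    if int(count) >= 18:  # vocab_min_frequency
        vocab_ns[token] = int(count)

vocab_ns = sorted(vocab_ns.items(), key=operator.itemgetter(1), reverse=True)
vocabulary = {token: i for i, (token, _) in enumerate(vocab_ns, 2)}
vocabulary['PAD'] = PAD_SYMBOL
vocabulary['UNK'] = UNK_SYMBOL

assert vocabulary == {'the': 2, 'of': 3, 'and': 4, 'PAD': 0, 'UNK': 1}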
parser.add_argument('data_dir')
parser.add_argument('-a', '--annotator', default='default')
parser.add_argument('-i', '--interface', default='interface')
parser.add_argument('--past_reports', default=False, action='store_true')
parser.add_argument('-m', '--models', action='append')
parser.add_argument('-d', '--device', default='cpu')
parser.add_argument('-r', '--reload', default=False, action='store_true')
parser.add_argument('-p', '--port')
args = parser.parse_args()

if args.data_dir is None:
    raise NotImplementedError

startup['annotations_dir'] = join(args.data_dir, args.annotator + '_annotations')
if not exists(startup['annotations_dir']):
    mkdir(startup['annotations_dir'])

np.random.seed(0)
startup['file_generator'] = FileGenerator(
    args.data_dir, startup['annotations_dir'], reload=args.reload)
try:
    startup['file'] = next(startup['file_generator'])
except StopIteration:
    startup['file'] = None

with directory(args.interface_dir):
    exec('import ' + args.interface)

models_to_load = args.models if args.models is not None else []
startup['interface'] = eval(args.interface).FullModelInterface(
    models_to_load=models_to_load, device=args.device)
startup['include_past_reports'] = args.past_reports

app.run(debug=True, port=args.port)
def root(self, path):
    with utils.directory(path):
        yield
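# Several of the functions above (deploy_file, deploy_dir, run, root) use
# utils.directory(path) as a context manager that runs a block from inside
# another working directory. A minimal sketch of such a helper, assuming that
# is all it does; directory_sketch is a hypothetical name, not this project's
# implementation:

import os
from contextlib import contextmanager

@contextmanager
def directory_sketch(path):
    """Temporarily chdir into `path`, restoring the previous cwd afterwards."""
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous)

# Usage, mirroring the wrappers above:
# with directory_sketch('/tmp'):
#     ...  # code here runs with /tmp as the working directory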
def deploy_file(path, kwargs):
    with utils.directory(os.path.dirname(path)):
        config = LambdaConfig().load_from_front_matter(path).update_config(kwargs)
        config.verify()
        upload(config.get_config())