def test_count_vocab_items_respects_casing(self):
    indexer = SingleIdTokenIndexer("words")
    counter = defaultdict(lambda: defaultdict(int))
    indexer.count_vocab_items(Token("Hello"), counter)
    indexer.count_vocab_items(Token("hello"), counter)
    assert counter["words"] == {"hello": 1, "Hello": 1}

    indexer = SingleIdTokenIndexer("words", lowercase_tokens=True)
    counter = defaultdict(lambda: defaultdict(int))
    indexer.count_vocab_items(Token("Hello"), counter)
    indexer.count_vocab_items(Token("hello"), counter)
    assert counter["words"] == {"hello": 2}
def test_as_array_produces_token_sequence(self):
    indexer = SingleIdTokenIndexer("words")
    padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
    assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
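# A minimal sketch (not part of the tests above) of how the padded ids are
# produced in the first place, assuming the pre-1.0 AllenNLP API these tests
# target: count_vocab_items feeds a Vocabulary, and tokens_to_indices maps
# tokens to ids in the indexer's namespace.
indexer = SingleIdTokenIndexer("words")
vocab = Vocabulary()
vocab.add_token_to_namespace("hello", namespace="words")
ids = indexer.tokens_to_indices([Token("hello")], vocab, "key")
# ids == {'key': [vocab.get_token_index('hello', 'words')]}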
def __init__(self,
             tokenizer: Callable[[str], List[str]] = lambda x: x.split(),
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__()
    self._tokenizer = tokenizer
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
def __init__(self,
             lazy: bool = False,
             paper_features_path: str = None,
             word_splitter: WordSplitter = None,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             data_file: Optional[str] = None,
             samples_per_query: int = 5,
             margin_fraction: float = 0.5,
             ratio_hard_negatives: float = 0.5,
             predict_mode: bool = False,
             max_num_authors: Optional[int] = 5,
             ratio_training_samples: Optional[float] = None,
             max_sequence_length: Optional[int] = -1,
             cache_path: Optional[str] = None,
             overwrite_cache: Optional[bool] = False,
             use_cls_token: Optional[bool] = None,
             concat_title_abstract: Optional[bool] = None,
             coviews_file: Optional[str] = None,
             included_text_fields: Optional[str] = None,
             use_paper_feature_cache: bool = True) -> None:
    """
    Args:
        lazy: if False, returns a list of instances
        paper_features_path: path to the paper features json file
            (result of scripts.generate_paper_features.py)
        tokenizer: tokenizer to be used for tokenizing strings
        token_indexers: token indexers for indexing the vocab
        data_file: path to the data file (e.g., citations)
        samples_per_query: number of triplets to generate for each query
        margin_fraction: minimum margin of co-views between positive and negative samples
        ratio_hard_negatives: ratio of training data that is selected from hard negatives;
            the remainder is allocated to easy negatives. Should be set to 1.0 in the case
            of similar click data
        predict_mode: if True, the model only considers the current paper and returns an
            embedding; otherwise the model uses the triplet format to train the embedder
        max_num_authors: maximum number of authors
        ratio_training_samples: limits training to this proportion of all training instances
        max_sequence_length: longer sequences are truncated (-1 means no truncation)
        cache_path: path to a file in which to cache instances; if None, instances are not
            cached. If specified, instances are cached after being created, so next time
            they are not created again from scratch
        overwrite_cache: if True, overwrites the cached files. Each file corresponds to
            all instances created from the train, dev or test set
        use_cls_token: like BERT, prepend an additional CLS token (for transformers)
        concat_title_abstract: whether to treat title and abstract as a single field
        coviews_file: only for backward compatibility with older models (renamed to
            `data_file` in newer models); leave this empty as it has no effect
        included_text_fields: space-delimited fields to concatenate to the title,
            e.g., `title abstract authors`
        use_paper_feature_cache: set to False to disable the in-memory cache of paper features
    """
    super().__init__(lazy)
    self._word_splitter = word_splitter or SimpleWordSplitter()
    self._tokenizer = tokenizer or WordTokenizer(self._word_splitter)
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._token_indexer_author_id = {"tokens": SingleIdTokenIndexer(namespace='author')}
    self._token_indexer_author_position = \
        {"tokens": SingleIdTokenIndexer(namespace='author_positions')}
    self._token_indexer_venue = {"tokens": SingleIdTokenIndexer(namespace='venue')}
    self._token_indexer_id = {"tokens": SingleIdTokenIndexer(namespace='id')}

    with open(paper_features_path) as f_in:
        self.papers = json.load(f_in)

    self.samples_per_query = samples_per_query
    self.margin_fraction = margin_fraction
    self.ratio_hard_negatives = ratio_hard_negatives
    self.predict_mode = predict_mode
    self.max_sequence_length = max_sequence_length
    self.use_cls_token = use_cls_token

    if data_file and not predict_mode:
        # logger.info(f'reading contents of the file at: {data_file}')
        with open(data_file) as f_in:
            self.dataset = json.load(f_in)
        # logger.info(f'reading complete. Total {len(self.dataset)} records found.')
        root_path, _ = os.path.splitext(data_file)
        # for the multitask interleaving reader, track which dataset the instance comes from
        self.data_source = root_path.split('/')[-1]
    else:
        self.dataset = None
        self.data_source = None

    self.max_num_authors = max_num_authors
    self.triplet_generator = TripletGenerator(
        paper_ids=list(self.papers.keys()),
        coviews=self.dataset,
        margin_fraction=margin_fraction,
        samples_per_query=samples_per_query,
        ratio_hard_negatives=ratio_hard_negatives)
    # paper_id -> paper features; serves as a cache for the _get_paper_features function
    self.paper_feature_cache = {}
    self.ratio_training_samples = (
        float(ratio_training_samples) if ratio_training_samples else None)
    self.cache_path = cache_path
    self.overwrite_cache = overwrite_cache
    self.data_file = data_file
    self.paper_features_path = paper_features_path
    self.concat_title_abstract = concat_title_abstract
    # guard against the declared default of None; the original assumed a string here
    self.included_text_fields = set((included_text_fields or '').split())
    self.use_paper_feature_cache = use_paper_feature_cache
    self.abstract_delimiter = [Token('[SEP]')]
    self.author_delimiter = [Token('[unused0]')]
def test_as_array_produces_token_sequence(self):
    # note: this variant uses a pad_token_sequence signature that takes a
    # plain list rather than a dict keyed by index name (an older AllenNLP API)
    indexer = SingleIdTokenIndexer("words")
    padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
    assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
def __init__(
    self,
    lazy: bool = False,
    sample: int = -1,
    lf_syntax: str = None,
    replace_world_entities: bool = False,
    align_world_extractions: bool = False,
    gold_world_extractions: bool = False,
    tagger_only: bool = False,
    denotation_only: bool = False,
    world_extraction_model: Optional[str] = None,
    skip_attributes_regex: Optional[str] = None,
    entity_bits_mode: Optional[str] = None,
    entity_types: Optional[List[str]] = None,
    lexical_cues: List[str] = None,
    tokenizer: Tokenizer = None,
    question_token_indexers: Dict[str, TokenIndexer] = None,
) -> None:
    super().__init__(lazy=lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._question_token_indexers = question_token_indexers or {
        "tokens": SingleIdTokenIndexer()
    }
    self._entity_token_indexers = self._question_token_indexers
    self._sample = sample
    self._replace_world_entities = replace_world_entities
    self._lf_syntax = lf_syntax
    self._entity_bits_mode = entity_bits_mode
    self._align_world_extractions = align_world_extractions
    self._gold_world_extractions = gold_world_extractions
    self._entity_types = entity_types
    self._tagger_only = tagger_only
    self._denotation_only = denotation_only
    self._skip_attributes_regex = None
    if skip_attributes_regex is not None:
        self._skip_attributes_regex = re.compile(skip_attributes_regex)
    self._lexical_cues = lexical_cues

    # Recording of entities in categories relevant for tagging
    all_entities = {}
    all_entities["world"] = ["world1", "world2"]

    # TODO: Clarify this into an appropriate parameter
    self._collapse_tags = ["world"]

    self._all_entities = None
    if entity_types is not None:
        if self._entity_bits_mode == "collapsed":
            self._all_entities = entity_types
        else:
            self._all_entities = [e for t in entity_types for e in all_entities[t]]
    logger.info(f"all_entities = {self._all_entities}")

    # Base world, depending on LF syntax only
    self._knowledge_graph = KnowledgeGraph(
        entities={"placeholder"}, neighbors={}, entity_text={"placeholder": "placeholder"}
    )
    self._world = QuarelWorld(self._knowledge_graph, self._lf_syntax)

    # Decide dynamic entities, if any
    self._dynamic_entities: Dict[str, str] = dict()
    self._use_attr_entities = False
    if "_attr_entities" in lf_syntax:
        self._use_attr_entities = True
        qr_coeff_sets = self._world.qr_coeff_sets
        for qset in qr_coeff_sets:
            for attribute in qset:
                if (
                    self._skip_attributes_regex is not None
                    and self._skip_attributes_regex.search(attribute)
                ):
                    continue
                # Get text associated with each entity, both from the entity
                # identifier and from associated lexical cues, if any
                entity_strings = [words_from_entity_string(attribute).lower()]
                if self._lexical_cues is not None:
                    for key in self._lexical_cues:
                        if attribute in LEXICAL_CUES[key]:
                            entity_strings += LEXICAL_CUES[key][attribute]
                self._dynamic_entities["a:" + attribute] = " ".join(entity_strings)

    # Update world to include dynamic entities
    if self._use_attr_entities:
        logger.info(f"dynamic_entities = {self._dynamic_entities}")
        neighbors: Dict[str, List[str]] = {key: [] for key in self._dynamic_entities}
        self._knowledge_graph = KnowledgeGraph(
            entities=set(self._dynamic_entities.keys()),
            neighbors=neighbors,
            entity_text=self._dynamic_entities,
        )
        self._world = QuarelWorld(self._knowledge_graph, self._lf_syntax)

    self._stemmer = PorterStemmer().stemmer

    self._world_tagger_extractor = None
    self._extract_worlds = False
    if world_extraction_model is not None:
        logger.info("Loading world tagger model...")
        self._extract_worlds = True
        self._world_tagger_extractor = WorldTaggerExtractor(world_extraction_model)
        logger.info("Done loading world tagger model!")

    # Convenience regex for recognizing attributes
    self._attr_regex = re.compile(r"""\((\w+) (high|low|higher|lower)""")
def test_from_params_valid_vocab_extension_thoroughly(self):
    '''
    Tests valid vocab extension thoroughly: vocab extension is valid when
    overlapping namespaces have the same padding behaviour (padded/non-padded).

    Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0  padded
            tokens1  non-padded
            tokens2  padded
            tokens3  non-padded
        instances namespaces
            tokens0  padded
            tokens1  non-padded
            tokens4  padded
            tokens5  non-padded

    Typical extension example (for the tokens1 namespace):
        -> original_vocab index2token
               #0->apple
               #1->bat
               #2->cat
        -> tokens to be extended with: cat, an, apple, banana, atom, bat
        -> extended_vocab index2token
               #0->apple
               #1->bat
               #2->cat
               #3->an
               #4->atom
               #5->banana
    '''
    vocab_dir = self.TEST_DIR / 'vocab_save'

    original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
    original_vocab.add_token_to_namespace("apple", namespace="tokens0")  # index:2
    original_vocab.add_token_to_namespace("bat", namespace="tokens0")    # index:3
    original_vocab.add_token_to_namespace("cat", namespace="tokens0")    # index:4

    original_vocab.add_token_to_namespace("apple", namespace="tokens1")  # index:0
    original_vocab.add_token_to_namespace("bat", namespace="tokens1")    # index:1
    original_vocab.add_token_to_namespace("cat", namespace="tokens1")    # index:2

    original_vocab.add_token_to_namespace("a", namespace="tokens2")  # index:0
    original_vocab.add_token_to_namespace("b", namespace="tokens2")  # index:1
    original_vocab.add_token_to_namespace("c", namespace="tokens2")  # index:2

    original_vocab.add_token_to_namespace("p", namespace="tokens3")  # index:0
    original_vocab.add_token_to_namespace("q", namespace="tokens3")  # index:1

    original_vocab.save_to_files(vocab_dir)

    text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens0": SingleIdTokenIndexer("tokens0")})
    text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                            {"tokens4": SingleIdTokenIndexer("tokens4")})
    text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                            {"tokens5": SingleIdTokenIndexer("tokens5")})
    instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                 "text4": text_field4, "text5": text_field5})])

    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens5"]})
    extended_vocab = Vocabulary.from_params(params, instances)

    # namespaces: tokens0 and tokens1 are common; tokens2 and tokens3 are only
    # in the vocab; tokens4 and tokens5 are only in the instances
    extended_namespaces = {*extended_vocab._token_to_index}
    assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

    # Check that the _non_padded_namespaces set is consistent after extension
    assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

    # original_vocab["tokens1"] has 3 tokens; the "tokens1" namespace of the
    # instances has 5 tokens, 2 of which overlap
    assert extended_vocab.get_vocab_size("tokens1") == 6
    assert extended_vocab.get_vocab_size("tokens0") == 8  # same, plus padding + OOV because padded

    # namespaces tokens2 and tokens3 were only in original_vocab,
    # so their token counts should be unchanged in extended_vocab
    assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
    assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

    # namespaces tokens4 and tokens5 were only in the instances
    assert extended_vocab.get_vocab_size("tokens4") == 6  # l, m, n, o + oov + padding
    assert extended_vocab.get_vocab_size("tokens5") == 3  # x, y, z

    # The word-to-index mapping of all words in all namespaces of original_vocab
    # should be preserved in extended_vocab
    for namespace, token2index in original_vocab._token_to_index.items():
        for token, _ in token2index.items():
            vocab_index = original_vocab.get_token_index(token, namespace)
            extended_vocab_index = extended_vocab.get_token_index(token, namespace)
            assert vocab_index == extended_vocab_index

    # And the same for the index-to-word mapping
    for namespace, index2token in original_vocab._index_to_token.items():
        for index, _ in index2token.items():
            vocab_token = original_vocab.get_token_from_index(index, namespace)
            extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
            assert vocab_token == extended_vocab_token
def __init__(self, tokenizer=None, token_indexers=None, lazy=False):
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy=False)
    self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
def main():
    args = parse_args()
    params = Params.from_file(args.params)
    save_dir = Path(args.save)
    save_dir.mkdir(parents=True)
    params.to_file(save_dir / 'params.json')

    train_params, model_params = params.pop('train'), params.pop('model')

    random_seed = train_params.pop_int('random_seed', 2019)
    torch.manual_seed(random_seed)
    random.seed(random_seed)

    log_filename = save_dir / 'stdout.log'
    sys.stdout = TeeLogger(filename=log_filename,
                           terminal=sys.stdout,
                           file_friendly_terminal_output=False)
    sys.stderr = TeeLogger(filename=log_filename,
                           terminal=sys.stderr,
                           file_friendly_terminal_output=False)

    tokenizer = WordTokenizer(start_tokens=['<s>'], end_tokens=['</s>'])
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    dataset_reader = SnliReader(tokenizer=tokenizer,
                                token_indexers={'tokens': token_indexer})

    train_labeled_dataset_path = train_params.pop('train_labeled_dataset_path')
    train_unlabeled_dataset_path = train_params.pop('train_unlabeled_dataset_path', None)
    train_labeled_dataset = dataset_reader.read(train_labeled_dataset_path)
    train_labeled_dataset = filter_dataset_by_length(dataset=train_labeled_dataset,
                                                     max_length=30)
    if train_unlabeled_dataset_path is not None:
        train_unlabeled_dataset = dataset_reader.read(train_unlabeled_dataset_path)
        train_unlabeled_dataset = filter_dataset_by_length(dataset=train_unlabeled_dataset,
                                                           max_length=30)
    else:
        train_unlabeled_dataset = []
    valid_dataset = dataset_reader.read(train_params.pop('valid_dataset_path'))

    vocab = Vocabulary.from_instances(
        instances=train_labeled_dataset + train_unlabeled_dataset,
        max_vocab_size=train_params.pop_int('max_vocab_size', None))
    vocab.save_to_files(save_dir / 'vocab')

    labeled_batch_size = train_params.pop_int('labeled_batch_size')
    unlabeled_batch_size = train_params.pop_int('unlabeled_batch_size')
    labeled_iterator = BasicIterator(batch_size=labeled_batch_size)
    unlabeled_iterator = BasicIterator(batch_size=unlabeled_batch_size)
    labeled_iterator.index_with(vocab)
    unlabeled_iterator.index_with(vocab)
    if not train_unlabeled_dataset:
        unlabeled_iterator = None

    model = SeparatedSNLIModel(params=model_params, vocab=vocab)
    optimizer = optim.Adam(params=model.parameters())
    summary_writer = SummaryWriter(log_dir=save_dir / 'log')

    trainer = SeparatedLVMTrainer(
        model=model,
        optimizer=optimizer,
        labeled_iterator=labeled_iterator,
        unlabeled_iterator=unlabeled_iterator,
        train_labeled_dataset=train_labeled_dataset,
        train_unlabeled_dataset=train_unlabeled_dataset,
        validation_dataset=valid_dataset,
        summary_writer=summary_writer,
        serialization_dir=save_dir,
        num_epochs=train_params.pop('num_epochs', 50),
        iters_per_epoch=len(train_labeled_dataset) // labeled_batch_size,
        write_summary_every=100,
        validate_every=2000,
        patience=2,
        clip_grad_max_norm=5,
        cuda_device=train_params.pop_int('cuda_device', 0))

    trainer.train()
def __init__(self):
    self.tokenizer = WordTokenizer()
    self.token_indexers = {"tokens": SingleIdTokenIndexer()}
def __init__(self, max_length: int):
    super().__init__()
    self.token_indexers = {'tokens': SingleIdTokenIndexer()}
    self.max_length = max_length
        return Instance(fields)

    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        df = pd.read_csv(file_path)
        if config.testing:
            df = df.head(1000)
        for i, row in df.iterrows():
            yield self.text_to_instance(
                [Token(x) for x in self.tokenizer(row["comment_text"])],
                row["id"],
                row[label_cols].values,
            )


token_indexer = SingleIdTokenIndexer()


def tokenizer(x: str):
    return [w.text for w in
            SpacyWordSplitter(language='en_core_web_sm', pos_tags=False)
            .split_words(x)[:config.max_seq_len]]


reader = JigsawDatasetReader(tokenizer=tokenizer,
                             token_indexers={"tokens": token_indexer})

train_ds, test_ds = (reader.read(DATA_ROOT / fname)
                     for fname in ["train.csv", "test_proced.csv"])
val_ds = None
def test_printing_doesnt_crash(self):
    field = TextField([Token(t) for t in ["A", "sentence"]],
                      {"words": SingleIdTokenIndexer(namespace="words")})
    print(field)
def main():
    token_indexer = SingleIdTokenIndexer()
    reader = JigsawDatasetReader(
        tokenizer=custom_tokenizer(),
        token_indexers={"tokens": token_indexer},
    )

    # Kaggle's multi-label "Toxic Comment Classification Challenge"
    dataset_root = Path('../../data/jigsaw')
    train_dataset, dev_dataset = (reader.read(dataset_root / fname)
                                  for fname in ["train.csv", "test_proced.csv"])
    print(f"total train samples: {len(train_dataset)}, dev samples: {len(dev_dataset)}")

    # Build the vocabulary from the datasets
    vocab = Vocabulary.from_instances(train_dataset + dev_dataset)
    vocab_dim = vocab.get_vocab_size('tokens')
    print("vocab: ", vocab.get_vocab_size('labels'), vocab_dim)

    # Build the network; here it is an LSTM followed by a linear layer
    embedding_dim = 300
    hidden_dim = 128
    token_embedding = Embedding(num_embeddings=vocab_dim, embedding_dim=embedding_dim)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True))
    model = MultiLabelClassifier(word_embeddings, 0.5, encoder, 0.2, len(label_cols), vocab)

    # allennlp does not currently seem to support single-machine multi-GPU
    # training, or its support performs poorly
    gpu_id = 0 if torch.cuda.is_available() else -1
    if gpu_id > -1:
        model.cuda(gpu_id)

    # Build the iterator and index it with the vocab
    iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    # --------------------- forward demo ----------------------
    # generator = iter(iterator(train_dataset, shuffle=True))
    # for _ in range(5):
    #     batch = next(generator)
    #     # [batch, sentence_len, token_len]
    #     print('---\nbatch ', batch.keys(), batch['tokens'].keys(),
    #           batch['tokens']['tokens'].shape, batch['label'].shape)
    #     batch = move_to_device(batch, gpu_id)
    #     tokens = batch['tokens']
    #
    #     # option 1. forward one step at a time
    #     mask = get_text_field_mask(tokens)
    #     embeddings = model.word_embeddings(tokens)
    #     print("embeddings: ", embeddings.shape)
    #     state = model.encoder(embeddings, mask)
    #     class_logits = model.linear(state)
    #     print("lstm state: ", state.shape, class_logits.shape)
    #
    #     # option 2. forward on the whole model
    #     y = model(**batch)
    #     metric = model.get_metrics()
    #     print("model out: ", y, '\n', metric)

    # --------------------- train ---------------------
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_dataset,
        validation_dataset=dev_dataset,
        # serialization_dir="./models/",
        cuda_device=gpu_id,
        patience=10,
        num_epochs=20)
    trainer.train()
def build_tasks(args):
    '''Main logic for preparing tasks:

    1) create / load the tasks
    2) build / load the vocabulary
    3) build / load the word vectors
    4) index each task's data
    5) initialize lazy loaders (streaming iterators)
    '''
    # 1) create / load tasks
    tasks, train_task_names, eval_task_names = \
        get_tasks(parse_task_list_arg(args.train_tasks),
                  parse_task_list_arg(args.eval_tasks),
                  args.max_seq_len,
                  path=args.data_dir,
                  scratch_path=args.exp_dir,
                  load_pkl=bool(not args.reload_tasks),
                  nli_prob_probe_path=args['nli-prob'].probe_path,
                  max_targ_v_size=args.max_targ_word_v_size)
    for task in tasks:
        task_classifier = config.get_task_attr(args, task.name, "use_classifier")
        setattr(task, "_classifier_name",
                task_classifier if task_classifier else task.name)

    # 2) build / load vocab and indexers
    indexers = {}
    if not args.word_embs == 'none':
        indexers["words"] = SingleIdTokenIndexer()
    if args.elmo:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.openai_transformer:
        assert not indexers, ("OpenAI transformer is not supported alongside"
                              " other indexers due to tokenization!")
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer("openai_bpe")
        # Exit if any tasks are not compatible with this tokenization.
        for task in tasks:
            assert task.tokenizer_name == "OpenAI.BPE", \
                (f"Task '{task.name:s}' not compatible with OpenAI "
                 "Transformer model. For edge probing, use -openai versions.")

    vocab_path = os.path.join(args.exp_dir, 'vocab')
    if args.reload_vocab or not os.path.exists(vocab_path):
        _build_vocab(args, tasks, vocab_path)

    # Always load vocab from file.
    vocab = Vocabulary.from_files(vocab_path)
    log.info("\tLoaded vocab from %s", vocab_path)

    for namespace, mapping in vocab._index_to_token.items():
        log.info("\tVocab namespace %s: size %d", namespace, len(mapping))
    log.info("\tFinished building vocab.")
    args.max_word_v_size = vocab.get_vocab_size('tokens')
    args.max_char_v_size = vocab.get_vocab_size('chars')

    # 3) build / load word vectors
    word_embs = None
    if args.word_embs != 'none':
        emb_file = os.path.join(args.exp_dir, 'embs.pkl')
        if args.reload_vocab or not os.path.exists(emb_file):
            word_embs = _build_embeddings(args, vocab, emb_file)
        else:  # load from file
            word_embs = pkl.load(open(emb_file, 'rb'))
        log.info("Trimmed word embeddings: %s", str(word_embs.size()))

    # 4) index tasks using the vocab (if a preprocessed copy is not available)
    preproc_dir = os.path.join(args.exp_dir, "preproc")
    utils.maybe_make_dir(preproc_dir)
    reindex_tasks = parse_task_list_arg(args.reindex_tasks)
    utils.assert_for_log(
        not (args.reload_indexing and not reindex_tasks),
        "Flag reload_indexing was set, but no tasks are set to reindex "
        "(use -o \"args.reindex_tasks = \"task1,task2,...\"\")")
    for task in tasks:
        force_reindex = (args.reload_indexing and task.name in reindex_tasks)
        for split in ALL_SPLITS:
            log_prefix = "\tTask '%s', split '%s'" % (task.name, split)
            relative_path = _get_serialized_record_path(task.name, split, "preproc")
            cache_found = _find_cached_file(args.exp_dir, args.global_ro_exp_dir,
                                            relative_path, log_prefix=log_prefix)
            if force_reindex or not cache_found:
                # Re-index from scratch.
                record_file = _get_serialized_record_path(task.name, split, preproc_dir)
                if os.path.exists(record_file) and os.path.islink(record_file):
                    os.remove(record_file)
                _index_split(task, split, indexers, vocab, record_file)

        # Delete in-memory data - we'll lazy-load from disk later.
        # TODO: delete task.{split}_data_text as well?
        task.train_data = None
        task.val_data = None
        task.test_data = None
        log.info("\tTask '%s': cleared in-memory data.", task.name)
    log.info("\tFinished indexing tasks")

    # 5) initialize tasks with data iterators
    assert not (args.training_data_fraction < 1 and args.eval_data_fraction < 1), \
        "training_data_fraction and eval_data_fraction cannot both be < 1 at the same time"
    train_tasks = []
    eval_tasks = []
    for task in tasks:
        # Replace lists of instances with lazy generators from disk.
        task.val_data = _get_instance_generator(task.name, "val", preproc_dir)
        task.test_data = _get_instance_generator(task.name, "test", preproc_dir)
        # When using training_data_fraction, we need modified iterators for use
        # only on training datasets at pretraining time.
        if args.training_data_fraction < 1 and task.name in train_task_names:
            log.info("Creating trimmed pretraining-only version of " + task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name, "train", preproc_dir, fraction=args.training_data_fraction)
            train_tasks.append(task)
            if task.name in eval_task_names:
                # Rebuild the iterator so we see the full dataset in the eval
                # training phase. This creates a deepcopy of the task object,
                # so there can be two tasks with the same name (task.name).
                log.info("Creating un-trimmed eval training version of " + task.name + " train.")
                log.warn("Using the un-trimmed eval training version of the train "
                         "split creates a deepcopy of the task object, which is "
                         "inefficient.")
                task = copy.deepcopy(task)
                task.train_data = _get_instance_generator(task.name, "train",
                                                          preproc_dir, fraction=1.0)
                eval_tasks.append(task)
        # When using eval_data_fraction, we need modified iterators
        # only for training datasets at train_for_eval time.
        elif args.eval_data_fraction < 1 and task.name in eval_task_names:
            log.info("Creating trimmed train-for-eval-only version of " + task.name + " train.")
            task.train_data = _get_instance_generator(
                task.name, "train", preproc_dir, fraction=args.eval_data_fraction)
            eval_tasks.append(task)
            if task.name in train_task_names:
                # Rebuild the iterator so we see the full dataset in the
                # pretraining phase. This creates a deepcopy of the task object,
                # so there can be two tasks with the same name (task.name).
                log.info("Creating un-trimmed pretraining version of " + task.name + " train.")
                log.warn("Using the un-trimmed pretraining version of the train "
                         "split creates a deepcopy of the task object, which is "
                         "inefficient.")
                task = copy.deepcopy(task)
                task.train_data = _get_instance_generator(task.name, "train",
                                                          preproc_dir, fraction=1.0)
                train_tasks.append(task)
        # When neither eval_data_fraction nor training_data_fraction is
        # specified, we use unmodified iterators.
        else:
            task.train_data = _get_instance_generator(task.name, "train",
                                                      preproc_dir, fraction=1.0)
            if task.name in train_task_names:
                train_tasks.append(task)
            if task.name in eval_task_names:
                eval_tasks.append(task)
        log.info("\tLazy-loading indexed data for task='%s' from %s",
                 task.name, preproc_dir)
    log.info("All tasks initialized with data iterators.")

    log.info('\t Training on %s', ', '.join(train_task_names))
    log.info('\t Evaluating on %s', ', '.join(eval_task_names))
    return train_tasks, eval_tasks, vocab, word_embs
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy=False)
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._tokenizer = WordTokenizer()
def __init__(self, token_indexers=None, lazy=False):
    super(CcgBankDatasetReader, self).__init__(lazy=lazy)
    self._token_indexers = token_indexers or {u'tokens': SingleIdTokenIndexer()}
"""
brief

Authors: panxu([email protected])
Date:    2018/12/18 09:39:00
"""
from allennlp.data.fields import TextField
from allennlp.data import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data import Token

words = ["All", "the", "cool", "kids", "use", "character", "embeddings", "."]
sentence1 = TextField(
    tokens=[Token(w) for w in words],
    token_indexers={
        'tokens': SingleIdTokenIndexer(namespace='token_ids'),
        'characters': TokenCharactersIndexer(namespace='token_characters')
    })

words2 = ["I", "prefer", "word2vec", "though", "..."]
sentence2 = TextField(
    tokens=[Token(w) for w in words2],
    token_indexers={
        'tokens': SingleIdTokenIndexer(namespace='token_ids'),
        'characters': TokenCharactersIndexer(namespace='token_characters')
    })

instance1 = Instance({'sentence': sentence1})
instance2 = Instance({'sentence': sentence2})
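# A small follow-on sketch (not in the original snippet): build a Vocabulary
# from the two instances and index them, so each TextField holds ids in both
# the 'token_ids' and 'token_characters' namespaces.
from allennlp.data import Vocabulary

vocab = Vocabulary.from_instances([instance1, instance2])
instance1.index_fields(vocab)
instance2.index_fields(vocab)
print(vocab.get_vocab_size('token_ids'), vocab.get_vocab_size('token_characters'))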
from allennlp.data import Batch, Instance, Token, Vocabulary
from allennlp.data.dataset_readers.dataset_utils.span_utils import enumerate_spans
from allennlp.data.fields import TextField, ListField, SpanField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.modules.span_extractors import EndpointSpanExtractor
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding

# Create an instance with multiple spans
tokens = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas', '.']
tokens = [Token(token) for token in tokens]
token_indexers = {'tokens': SingleIdTokenIndexer()}
text_field = TextField(tokens, token_indexers=token_indexers)

spans = [(2, 3), (5, 6)]  # ('an', 'elephant') and ('my', 'pajamas')
span_fields = ListField([SpanField(start, end, text_field) for start, end in spans])

instance = Instance({'tokens': text_field, 'spans': span_fields})

# Alternatively, you can also enumerate all spans
spans = enumerate_spans(tokens, max_span_width=3)
print('all spans up to length 3:')
print(spans)


def filter_function(span_tokens):
    return not any(t == Token('.') for t in span_tokens)


spans = enumerate_spans(tokens, max_span_width=3, filter_function=filter_function)
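# A hedged sketch of how the remaining imports above (Batch, Vocabulary,
# BasicTextFieldEmbedder, Embedding, EndpointSpanExtractor) would typically
# be wired together; the embedding dimension here is illustrative, not from
# the original snippet.
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
batch = Batch([instance])
tensors = batch.as_tensor_dict(batch.get_padding_lengths())

embedder = BasicTextFieldEmbedder(
    {'tokens': Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=8)})
embedded = embedder(tensors['tokens'])              # shape: (1, num_tokens, 8)
extractor = EndpointSpanExtractor(input_dim=8)      # default combination "x,y"
span_reprs = extractor(embedded, tensors['spans'])  # shape: (1, num_spans, 16)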
def test_get_padding_lengths_raises_if_no_indexed_tokens(self):
    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"words": SingleIdTokenIndexer("words")})
    with pytest.raises(ConfigurationError):
        field.get_padding_lengths()
            target_field = TextField(tokenized_target, self._target_token_indexers)
            return Instance({
                'source_tokens': source_field,
                'target_tokens': target_field
            })
        else:
            return Instance({'source_tokens': source_field})

    @overrides
    def apply_token_indexers(self, instance: Instance) -> None:
        instance.fields['source_tokens']._token_indexers = self.source_token_indexers
        # target_tokens is absent when text_to_instance built a source-only instance
        if 'target_tokens' in instance.fields:
            instance.fields['target_tokens']._token_indexers = self._target_token_indexers


if __name__ == '__main__':
    dataset_reader = Seq2SeqDatasetReader(
        source_token_indexers={
            "tokens": SingleIdTokenIndexer(namespace='source_tokens')
        },
        target_token_indexers={
            "tokens": SingleIdTokenIndexer(namespace='target_tokens')
        })
    instances = list(dataset_reader.read('./data/reverse/train.csv'))
    print(instances[0])
def setUp(self):
    super(TestSpanField, self).setUp()
    self.text = TextField(
        [Token(t) for t in [u"here", u"is", u"a", u"sentence", u"for", u"spans", u"."]],
        {u"words": SingleIdTokenIndexer(u"words")})
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into subsentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    # train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')  # original source
    train_data = reader.read('train.txt')  # local
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    # dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')  # original source
    dev_data = reader.read('dev.txt')  # local
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300)
        word_embedding_dim = 300
    # Load word2vec vectors
    elif EMBEDDING_TYPE == "w2v":
        # embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"  # original source
        embedding_path = "crawl-300d-2M.vec.zip"  # local
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (it has been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train the model from scratch and save its weights
    else:
        iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_data,
                          validation_dataset=dev_data,
                          num_epochs=5,
                          patience=1,
                          cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # rnn cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t.
    # the word embeddings. We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # also save the word embedding matrix

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build k-d Tree if you are using the gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(targeted_dev_data, num_epochs=5, shuffle=True),
                                group_size=1):
        # get accuracy with current triggers
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train()  # rnn cannot do backwards in eval mode

        # get gradient w.r.t. trigger embeddings for current batch
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

        # pass the gradients to a particular attack to generate token candidates for each token.
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40,
                                                        increase_loss=True)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)

        # Tries all of the candidates and returns the trigger sequence with highest loss.
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
def setUp(self):
    super(TestSequenceLabelField, self).setUp()
    self.text = TextField([Token(t) for t in ["here", "are", "some", "words", "."]],
                          {"words": SingleIdTokenIndexer("words")})
def setUp(self):
    super(BidirectionalAttentionFlowTest, self).setUp()
    constants.GLOVE_PATH = 'tests/fixtures/glove.6B.100d.sample.txt.gz'
    reader_params = Params({
        'token_indexers': {
            'tokens': {'type': 'single_id'},
            'token_characters': {'type': 'characters'}
        }
    })
    dataset = SquadReader.from_params(reader_params).read('tests/fixtures/data/squad.json')
    vocab = Vocabulary.from_dataset(dataset)
    self.vocab = vocab
    dataset.index_instances(vocab)
    self.dataset = dataset
    self.token_indexers = {'tokens': SingleIdTokenIndexer(),
                           'token_characters': TokenCharactersIndexer()}

    self.model = BidirectionalAttentionFlow.from_params(self.vocab, Params({}))

    small_params = Params({
        'text_field_embedder': {
            'tokens': {
                'type': 'embedding',
                'pretrained_file': constants.GLOVE_PATH,
                'trainable': False,
                'projection_dim': 4
            },
            'token_characters': {
                'type': 'character_encoding',
                'embedding': {'embedding_dim': 8},
                'encoder': {
                    'type': 'cnn',
                    'embedding_dim': 8,
                    'num_filters': 4,
                    'ngram_filter_sizes': [5]
                }
            }
        },
        'phrase_layer': {
            'type': 'lstm',
            'bidirectional': True,
            'input_size': 8,
            'hidden_size': 4,
            'num_layers': 1,
        },
        'similarity_function': {
            'type': 'linear',
            'combination': 'x,y,x*y',
            'tensor_1_dim': 8,
            'tensor_2_dim': 8
        },
        'modeling_layer': {
            'type': 'lstm',
            'bidirectional': True,
            'input_size': 32,
            'hidden_size': 4,
            'num_layers': 1,
        },
        'span_end_encoder': {
            'type': 'lstm',
            'bidirectional': True,
            'input_size': 56,
            'hidden_size': 4,
            'num_layers': 1,
        },
    })
    self.small_model = BidirectionalAttentionFlow.from_params(self.vocab, small_params)
def main():
    parser = argparse.ArgumentParser(description='Evidence Inference experiments')
    parser.add_argument('--cuda_device', type=int, default=0,
                        help='GPU number (default: 0)')
    parser.add_argument('--epochs', type=int, default=2,
                        help='upper epoch limit (default: 2)')
    parser.add_argument('--patience', type=int, default=1,
                        help='trainer patience (default: 1)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size (default: 32)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout for the model (default: 0.2)')
    parser.add_argument('--emb_size', type=int, default=256,
                        help='elmo embeddings size (default: 256)')
    parser.add_argument('--model_name', type=str, default='baseline',
                        help='model name (default: baseline)')
    parser.add_argument('--tunable', action='store_true',
                        help='tune the underlying embedding model (default: False)')
    args = parser.parse_args()

    annotations = pd.read_csv('data/data/annotations_merged.csv')
    prompts = pd.read_csv('data/data/prompts_merged.csv')

    feature_dictionary = {}
    prompts_dictionary = {}
    for index, row in prompts.iterrows():
        prompts_dictionary[row['PromptID']] = [row['Outcome'], row['Intervention'],
                                               row['Comparator']]

    for index, row in annotations.iterrows():
        if row['PMCID'] not in feature_dictionary:
            feature_dictionary[row['PMCID']] = []
        feature_dictionary[row['PMCID']].append([row['Annotations'], row['Label']]
                                                + prompts_dictionary[row['PromptID']])

    train = []
    valid = []
    test = []
    with open('data/splits/train_article_ids.txt') as train_file:
        for line in train_file:
            train.append(int(line.strip()))
    with open('data/splits/validation_article_ids.txt') as valid_file:
        for line in valid_file:
            valid.append(int(line.strip()))
    with open('data/splits/test_article_ids.txt') as test_file:
        for line in test_file:
            test.append(int(line.strip()))

    elmo_token_indexer = {'elmo': ELMoTokenCharactersIndexer(),
                          'tokens': SingleIdTokenIndexer()}
    reader = EIDatasetReader(elmo_token_indexer, feature_dictionary)
    train_data = reader.read(train)
    valid_data = reader.read(valid)
    test_data = reader.read(test)

    vocab = Vocabulary.from_instances(train_data + valid_data + test_data)

    urls = [
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/'
        'elmo_2x4096_512_2048cnn_2xhighway_options.json',
        'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/'
        'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
    ]

    elmo_token_embedding = ElmoTokenEmbedder(urls[0], urls[1],
                                             dropout=args.dropout,
                                             requires_grad=args.tunable,
                                             projection_dim=args.emb_size)

    word_embeddings = BasicTextFieldEmbedder({'elmo': elmo_token_embedding},
                                             allow_unmatched_keys=True)

    model = Baseline(word_embeddings, vocab)

    cuda_device = args.cuda_device
    if torch.cuda.is_available():
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    iterator = BucketIterator(batch_size=args.batch_size,
                              sorting_keys=[('article', 'num_tokens')],
                              padding_noise=0.1)
    iterator.index_with(vocab)

    serialization_dir = 'model_checkpoints/' + args.model_name
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=test_data,
                      patience=args.patience,
                      validation_metric='+accuracy',
                      num_epochs=args.epochs,
                      cuda_device=cuda_device,
                      serialization_dir=serialization_dir)

    result = trainer.train()
    for key in result:
        print(str(key) + ': ' + str(result[key]))

    test_metrics = evaluate(trainer.model, test_data, iterator,
                            cuda_device=cuda_device,
                            batch_weight_key="")

    print('Test Data statistics:')
    for key, value in test_metrics.items():
        print(str(key) + ': ' + str(value))
def _get_default_indexer() -> SingleIdTokenIndexer:
    return SingleIdTokenIndexer(namespace='tokens',
                                start_tokens=[START_SYMBOL],
                                end_tokens=[END_SYMBOL])
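# A hedged usage sketch (assuming the pre-1.0 AllenNLP API, and that Vocabulary
# and Token are imported in this module): with start/end symbols configured,
# tokens_to_indices brackets every sequence it indexes, so START_SYMBOL and
# END_SYMBOL must be present in the indexer's vocabulary namespace.
vocab = Vocabulary()
vocab.add_token_to_namespace(START_SYMBOL, namespace='tokens')
vocab.add_token_to_namespace(END_SYMBOL, namespace='tokens')
vocab.add_token_to_namespace('hi', namespace='tokens')
indexer = _get_default_indexer()
ids = indexer.tokens_to_indices([Token('hi')], vocab, 'tokens')
# ids['tokens'] == [index of START_SYMBOL, index of 'hi', index of END_SYMBOL]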
def test_valid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    extension_ways = ["from_params", "extend_from_instances"]

    # Test: padded/non-padded common namespaces are extended appropriately
    non_padded_namespaces_list = [[], ["tokens"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("d", namespace="tokens")
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.add_token_to_namespace("b", namespace="tokens")
        text_field = TextField([Token(t) for t in ["a", "d", "c", "e"]],
                               {"tokens": SingleIdTokenIndexer("tokens")})
        instances = Batch([Instance({"text": text_field})])
        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir,
                                 "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            extra_count = 2 if extended_vocab.is_padded("tokens") else 0
            assert extended_vocab.get_token_index("d", "tokens") == 0 + extra_count
            assert extended_vocab.get_token_index("a", "tokens") == 1 + extra_count
            assert extended_vocab.get_token_index("b", "tokens") == 2 + extra_count

            assert extended_vocab.get_token_index("c", "tokens")  # should be present
            assert extended_vocab.get_token_index("e", "tokens")  # should be present

            assert extended_vocab.get_vocab_size("tokens") == 5 + extra_count

    # Test: padded/non-padded non-common namespaces are extended appropriately
    non_padded_namespaces_list = [[], ["tokens1"], ["tokens1", "tokens2"]]
    for non_padded_namespaces in non_padded_namespaces_list:
        original_vocab = Vocabulary(non_padded_namespaces=non_padded_namespaces)
        original_vocab.add_token_to_namespace("a", namespace="tokens1")  # index:2
        text_field = TextField([Token(t) for t in ["b"]],
                               {"tokens2": SingleIdTokenIndexer("tokens2")})
        instances = Batch([Instance({"text": text_field})])

        for way in extension_ways:
            if way == "extend_from_instances":
                extended_vocab = copy.copy(original_vocab)
                params = Params({"non_padded_namespaces": non_padded_namespaces})
                extended_vocab.extend_from_instances(params, instances)
            else:
                shutil.rmtree(vocab_dir, ignore_errors=True)
                original_vocab.save_to_files(vocab_dir)
                params = Params({"directory_path": vocab_dir,
                                 "extend": True,
                                 "non_padded_namespaces": non_padded_namespaces})
                extended_vocab = Vocabulary.from_params(params, instances)

            # Should have two namespaces
            assert len(extended_vocab._token_to_index) == 2
            extra_count = 2 if extended_vocab.is_padded("tokens1") else 0
            assert extended_vocab.get_vocab_size("tokens1") == 1 + extra_count
            extra_count = 2 if extended_vocab.is_padded("tokens2") else 0
            assert extended_vocab.get_vocab_size("tokens2") == 1 + extra_count
def setUp(self):
    super(TestTagField, self).setUp()
    self.text = TextField([Token(t) for t in ["here", "are", "some", "words", "."]],
                          {"words": SingleIdTokenIndexer("words")})
def train(train, validation, optimizer_name):
    batch_size = 32
    learning_rate = 0.01
    max_iterations = 100

    token_indexer = {
        "tokens": SingleIdTokenIndexer(),
        "token_characters": TokenCharactersIndexer(min_padding_length=3),
    }
    reader = Conll2003DatasetReader(token_indexer)
    train_dataset = reader.read(train)
    validation_dataset = reader.read(validation)

    # Once we've read in the datasets, we use them to create our Vocabulary
    # (that is, the mapping[s] from tokens / labels to ids).
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    # Set variables
    model = get_model(vocab)

    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    if optimizer_name == 'adahessian':
        optimizer = Adahessian(model.parameters(), lr=learning_rate, block_length=2)
    elif optimizer_name == 'ranger':
        optimizer = Ranger(model.parameters(), lr=learning_rate)
    else:
        raise AttributeError()

    train_dataset.index_with(vocab)
    validation_dataset.index_with(vocab)

    scheduler = ReduceOnPlateauLearningRateScheduler(optimizer,
                                                     factor=0.5,
                                                     patience=4,
                                                     mode="min",
                                                     verbose=True)

    dl = PyTorchDataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    dl_validation = PyTorchDataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

    trainer_model = AdaTrainer
    trainer = trainer_model(
        model=model,
        optimizer=optimizer,
        # iterator=iterator,
        grad_norm=10.0,
        data_loader=dl,
        validation_data_loader=dl_validation,
        learning_rate_scheduler=scheduler,
        patience=8,
        num_epochs=max_iterations,
        cuda_device=cuda_device,
    )

    train_metrics = trainer.train()
    print(train_metrics)
def build_tasks(args):
    '''Prepare tasks'''

    def parse_tasks(task_list):
        '''Parse a string of tasks'''
        if task_list == 'all':
            tasks = ALL_TASKS
        elif task_list == 'none':
            tasks = []
        else:
            tasks = task_list.split(',')
        return tasks

    train_task_names = parse_tasks(args.train_tasks)
    eval_task_names = parse_tasks(args.eval_tasks)
    all_task_names = list(set(train_task_names + eval_task_names))
    tasks = get_tasks(all_task_names, args.max_seq_len, args.load_tasks)

    max_v_sizes = {'word': args.max_word_v_size}
    token_indexer = {}
    if args.elmo:
        token_indexer["elmo"] = ELMoTokenCharactersIndexer("elmo")
        if not args.elmo_no_glove:
            token_indexer["words"] = SingleIdTokenIndexer()
    else:
        token_indexer["words"] = SingleIdTokenIndexer()

    vocab_path = os.path.join(args.exp_dir, 'vocab')
    preproc_file = os.path.join(args.exp_dir, args.preproc_file)
    if args.load_preproc and os.path.exists(preproc_file):
        preproc = pkl.load(open(preproc_file, 'rb'))
        vocab = Vocabulary.from_files(vocab_path)
        word_embs = preproc['word_embs']
        for task in tasks:
            train, val, test = preproc[task.name]
            task.train_data = train
            task.val_data = val
            task.test_data = test
        log.info("\tFinished building vocab. Using %d words",
                 vocab.get_vocab_size('tokens'))
        log.info("\tLoaded data from %s", preproc_file)
    else:
        log.info("\tProcessing tasks from scratch")
        word2freq = get_words(tasks)
        vocab = get_vocab(word2freq, max_v_sizes)
        word_embs = get_embeddings(vocab, args.word_embs_file, args.d_word)
        preproc = {'word_embs': word_embs}
        for task in tasks:
            train, val, test = process_task(task, token_indexer, vocab)
            task.train_data = train
            task.val_data = val
            task.test_data = test
            del_field_tokens(task)
            preproc[task.name] = (train, val, test)
        log.info("\tFinished indexing tasks")
        pkl.dump(preproc, open(preproc_file, 'wb'))
        vocab.save_to_files(vocab_path)
        log.info("\tSaved data to %s", preproc_file)
        del word2freq
    del preproc

    train_tasks = [task for task in tasks if task.name in train_task_names]
    eval_tasks = [task for task in tasks if task.name in eval_task_names]
    log.info('\t Training on %s', ', '.join([task.name for task in train_tasks]))
    log.info('\t Evaluating on %s', ', '.join([task.name for task in eval_tasks]))
    return train_tasks, eval_tasks, vocab, word_embs