def test_squad_to_h5py_dataset():
    corenlp = None
    try:
        port = get_free_port()
        corenlp = start_corenlp(port)

        test_dir = tempfile.mkdtemp()
        json_path = os.path.join(test_dir, 'data.json')
        h5_path = os.path.join(test_dir, 'data.h5')
        with open(json_path, 'w') as json_file:
            print(TEST_SQUAD_RAW_DATA, file=json_file)
        squad_to_h5py_dataset(json_path, h5_path,
                              "http://localhost:{}".format(port))
        with h5py.File(h5_path, 'r') as h5_file:
            vocab = Vocabulary.build(h5_file['text'], top_k=100)
        add_words_ids_to_squad(h5_path, vocab)

        dataset = SQuADDataset(h5_path, ('all',))
        stream = dataset.get_example_stream()
        stream = dataset.apply_default_transformers(stream)
        example = next(stream.get_epoch_iterator(as_dict=True))
        answer_span = slice(example['answer_begins'][0],
                            example['answer_ends'][0])
        assert example['questions'].tolist() == map(vocab.word_to_id, [
            u'To', u'whom', u'did', u'the', u'Virgin', u'Mary', u'allegedly',
            u'appear', u'in', u'1858', u'in', u'Lourdes', u'France', u'?'])
        assert example['contexts'][answer_span].tolist() == map(
            vocab.word_to_id, [u'Saint', u'Bernadette', u'Soubirous'])
    finally:
        if corenlp and corenlp.returncode is None:
            corenlp.kill()
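# A condensed usage sketch of the pipeline exercised by the test above
# (hypothetical paths and CoreNLP port; only calls that appear in the test
# are used):
#
#   squad_to_h5py_dataset('data.json', 'data.h5', 'http://localhost:9000')
#   with h5py.File('data.h5', 'r') as h5_file:
#       vocab = Vocabulary.build(h5_file['text'], top_k=100)
#   add_words_ids_to_squad('data.h5', vocab)
#   dataset = SQuADDataset('data.h5', ('all',))
#   stream = dataset.apply_default_transformers(dataset.get_example_stream())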
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a vocabulary")
    parser.add_argument("--top-k", type=int,
                        help="Top most frequent words to leave")
    parser.add_argument("--keys-only", action='store_true',
                        help="Build vocab of all keys")
    parser.add_argument("--with-keys", action='store_true',
                        help="Count keys and words in definitions")
    parser.add_argument("dictionary", help="Input dictionary")
    parser.add_argument("vocabulary", help="Output vocabulary")
    args = parser.parse_args()

    text = []
    if args.dictionary.endswith('.json'):
        text = collections.defaultdict(int)
        for f_name in args.dictionary.split(","):
            logging.info("Processing " + f_name)
            assert f_name.endswith('.json')
            logging.info(
                "Will build the vocabulary from definitions in a dictionary")
            dict_ = json.load(open(f_name, "r"))
            for word, list_defs in dict_.items():
                if args.keys_only or args.with_keys:
                    text[word] += 1
                if not args.keys_only:
                    for def_ in list_defs:
                        for def_word in def_:
                            text[def_word] += 1

    logging.info("{} words".format(len(text)))
    vocab = Vocabulary.build(text, args.top_k)
    vocab.save(args.vocabulary)
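# Hypothetical invocations of the script above (the script name and file
# names are placeholders); the positionals are the input dictionary and the
# output vocabulary:
#
#   python build_dict_vocab.py --top-k 10000 dict.json vocab.txt
#   python build_dict_vocab.py --keys-only dict.json keys_vocab.txt
#   python build_dict_vocab.py --with-keys dict1.json,dict2.json vocab.txt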
def main():
    parser = argparse.ArgumentParser(
        "Generate synthetic data and write it to files")
    parser.add_argument("path", type=str,
                        help="Output directory for the generated data")
    parser.add_argument("n_primes", type=int, help="# of primes")
    parser.add_argument("n_non_primes", type=int, help="# of non-primes")
    parser.add_argument("features_size", type=int, help="Features size")
    parser.add_argument("markov_order", type=int, help="Markov order")
    parser.add_argument("n_sentences", type=int, help="# sentences")
    parser.add_argument("pc_train", type=float,
                        help="Fraction of training sentences")
    parser.add_argument("pc_valid", type=float,
                        help="Fraction of validation sentences")
    parser.add_argument("sample_temperature", type=float, default=1.0,
                        help="Sampling temperature")
    parser.add_argument("min_sentence_len", type=int, default=6)
    parser.add_argument("max_sentence_len", type=int, default=20)
    parser.add_argument("min_def_len", type=int, default=6)
    parser.add_argument("max_def_len", type=int, default=20)
    args = parser.parse_args()

    print("Number of sentences: {}".format(args.n_sentences))
    assert 0 < args.pc_train + args.pc_valid < 1
    assert not os.path.exists(args.path)
    os.makedirs(args.path)
    args.pc_test = 1 - (args.pc_train + args.pc_valid)

    gen = FakeTextGenerator(args.n_primes, args.n_non_primes,
                            args.features_size, args.markov_order,
                            args.sample_temperature, args.min_def_len,
                            args.max_def_len)
    data = gen.create_corpus(args.n_sentences, args.min_sentence_len,
                             args.max_sentence_len, args.pc_train,
                             args.pc_valid)
    train_data, valid_data, test_data = data

    concat_sentences = lambda sentences: [' '.join(s) for s in sentences]
    train_data = concat_sentences(train_data)
    test_data = concat_sentences(test_data)
    valid_data = concat_sentences(valid_data)

    all_data = train_data + valid_data + test_data
    with temporary_content_path('\n'.join(all_data)) as path:
        vocab = Vocabulary.build(path, sort_by='lexicographical')
        vocab.save(os.path.join(args.path, "vocab.txt"))

    dict_json = json.dumps(gen.dictionary)
    write_data(os.path.join(args.path, "dict.json"), dict_json)
    write_data(os.path.join(args.path, "train.txt"), '\n'.join(train_data))
    write_data(os.path.join(args.path, "valid.txt"), '\n'.join(valid_data))
    write_data(os.path.join(args.path, "test.txt"), '\n'.join(test_data))

    args_json = json.dumps(vars(args), indent=4, sort_keys=True)
    write_data(os.path.join(args.path, "params.json"), args_json)
    write_data(os.path.join(args.path, "generator.p"), pickle.dumps(gen))
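# Hypothetical invocation (script name and values are placeholders); the
# thirteen positionals follow the order declared above, and pc_train +
# pc_valid must stay below 1:
#
#   python generate_fake_data.py fake_data 50 200 10 2 10000 \
#       0.8 0.1 1.0 6 20 6 20
#
# This writes vocab.txt, dict.json, train.txt, valid.txt, test.txt,
# params.json and generator.p into the (previously non-existent) fake_data
# directory.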
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Builds a shortlist vocabulary covering both text and definitions")
    parser.add_argument("--target_coverage_text", type=float,
                        help="Target coverage of text")
    parser.add_argument("--target_coverage_def", type=float,
                        help="Target coverage of def")
    parser.add_argument("--vocab_text", type=str, help="Vocabulary of text")
    parser.add_argument("--vocab_def", type=str, help="Vocabulary of def")
    parser.add_argument("--step_size", type=int, default=30)
    parser.add_argument("--target", type=str, help="Final path")
    args = parser.parse_args()

    vocab_text = Vocabulary(args.vocab_text)
    vocab_def = Vocabulary(args.vocab_def)

    # The greedy solution is optimal. It is approximated slightly by adding
    # words in chunks of step_size; this is fine, the vocabs are big.
    target_coverage_text = np.sum(
        vocab_text.frequencies) * args.target_coverage_text
    target_coverage_def = np.sum(
        vocab_def.frequencies) * args.target_coverage_def
    current_vocab = set([])

    # Of course binary search could be used here instead.
    for id in range(vocab_def.size() // args.step_size):
        for id2 in range(args.step_size):
            current_vocab.add(vocab_def.id_to_word(id * args.step_size + id2))

        current_vocab_mod = set(current_vocab)
        current_coverage_def = 0.0
        current_coverage_text = 0.0
        for w in current_vocab_mod:
            current_coverage_def += vocab_def.frequencies[
                vocab_def.word_to_id(w)]
            current_coverage_text += vocab_text.frequencies[
                vocab_text.word_to_id(w)]

        id_text = 0
        while current_coverage_text < target_coverage_text:
            while vocab_text.id_to_word(id_text) in current_vocab_mod:
                id_text += 1
                if id_text >= vocab_text.size():
                    raise Exception("Perhaps try lower target coverage")
            w = vocab_text.id_to_word(id_text)
            current_vocab_mod.add(w)
            current_coverage_def += vocab_def.frequencies[
                vocab_def.word_to_id(w)]
            current_coverage_text += vocab_text.frequencies[id_text]

        if current_coverage_def > target_coverage_def:
            current_vocab = current_vocab_mod
            break

    print(
        "After adding {} words I covered {} of def and {} of text occurrences"
        .format(
            len(current_vocab_mod),
            current_coverage_def / float(np.sum(vocab_def.frequencies)),
            current_coverage_text / float(np.sum(vocab_text.frequencies))))

    # To be safe, recheck that the shortlist works
    current_coverage_def = 0
    current_coverage_text = 0
    for w in current_vocab:
        current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(w)]
        current_coverage_text += vocab_text.frequencies[
            vocab_text.word_to_id(w)]
    print(
        "Sanity check: after adding {} words I covered {} of def and {} of text occurrences"
        .format(
            len(current_vocab),
            current_coverage_def / float(np.sum(vocab_def.frequencies)),
            current_coverage_text / float(np.sum(vocab_text.frequencies))))

    vocab_result = Vocabulary.build(
        {word: vocab_text.word_freq(word) for word in current_vocab})
    vocab_result.save(args.target)
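# Illustrative sketch of the coverage criterion used above (not part of the
# original script; `coverage` is a hypothetical helper): the coverage of a
# word set w.r.t. a vocabulary is the summed frequency of its words divided
# by the vocabulary's total token count.
#
#   def coverage(words, vocab):
#       total = float(np.sum(vocab.frequencies))
#       return sum(vocab.frequencies[vocab.word_to_id(w)] for w in words) / total
#
# The outer loop stops once the shortlist reaches the text-coverage target
# and also exceeds the definition-coverage target.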
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a vocabulary")
    parser.add_argument("--top-k", type=int,
                        help="Top most frequent words to leave")
    parser.add_argument(
        "--vocab-text", default=None,
        help="Vocab corresponding to the main text if the text is a dictionary.")
    parser.add_argument(
        "--weight-dict-entries", action='store_true',
        help="Weight dict entries according to the freqs from a vocab.")
    parser.add_argument(
        "--exclude-top-k", type=int,
        help="Ignore definitions of a number of most frequent words")
    parser.add_argument(
        "text",
        help="The text to use. Can be a text file, an .h5 file, or a "
             "dictionary in .json format, in which case you need to use "
             "--vocab-text as well.")
    parser.add_argument("vocab", help="Destination")
    args = parser.parse_args()

    text = []
    if args.vocab_text:
        text = collections.defaultdict(int)
        vocab_text = Vocabulary(args.vocab_text)

    for f_name in args.text.split(","):
        logging.info("Processing " + f_name)
        if f_name.endswith('.h5'):
            with h5py.File(f_name, 'r') as h5_file:
                if 'text' not in h5_file.keys():
                    print("Missing text field from " + f_name)
                text.extend(h5_file['text'][:])
        elif f_name.endswith('.json'):
            logging.info(
                "Will build the vocabulary from definitions in a dictionary")
            dict_ = json.load(open(f_name, "r"))
            for word, list_defs in dict_.items():
                text_vocab_id = vocab_text.word_to_id(word)
                if (text_vocab_id != vocab_text.unk
                        and text_vocab_id < args.exclude_top_k):
                    continue
                for def_ in list_defs:
                    for def_word in def_:
                        if args.weight_dict_entries:
                            text[def_word] += vocab_text.word_freq(word)
                        else:
                            text[def_word] += 1
        else:
            with open(f_name) as file_:
                def data():
                    for line in file_:
                        for word in line.strip().split():
                            try:
                                yield text_type(word, 'utf-8')
                            except:
                                print("Skipped word " + word)
                text.extend(data())

    logging.info("{} words".format(len(text)))
    vocab = Vocabulary.build(text, args.top_k)
    vocab.save(args.vocab)
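# Hypothetical invocations of the script above (script and file names are
# placeholders):
#
#   # plain-text corpus
#   python build_vocab.py --top-k 50000 corpus.txt vocab.txt
#   # one or more tokenized .h5 files (comma-separated)
#   python build_vocab.py --top-k 50000 train.h5,dev.h5 vocab.txt
#   # dictionary definitions, weighted by the corpus frequency of each entry
#   python build_vocab.py --vocab-text vocab_text.txt --weight-dict-entries \
#       --exclude-top-k 10000 dict.json vocab_defs.txt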