import argparse
import io

# read_embedding_file and Vocabulary are helpers provided elsewhere in this repository.

parser = argparse.ArgumentParser(
    description='Write the list of words in embeddings but not in dict vocabulary')
parser.add_argument('embeddings', type=str)
parser.add_argument('vocabulary', type=str)
parser.add_argument('vocabulary_counts', type=str)
parser.add_argument('absent_words', type=str)
args = parser.parse_args()

print("reading embeddings file {}".format(args.embeddings))
embeddings = read_embedding_file(args.embeddings)
print("reading vocabulary file {}".format(args.vocabulary))
vocabulary = Vocabulary(args.vocabulary)
print("reading vocabulary file {} for count estimation".format(args.vocabulary_counts))
vocabulary_counts = Vocabulary(args.vocabulary_counts)

vocabulary = set(vocabulary.words)  # faster lookup
absent_in_vocab = set(w for w in embeddings.keys() if w not in vocabulary)
print("Number of words absent from the vocabulary: {}".format(len(absent_in_vocab)))

# Write the absent words, most frequent first.
absent_in_vocab = sorted(absent_in_vocab,
                         key=lambda w: vocabulary_counts.word_freq(w),
                         reverse=True)
with io.open(args.absent_words, 'w', encoding='utf-8') as f:
    for w in absent_in_vocab:
        f.write(w + u'\n')
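The script above relies on two project helpers: read_embedding_file, expected to return a mapping from word to vector, and Vocabulary, expected to expose .words and .word_freq(). Below is a rough, hypothetical sketch of what read_embedding_file presumably does, assuming a word2vec-style text format; it is an illustration, not the repository's actual implementation.

import io

def read_embedding_file_sketch(path):
    """Illustrative only: parse 'word v1 v2 ...' lines into a {word: vector} dict."""
    embeddings = {}
    with io.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) < 3:
                # skip an optional "num_words dim" header line or malformed lines
                continue
            embeddings[parts[0]] = [float(x) for x in parts[1:]]
    return embeddings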
import argparse
import logging

import numpy as np

# Vocabulary is a helper provided elsewhere in this repository.


def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a dictionary")
    parser.add_argument("--target_coverage_text", type=float,
                        help="Target coverage of the text")
    parser.add_argument("--target_coverage_def", type=float,
                        help="Target coverage of the definitions")
    parser.add_argument("--vocab_text", type=str, help="Vocabulary of the text")
    parser.add_argument("--vocab_def", type=str, help="Vocabulary of the definitions")
    parser.add_argument("--step_size", type=int, default=30)
    parser.add_argument("--target", type=str, help="Final path")
    args = parser.parse_args()

    vocab_text = Vocabulary(args.vocab_text)
    vocab_def = Vocabulary(args.vocab_def)

    # The greedy solution is optimal. It is approximated slightly by adding words
    # in chunks of --step_size instead of one by one, which is fine because the
    # vocabularies are big.
    target_coverage_text = np.sum(vocab_text.frequencies) * args.target_coverage_text
    target_coverage_def = np.sum(vocab_def.frequencies) * args.target_coverage_def
    current_vocab = set()

    # A binary search over the shortlist size would work here as well.
    for id in range(vocab_def.size() // args.step_size):
        # Add the next chunk of the most frequent definition words.
        for id2 in range(args.step_size):
            current_vocab.add(vocab_def.id_to_word(id * args.step_size + id2))
        current_vocab_mod = set(current_vocab)

        current_coverage_def = 0.0
        current_coverage_text = 0.0
        for w in current_vocab_mod:
            current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(w)]
            current_coverage_text += vocab_text.frequencies[vocab_text.word_to_id(w)]

        # Top up with the most frequent text words until the text coverage target is met.
        id_text = 0
        while current_coverage_text < target_coverage_text:
            while vocab_text.id_to_word(id_text) in current_vocab_mod:
                id_text += 1
                if id_text >= vocab_text.size():
                    raise Exception("Perhaps try a lower target coverage")
            w = vocab_text.id_to_word(id_text)
            current_vocab_mod.add(w)
            current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(w)]
            current_coverage_text += vocab_text.frequencies[id_text]

        if current_coverage_def > target_coverage_def:
            current_vocab = current_vocab_mod
            break

        print("After adding {} words I covered {} of def and {} of text occurrences"
              .format(len(current_vocab_mod),
                      current_coverage_def / float(np.sum(vocab_def.frequencies)),
                      current_coverage_text / float(np.sum(vocab_text.frequencies))))

    # To be safe, recheck that the resulting shortlist gives the reported coverage.
    current_coverage_def = 0
    current_coverage_text = 0
    for w in current_vocab:
        current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(w)]
        current_coverage_text += vocab_text.frequencies[vocab_text.word_to_id(w)]
    print("Sanity check: after adding {} words I covered {} of def and {} of text occurrences"
          .format(len(current_vocab),
                  current_coverage_def / float(np.sum(vocab_def.frequencies)),
                  current_coverage_text / float(np.sum(vocab_text.frequencies))))

    vocab_result = Vocabulary.build(
        {word: vocab_text.word_freq(word) for word in current_vocab})
    vocab_result.save(args.target)
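To make the coverage bookkeeping above concrete, here is a small self-contained illustration with toy numbers (not project code) of how a target coverage fraction is turned into an absolute mass of word occurrences, and how greedily adding the most frequent words reaches it. It assumes, as the script does, that the frequency array is sorted by decreasing frequency.

import numpy as np

freqs = np.array([50, 30, 10, 5, 3, 2])      # toy counts, most frequent first
target_fraction = 0.9
target_mass = freqs.sum() * target_fraction  # 90 out of 100 occurrences

covered = 0.0
shortlist_size = 0
for f in freqs:
    if covered >= target_mass:
        break
    covered += f
    shortlist_size += 1

print(shortlist_size, covered / freqs.sum())  # -> 3 words cover 0.9 of occurrences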
import argparse
import collections
import json
import logging

import h5py
from six import text_type

# Vocabulary is a helper provided elsewhere in this repository.


def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a dictionary")
    parser.add_argument("--top-k", type=int,
                        help="Top most frequent words to keep")
    parser.add_argument("--vocab-text", default=None,
                        help="Vocab corresponding to the main text if the text is a dictionary.")
    parser.add_argument("--weight-dict-entries", action='store_true',
                        help="Weight dict entries according to the freqs from a vocab.")
    parser.add_argument("--exclude-top-k", type=int,
                        help="Ignore definitions of this many most frequent words")
    parser.add_argument("text",
                        help="The text to use. Can be a text file, an .h5 file, or a "
                             "dictionary in json format, in which case you need to use "
                             "--vocab-text as well.")
    parser.add_argument("vocab", help="Destination")
    args = parser.parse_args()

    text = []
    if args.vocab_text:
        # When building from definitions, accumulate (possibly weighted) counts
        # instead of a flat list of tokens.
        text = collections.defaultdict(int)
        vocab_text = Vocabulary(args.vocab_text)

    for f_name in args.text.split(","):
        logging.info("Processing " + f_name)
        if f_name.endswith('.h5'):
            with h5py.File(f_name, 'r') as h5_file:
                if 'text' not in h5_file.keys():
                    raise ValueError("Missing text field from " + f_name)
                text.extend(h5_file['text'][:])
        elif f_name.endswith('.json'):
            logging.info("Will build the vocabulary from definitions in a dictionary")
            dict_ = json.load(open(f_name, "r"))
            for word, list_defs in dict_.items():
                text_vocab_id = vocab_text.word_to_id(word)
                if (args.exclude_top_k
                        and text_vocab_id != vocab_text.unk
                        and text_vocab_id < args.exclude_top_k):
                    # Skip the definitions of the most frequent words.
                    continue
                for def_ in list_defs:
                    for def_word in def_:
                        if args.weight_dict_entries:
                            text[def_word] += vocab_text.word_freq(word)
                        else:
                            text[def_word] += 1
        else:
            with open(f_name) as file_:
                def data():
                    for line in file_:
                        for word in line.strip().split():
                            try:
                                # Decode raw tokens to unicode; skip undecodable ones.
                                yield text_type(word, 'utf-8')
                            except Exception:
                                print("Skipped word " + word)
                text.extend(data())

    logging.info("{} words".format(len(text)))
    vocab = Vocabulary.build(text, args.top_k)
    vocab.save(args.vocab)
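The json branch above implies a particular dictionary layout: each headword maps to a list of definitions, and each definition is a list of tokens. Below is a minimal made-up example of such a file; the words and the file name are hypothetical, purely for illustration.

import json

example_dict = {
    "cat": [["a", "small", "domesticated", "feline"]],
    "dog": [["a", "domesticated", "canine"],
            ["to", "follow", "someone", "persistently"]],
}
with open("dict_example.json", "w") as f:
    json.dump(example_dict, f)
# Running the script on dict_example.json (together with --vocab-text) would then
# count definition tokens such as "domesticated" once per definition they occur in.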