def from_file(cls, file_path, embed_dim, vocab_size=None):
    """Load word embeddings from a whitespace-separated text file.

    Each line has the form: <word> <v1> <v2> ... <v_embed_dim>.

    Args:
        file_path (str): path to the embeddings file.
        embed_dim (int): expected dimensionality of every embedding vector.
        vocab_size (int): max # of words in the vocab. If not specified,
            uses all available vectors in file.

    Returns:
        an instance of cls wrapping (embed_matrix, vocab).

    Raises:
        ValueError: if a line's vector does not have embed_dim entries.
    """
    if vocab_size is None:
        vocab_size = num_lines(file_path)

    words = []
    embeds = []
    with codecs.open(file_path, 'r', encoding='utf-8') as f:
        lines = verboserate(
            f, desc='Loading embeddings from {}'.format(file_path),
            total=vocab_size)
        for i, line in enumerate(lines):
            if i == vocab_size:
                break
            tokens = line.split()
            word, embed = tokens[0], np.array(
                [float(tok) for tok in tokens[1:]], dtype=np.float32)
            if len(embed) != embed_dim:
                raise ValueError('expected {} dims, got {} dims'.format(
                    embed_dim, len(embed)))
            words.append(word)
            embeds.append(embed)

    vocab = SimpleVocab(words)
    embed_matrix = np.stack(embeds)
    embed_matrix = embed_matrix.astype(np.float32)
    # Check against the number of words actually read: the file may hold
    # fewer lines than the requested vocab_size, in which case comparing
    # against vocab_size (as before) would spuriously fail.
    assert embed_matrix.shape == (len(words), embed_dim)
    return cls(embed_matrix, vocab)
def examples_from_file(path, seq_length_limit):
    """Return list[EditExample] from file path.

    Each line of the file holds tab-separated columns; config.source_cols
    index the input sequences and config.target_col indexes the gold target.

    Args:
        path (str): path to the TSV data file.
        seq_length_limit (int): examples whose longest sequence exceeds this
            limit are skipped, because they use up memory.

    Returns:
        list[EditExample]: one example per well-formed line.
    """
    examples = []
    # count total lines before loading (gives the progress bar a total)
    total_lines = num_lines(path)
    with codecs.open(path, 'r', encoding='utf-8') as f:
        lnum = 0
        for line in verboserate(f, desc='Reading data file.', total=total_lines):
            split = line.strip().split('\t')
            lnum += 1
            input_words = []
            try:
                for c in config.source_cols:
                    input_words.append(split[c].split(' '))
                trg_words = split[config.target_col].split(' ')  # gold answer
                assert len(trg_words) > 0
                ex = EditExample(input_words, trg_words)
                # skip sequences that are too long, because they use up memory
                if max_seq_length(ex) > seq_length_limit:
                    continue
                examples.append(ex)
            # Was a bare `except:`, which also swallows KeyboardInterrupt and
            # SystemExit; Exception keeps the best-effort skip of bad lines
            # without hiding interrupts. print(...) form works on py2 and py3.
            except Exception:
                print('bad formatting in line ' + str(lnum))
                print(line)
    return examples
def file_rows(path, limit):
    """Yield up to `limit` tab-separated rows from `path`, skipping the header row."""
    with codecs.open(path, encoding='utf-8') as f:
        tsv_reader = unicode_csv_reader(f, delimiter='\t')
        progress = verboserate(enumerate(tsv_reader), total=num_lines(path))
        for idx, row in progress:
            if idx == 0:
                continue  # skip header
            if idx > limit:
                break
            yield row
def examples_from_file(path):
    """Return list[list[unicode]] from file path."""
    # count total lines before loading
    line_count = num_lines(path)
    word_lists = []
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for raw_line in verboserate(f, desc='Reading data file.', total=line_count):
            tokens = raw_line.strip().lower().split(' ')
            assert len(tokens) > 0
            word_lists.append(tokens)
    return word_lists
def load_sentences(path, limit):
    """Load sentences.

    Args:
        path (str)
        limit (int)

    Returns:
        list[list[unicode]]: a list of sentences (each a sequence of words)
    """
    sentences = []
    with codecs.open(path, 'r', encoding='utf-8') as f:
        progress = verboserate(
            f, desc='Loading sentences', total=min(num_lines(path), limit))
        for idx, raw_line in enumerate(progress):
            if idx == limit:
                break
            sentences.append(raw_line.lower().strip().split())
    return sentences