示例#1
0
    def from_file(cls, file_path, embed_dim, vocab_size=None):
        """Load word embeddings from a text file of whitespace-separated vectors.

        Each line is expected to look like: <word> <float_1> ... <float_embed_dim>.

        Args:
            file_path (str): path to the embeddings file.
            embed_dim (int): expected dimensionality of every embedding vector.
            vocab_size (int): max # of words in the vocab. If not specified,
                uses all available vectors in file.

        Returns:
            an instance of cls wrapping the embedding matrix and its vocab.

        Raises:
            ValueError: if a vector has the wrong number of dims, or the file
                yields fewer than `vocab_size` vectors.
        """
        if vocab_size is None:
            vocab_size = num_lines(file_path)

        words = []
        embeds = []
        with codecs.open(file_path, 'r', encoding='utf-8') as f:
            lines = verboserate(
                f,
                desc='Loading embeddings from {}'.format(file_path),
                total=vocab_size)
            for i, line in enumerate(lines):
                if i == vocab_size:
                    break
                tokens = line.split()
                word = tokens[0]
                embed = np.array([float(tok) for tok in tokens[1:]],
                                 dtype=np.float32)
                if len(embed) != embed_dim:
                    raise ValueError('expected {} dims, got {} dims'.format(
                        embed_dim, len(embed)))
                words.append(word)
                embeds.append(embed)

        vocab = SimpleVocab(words)
        # Vectors were built with dtype=np.float32, so the stack is already
        # float32 — no extra astype needed.
        embed_matrix = np.stack(embeds)
        # Explicit check instead of `assert`, which is silently stripped under
        # `python -O` and would let a short/truncated file pass unnoticed.
        if embed_matrix.shape != (vocab_size, embed_dim):
            raise ValueError('expected embedding matrix of shape {}, got {}'.format(
                (vocab_size, embed_dim), embed_matrix.shape))
        return cls(embed_matrix, vocab)
        def examples_from_file(path, seq_length_limit):
            """Return list[EditExample] from file path.

            Args:
                path (str): path to a tab-separated data file; columns are
                    selected via `config.source_cols` / `config.target_col`.
                seq_length_limit (int): examples whose longest sequence
                    exceeds this are skipped, because they use up memory.

            Returns:
                list[EditExample]
            """
            examples = []

            # count total lines before loading (gives the progress bar a total)
            total_lines = num_lines(path)

            with codecs.open(path, 'r', encoding='utf-8') as f:
                lines = verboserate(f,
                                    desc='Reading data file.',
                                    total=total_lines)
                # enumerate(start=1) replaces the hand-maintained lnum counter.
                for lnum, line in enumerate(lines, start=1):
                    split = line.strip().split('\t')
                    try:
                        input_words = [split[c].split(' ')
                                       for c in config.source_cols]
                        trg_words = split[config.target_col].split(
                            ' ')  # gold answer
                        assert len(trg_words) > 0
                        ex = EditExample(input_words, trg_words)
                        # skip sequences that are too long, because they use up memory
                        if max_seq_length(ex) > seq_length_limit:
                            continue
                        examples.append(ex)
                    except Exception:
                        # Narrowed from a bare `except:` so KeyboardInterrupt
                        # and SystemExit still propagate.
                        print('bad formatting in line ' + str(lnum))
                        print(line)

            return examples
示例#3
0
def file_rows(path, limit):
    """Yield up to `limit` data rows from a tab-separated file.

    The first row of the file is treated as a header and skipped.

    Args:
        path (str)
        limit (int): max # of data rows to yield.

    Yields:
        list[unicode]: one parsed row at a time.
    """
    with codecs.open(path, encoding='utf-8') as f:
        reader = unicode_csv_reader(f, delimiter='\t')
        progress = verboserate(enumerate(reader), total=num_lines(path))
        for idx, row in progress:
            if idx == 0:
                continue  # skip header
            if idx > limit:
                break
            yield row
示例#4
0
        def examples_from_file(path):
            """Read a data file into tokenized examples.

            Each line is lowercased, stripped, and split on single spaces.

            Returns:
                list[list[unicode]]
            """
            # count total lines before loading (gives the progress bar a total)
            total_lines = num_lines(path)

            examples = []
            with codecs.open(path, 'r', encoding='utf-8') as f:
                lines = verboserate(f, desc='Reading data file.', total=total_lines)
                for raw_line in lines:
                    tokens = raw_line.strip().lower().split(' ')
                    assert len(tokens) > 0
                    examples.append(tokens)
            return examples
示例#5
0
def load_sentences(path, limit):
    """Load sentences.

    Args:
        path (str)
        limit (int)

    Returns:
        list[list[unicode]]: a list of sentences (each a sequence of words)
    """
    sentences = []
    with codecs.open(path, 'r', encoding='utf-8') as f:
        # progress total is capped at `limit` since we stop reading there
        shown_total = min(num_lines(path), limit)
        for idx, raw_line in enumerate(
                verboserate(f, desc='Loading sentences', total=shown_total)):
            if idx == limit:
                break
            sentences.append(raw_line.lower().strip().split())
    return sentences