Пример #1
0
def load_w2v(file, type='txt', header=True, check_zero=True):
    """
    Load word embedding original file
    * file [str]: load file/path
    * type [str]: use 'bin'/'txt' to load '.bin'/'.txt' file
    * header [bool]: the first line of a 'txt' file is "<num_word> <vector_size>" (need type='txt')
    * check_zero [bool]: check whether the first line is a zero vector (need type='txt')
    - w2v [word_vector]: word_vector class (the object returned by gensim when type='bin')
    Raises ValueError when type is neither 'txt' nor 'bin'.
    """
    if type == 'bin':
        dot.start("* Load word embedding")
        from gensim.models.keyedvectors import KeyedVectors
        w2v = KeyedVectors.load_word2vec_format(file, binary=True)
        # The summary print at the end needs the size in this branch too;
        # without it a successful binary load ended in an UnboundLocalError.
        vector_size = w2v.vector_size
        dot.stop()
    elif type == 'txt':
        with cs.open(file) as fobj:
            line = fobj.readline().rstrip()
            if header:
                # Header line: "<num_word> <vector_size>"; the first vector is on the next line.
                num_word, vector_size = map(int, line.split())
                line = fobj.readline().rstrip()
            else:
                # No header: infer the size from the first row (word + components).
                vector_size = len(line.split(' ')) - 1

            if check_zero:
                # A non-zero first vector makes add_zero True for the word_vector constructor.
                first_vec = [float(item) for item in line.split(' ')[1:]]
                add_zero = any(first_vec)
            else:
                add_zero = False

            w2v = word_vector(vector_size, add_zero)
            if header:
                for _ in bar(num_word, "* Load word embedding"):
                    if not line:
                        # File ended before num_word rows were read; stop
                        # like the no-header branch does at EOF.
                        break
                    fields = line.rstrip().split(' ')
                    w2v[fields[0]] = np.array(fields[1:], dtype=float)
                    line = fobj.readline()
            else:
                dot.start("* Load word embedding")
                while line:
                    fields = line.rstrip().split(' ')
                    w2v[fields[0]] = np.array(fields[1:], dtype=float)
                    line = fobj.readline()
                dot.stop()
    else:
        raise ValueError("Value error of 'type', want 'txt'/'bin', get '{}'.".format(type))
    print("- Word embedding size:", vector_size)
    return w2v
Пример #2
0
def save_dict(_dict, file, desc=None, line_split=False, code='utf-8'):
    """
    Write a dict to a file
    * _dict [dict]: dict for saving
    * file [str]: save file/path
    * desc [str]: a description string; when given, dot.start/dot.stop wrap the write
    * line_split [bool]: write one "key<TAB>json(value)" entry per line
                         instead of a single JSON document
    * code [str]: encoding
    - [int]: always 1
    """
    announce = bool(desc)
    if announce:
        dot.start("* Save {}".format(desc))

    with cs.open(file, 'w', code) as sink:
        if not line_split:
            # Whole dict as one JSON document.
            json.dump(_dict, sink)
        else:
            # One entry per line: stringified key, a tab, JSON-encoded value.
            for name, payload in _dict.items():
                row = '\t'.join((str(name), json.dumps(payload)))
                sink.write(row + '\n')

    if announce:
        dot.stop()
    return 1
Пример #3
0
def save_list(_list, file, desc=None, line_split=False, code='utf-8'):
    """
    Write a list to a file
    * _list [list]: list for saving
    * file [str]: save file/path
    * desc [str]: a description string; when given, dot.start/dot.stop wrap the write
    * line_split [bool]: write one JSON-encoded element per line
                         instead of a single JSON document
    * code [str]: encoding
    - [int]: always 1
    """
    announce = bool(desc)
    if announce:
        dot.start("* Save {}".format(desc))

    with cs.open(file, 'w', code) as sink:
        if not line_split:
            # Whole list as one JSON document.
            json.dump(_list, sink)
        else:
            # One JSON-encoded element per line.
            for item in _list:
                encoded = json.dumps(item)
                sink.write(encoded + '\n')

    if announce:
        dot.stop()
    return 1
Пример #4
0
def load_list(file, desc=None, line_split=False, code='utf-8'):
    """
    Load a file to list
    * file [str]: load file/path
    * desc [str]: a description string; when given, dot.start/dot.stop wrap the read
    * line_split [bool]: each line contains one JSON-encoded element
                         (blank lines are skipped); otherwise the file is
                         parsed as a single JSON document
    * code [str]: encoding
    - _list [list]: result list
    """
    if desc:
        dot.start("* Load {}".format(desc))
    with cs.open(file, 'r', code) as inobj:
        if line_split:
            # A blank line (e.g. a trailing empty line) is not valid JSON
            # and would make json.loads raise, so skip it.
            _list = [json.loads(line) for line in inobj if line.strip()]
        else:
            _list = json.load(inobj)
    if desc:
        dot.stop()
    return _list
Пример #5
0
def load_dict(file, desc=None, line_split=False, code='utf-8'):
    """
    Load a file to dict
    * file [str]: load file/path
    * desc [str]: a description string; when given, dot.start/dot.stop wrap the read
    * line_split [bool]: each line contains one "key<TAB>json(value)" entry
                         (blank lines are skipped; keys are kept as strings);
                         otherwise the file is parsed as a single JSON document
    * code [str]: encoding
    - _dict [dict]: result dict
    """
    if desc:
        dot.start("* Load {}".format(desc))
    _dict = {}
    with cs.open(file, 'r', code) as inobj:
        if line_split:
            for line in inobj:
                if not line.strip():
                    # A blank line has no tab, so the unpacking below
                    # would raise ValueError; skip it.
                    continue
                # Split on the first tab only, so tabs inside the JSON value survive.
                key, value = line.split('\t', 1)
                _dict[key] = json.loads(value)
        else:
            _dict = json.load(inobj)
    if desc:
        dot.stop()
    return _dict