示例#1
0
def evaluate(embedding_model: str = None,
             feature='concat',
             add_relative: bool = False,
             add_pair2vec: bool = False):
    """Train/evaluate lexical-relation classifiers on top of word embeddings.

    Parameters
    ----------
    embedding_model : name of the base word-embedding model.
    feature : pair-feature type forwarded to ``diff`` (default ``'concat'``).
    add_relative : also load the ``relative_init.<model>`` pair embeddings.
    add_pair2vec : also load the ``pair2vec`` pair embeddings.

    Returns
    -------
    list of per-(dataset, config) result dicts produced by ``Evaluate``.
    """
    model = get_word_embedding_model(embedding_model)
    model_pair = []
    if add_relative:
        model_pair.append(
            get_word_embedding_model(
                'relative_init.{}'.format(embedding_model)))
    if add_pair2vec:
        model_pair.append(get_word_embedding_model('pair2vec'))

    data = get_lexical_relation_data()
    report = []
    for data_name, v in data.items():
        logging.info('train model with {} on {}'.format(
            embedding_model, data_name))
        label_dict = v.pop('label')
        # preprocess data: featurize each word pair; ``None`` marks OOV pairs
        oov = {}
        dataset = {}
        for _k, _v in v.items():
            x = [diff(a, b, model, feature, model_pair) for (a, b) in _v['x']]
            # feature dimension, taken from the first in-vocabulary pair
            dim = len([_x for _x in x if _x is not None][0])
            # initialize zero vector for OOV
            dataset[_k] = [[
                _x if _x is not None else np.zeros(dim) for _x in x
            ], _v['y']]
            oov[_k] = sum(_x is None for _x in x)
        shared_config = {
            'model': embedding_model,
            'feature': feature,
            'add_relative': add_relative,
            'add_pair2vec': add_pair2vec,
            'label_size': len(label_dict),
            'data': data_name,
            'oov': oov
        }

        # grid search over hyper-parameter configs (only when a validation
        # split exists; otherwise a single run with the default config)
        if 'val' not in dataset:
            evaluator = Evaluate(dataset, shared_config, default_config=True)
            tmp_report = evaluator(0)
        else:
            # fix: use the pool as a context manager so worker processes are
            # always cleaned up, even if ``map`` raises
            with Pool() as pool:
                evaluator = Evaluate(dataset, shared_config)
                tmp_report = pool.map(evaluator, evaluator.config_indices)
        # fix: isinstance() instead of a ``type(...) is`` comparison
        if not isinstance(tmp_report, list):
            tmp_report = [tmp_report]
        report += tmp_report
    # release the large embedding models before returning
    del model
    del model_pair
    return report
示例#2
0
def get_relative_init(output_path: str,
                      context_word_dict: Dict,
                      minimum_frequency_context: int,
                      word_embedding_type: str = 'fasttext'):
    """Compute RELATIVE pair vectors and write them as a word2vec text file.

    For every (token_i, token_j) pair, the RELATIVE vector is the
    frequency-weighted average of the embeddings of their shared context
    words; contexts below ``minimum_frequency_context`` or missing from the
    embedding vocabulary are skipped.

    Parameters
    ----------
    output_path : destination text file (word2vec format, with a
        "<n_lines> <dim>" header line).
    context_word_dict : nested mapping token_i -> token_j -> context -> freq.
    minimum_frequency_context : minimum context frequency to be counted.
    word_embedding_type : base embedding model name (default ``'fasttext'``).
    """
    logging.info("loading embeddings")
    word_embedding_model = get_word_embedding_model(word_embedding_type)

    line_count = 0
    with open(output_path + '.tmp', 'w', encoding='utf-8') as txt_file:
        for token_i, tokens_paired in tqdm(context_word_dict.items()):
            for token_j in tokens_paired:
                vector_pair = 0
                cont_pair = 0
                # fix: iterate items() instead of re-indexing the dict per key
                for token_co, freq in context_word_dict[token_i][token_j].items():
                    if freq < minimum_frequency_context:
                        continue
                    # fix: keep only the fallible lookup inside ``try`` —
                    # the accumulation below cannot raise
                    try:
                        # multi-word contexts are stored with '_' separators
                        token_co_vector = word_embedding_model[
                            token_co.replace('_', ' ')]
                    except Exception:
                        # best effort: skip contexts without an embedding
                        continue
                    vector_pair += (freq * token_co_vector)
                    cont_pair += 1
                if cont_pair != 0:
                    # average of the contributing context vectors
                    vector_pair = vector_pair / cont_pair
                    txt_file.write('__'.join([token_i, token_j]))
                    for y in vector_pair:
                        txt_file.write(' ' + str(y))
                    txt_file.write("\n")
                    line_count += 1

    logging.info("reformat file to add header")
    logging.info("\t * {} lines, {} dim".format(line_count, word_embedding_model.vector_size))
    # fix: read/write the final file with the same utf-8 encoding used for
    # the temporary file (previously relied on the platform default)
    with open(output_path, 'w', encoding='utf-8') as f_out:
        f_out.write(str(line_count) + " " + str(word_embedding_model.vector_size) + "\n")
        with open(output_path + '.tmp', 'r', encoding='utf-8') as f_cache:
            for line in f_cache:
                f_out.write(line)
示例#3
0
import pickle
from util import get_word_embedding_model

# Build the set of words shared by the GloVe and word2vec vocabularies
# and pickle it (sorted, for deterministic output).
model = get_word_embedding_model('glove')
vocab_glove = set(model.vocab.keys())
model = get_word_embedding_model('w2v')
vocab_w2v = set(model.vocab.keys())
del model  # free the large embedding model before writing output
# fix: sorted() accepts any iterable — the intermediate list() was redundant
vocab = sorted(vocab_glove & vocab_w2v)

with open('./common_word.pkl', "wb") as fp:
    pickle.dump(vocab, fp)
                    datefmt='%Y-%m-%d %H:%M:%S')

# Analogy benchmark splits, loaded once at import time.
# Mapping: dataset name -> (val, test) — see the unpacking in __main__ below.
full_data = get_analogy_data()


def cap(_list):
    """Capitalize every string in *_list*, returning a new list."""
    return list(map(str.capitalize, _list))


if __name__ == '__main__':
    # NOTE(review): 'bats_cap' presumably aliases full_data['bats']
    # (see the disabled line below) — confirm it is registered elsewhere.
    # full_data['bats_cap'] = full_data['bats']
    for i in ['bats_cap', 'bats', 'sat', 'u2', 'u4', 'google']:
        val, test = full_data[i]
        for data in [test, val]:
            for model_type in ['fasttext', 'glove', 'w2v']:
                model = get_word_embedding_model(model_type)
                if i == 'bats_cap':
                    # fix: iterate the current split (``data``), not always
                    # ``test`` — previously the validation pass reused the
                    # test-split predictions and zip(data, _pred) silently
                    # mis-aligned them
                    _pred = [
                        get_prediction_we(cap(o['stem']),
                                          [cap(m) for m in o['choice']], model,
                                          'diff') for o in data
                    ]
                else:
                    _pred = [
                        get_prediction_we(o['stem'], o['choice'], model,
                                          'diff') for o in data
                    ]
                # attach the model's prediction to each example in place
                for d, p in zip(data, _pred):
                    d['pred/{}'.format(model_type)] = p

    with open('../results/analogy.prediction.json', 'w') as f:
示例#5
0
    parser.add_argument('--model',
                        help='word embedding model',
                        type=str,
                        default="fasttext")
    parser.add_argument('--truecase', help='Truecasing', action='store_true')
    return parser.parse_args()


def tc(string):
    """Truecase *string*.

    A dummy 'A ' prefix is added so the first real word is not treated as
    sentence-initial by the truecaser, then the 2-character prefix is
    stripped from the result.
    """
    prefixed = 'A ' + string
    return truecase.get_true_case(prefixed)[2:]


if __name__ == '__main__':
    # Parse CLI options (--model, --truecase; output_dir read below).
    opt = get_options()

    # Base word-embedding model (e.g. fasttext).
    model_word = get_word_embedding_model(opt.model)

    # Cached relative-embedding text file and the matching model name;
    # both switch to the truecased variant when --truecase is given.
    cache = '{}/relative_init.{}.txt'.format(opt.output_dir, opt.model)
    relative_model = 'relative_init.{}'.format(opt.model)
    if opt.truecase:
        cache = '{}/relative_init.{}.truecase.txt'.format(
            opt.output_dir, opt.model)
        relative_model += '.truecase'

    # The relative (pair) embedding model itself.
    model = get_word_embedding_model(relative_model)

    logging.info("concat with word embedding model")
    # Derived paths for the concatenated embedding cache (text and binary).
    cache_concat = cache.replace('.txt', '.concat.txt')
    cache_concat_bin = cache_concat.replace('.txt', '.bin')
    # Rebuild from scratch: drop any stale concatenated cache file.
    if os.path.exists(cache_concat):
        os.remove(cache_concat)
示例#6
0
def test_analogy(model_type,
                 add_relative: bool = False,
                 add_pair2vec: bool = False,
                 bi_direction: bool = False,
                 only_pair_embedding: bool = False):
    """Evaluate analogy accuracy of ``model_type`` on every dataset in
    ``full_data``, optionally augmented with RELATIVE / pair2vec pair
    embeddings.

    Returns one result dict per (feature pattern, dataset), carrying the
    configuration, OOV counts, and per-split plus size-weighted accuracy.
    """
    # Load the requested embedding models (base model first).
    model = None if only_pair_embedding else get_word_embedding_model(model_type)
    model_re = get_word_embedding_model(
        'relative_init.{}'.format(model_type)) if add_relative else None
    model_p2v = get_word_embedding_model('pair2vec') if add_pair2vec else None

    if only_pair_embedding:
        # at least one pair-embedding model must be available
        assert model_p2v or model_re
        feature_patterns = ['concat']
    else:
        assert model
        feature_patterns = ['diff', 'concat', ('diff', 'dot'), ('concat', 'dot')]

    results = []
    for feat in feature_patterns:
        for data_name, (val, test) in full_data.items():
            record = {
                'data': data_name,
                'model': model_type,
                'add_relative': add_relative,
                'add_pair2vec': add_pair2vec,
                'bi_direction': bi_direction,
                'only_pair_embedding': only_pair_embedding
            }
            for prefix, data in zip(['test', 'valid'], [test, val]):
                preds = [
                    get_prediction_we(sample['stem'],
                                      sample['choice'],
                                      model,
                                      feat,
                                      relative_model=model_re,
                                      pair2vec_model=model_p2v,
                                      bi_direction=bi_direction)
                    for sample in data
                ]
                record['oov_{}'.format(prefix)] = sum(
                    p is None for p in preds)
                # fall back to the PMI prediction whenever a pair is OOV
                preds = [
                    data[idx]['pred/pmi'] if p is None else p
                    for idx, p in enumerate(preds)
                ]
                correct = sum(sample['answer'] == preds[idx]
                              for idx, sample in enumerate(data))
                record['accuracy_{}'.format(prefix)] = correct / len(preds)
            # size-weighted mean of the validation and test accuracies
            record['accuracy'] = (
                record['accuracy_test'] * len(test)
                + record['accuracy_valid'] * len(val)
            ) / (len(val) + len(test))
            record['feature'] = feat
            results.append(record)

    return results
def test_analogy(model_type):
    """Run gensim's built-in analogy benchmark (questions-words.txt).

    Returns a dict with the model name and its overall analogy accuracy.
    """
    embeddings = get_word_embedding_model(model_type)
    result = embeddings.evaluate_word_analogies(
        datapath('questions-words.txt'))
    overall_accuracy = result[0]
    return {'model_type': model_type, 'accuracy': overall_accuracy}