def evaluate(embedding_model: str = None, feature='concat', add_relative: bool = False, add_pair2vec: bool = False):
    """Train and evaluate lexical-relation classifiers on word-pair features.

    Parameters
    ----------
    embedding_model : name of the base word embedding model, resolved via
        ``get_word_embedding_model`` (``None`` defers to that helper's default).
    feature : how the two word vectors are combined into one feature
        (passed through to ``diff``, e.g. 'concat').
    add_relative : additionally load the matching ``relative_init.*`` pair
        embedding and append it to the feature.
    add_pair2vec : additionally load the ``pair2vec`` pair embedding.

    Returns
    -------
    list
        One result dict per evaluated configuration/dataset.
    """
    model = get_word_embedding_model(embedding_model)
    model_pair = []
    if add_relative:
        model_pair.append(get_word_embedding_model('relative_init.{}'.format(embedding_model)))
    if add_pair2vec:
        model_pair.append(get_word_embedding_model('pair2vec'))
    data = get_lexical_relation_data()
    report = []
    for data_name, v in data.items():
        logging.info('train model with {} on {}'.format(embedding_model, data_name))
        label_dict = v.pop('label')
        # preprocess data: build one feature vector per word pair,
        # zero-filling pairs that are out-of-vocabulary (OOV)
        oov = {}
        dataset = {}
        for _k, _v in v.items():
            x = [diff(a, b, model, feature, model_pair) for (a, b) in _v['x']]
            # feature dimension taken from the first in-vocabulary pair
            # (raises IndexError if every single pair is OOV)
            dim = len([_x for _x in x if _x is not None][0])
            dataset[_k] = [[_x if _x is not None else np.zeros(dim) for _x in x], _v['y']]
            oov[_k] = sum(_x is None for _x in x)
        shared_config = {
            'model': embedding_model,
            'feature': feature,
            'add_relative': add_relative,
            'add_pair2vec': add_pair2vec,
            'label_size': len(label_dict),
            'data': data_name,
            'oov': oov
        }
        # grid search over hyper-parameter configurations, but only when a
        # validation split exists; otherwise run the single default config
        if 'val' not in dataset:
            evaluator = Evaluate(dataset, shared_config, default_config=True)
            tmp_report = evaluator(0)
        else:
            pool = Pool()
            evaluator = Evaluate(dataset, shared_config)
            tmp_report = pool.map(evaluator, evaluator.config_indices)
            pool.close()
        # the single-config path returns a bare dict; normalize to a list
        if not isinstance(tmp_report, list):
            tmp_report = [tmp_report]
        report += tmp_report
    # release the (large) embedding models explicitly
    del model
    del model_pair
    return report
def get_relative_init(output_path: str, context_word_dict: Dict, minimum_frequency_context: int, word_embedding_type: str = 'fasttext'):
    """ Get RELATIVE vectors.

    For every (token_i, token_j) pair, average the base word embeddings of
    the pair's context words (weighted by co-occurrence frequency) and write
    the result as a word2vec-format text file at ``output_path``.

    Parameters
    ----------
    output_path : destination file; a ``<output_path>.tmp`` scratch file is
        written first, then re-emitted with the word2vec header line.
    context_word_dict : nested mapping token_i -> token_j -> context word -> frequency.
    minimum_frequency_context : context words below this frequency are skipped.
    word_embedding_type : name of the base embedding passed to
        ``get_word_embedding_model``.
    """
    logging.info("loading embeddings")
    word_embedding_model = get_word_embedding_model(word_embedding_type)
    line_count = 0
    with open(output_path + '.tmp', 'w', encoding='utf-8') as txt_file:
        for token_i, tokens_paired in tqdm(context_word_dict.items()):
            for token_j in tokens_paired:
                vector_pair = 0
                cont_pair = 0
                # hoist the nested lookup out of the inner loop and iterate
                # items() instead of re-indexing per context word
                for token_co, freq in context_word_dict[token_i][token_j].items():
                    if freq < minimum_frequency_context:
                        continue
                    try:
                        # multi-word context tokens are stored with '_' separators
                        token_co_vector = word_embedding_model[token_co.replace('_', ' ')]
                    except KeyError:
                        # context word is OOV in the base embedding: skip it
                        # (gensim key lookup raises KeyError for unknown words)
                        continue
                    vector_pair += (freq * token_co_vector)
                    cont_pair += 1
                if cont_pair != 0:
                    # frequency-weighted mean over the in-vocabulary context words
                    vector_pair = vector_pair / cont_pair
                    txt_file.write('__'.join([token_i, token_j]))
                    for y in vector_pair:
                        txt_file.write(' ' + str(y))
                    txt_file.write("\n")
                    line_count += 1
    logging.info("reformat file to add header")
    logging.info("\t * {} lines, {} dim".format(line_count, word_embedding_model.vector_size))
    # the word2vec text format needs "<count> <dim>" as the first line,
    # which is only known after the first pass — hence the .tmp round trip
    with open(output_path, 'w', encoding='utf-8') as f_out:
        f_out.write(str(line_count) + " " + str(word_embedding_model.vector_size) + "\n")
        with open(output_path + '.tmp', 'r', encoding='utf-8') as f_cache:
            for line in f_cache:
                f_out.write(line)
import pickle
from util import get_word_embedding_model

# Build the sorted list of words shared by the GloVe and word2vec
# vocabularies and cache it to ./common_word.pkl.
model = get_word_embedding_model('glove')
# NOTE(review): `.vocab` is the gensim<4 KeyedVectors API — confirm the
# pinned gensim version (gensim>=4 uses `.key_to_index`).
vocab_glove = set(model.vocab.keys())
model = get_word_embedding_model('w2v')
vocab_w2v = set(model.vocab.keys())
del model  # free the large embedding before pickling
# sorted() accepts the set directly; no intermediate list needed
vocab = sorted(vocab_glove & vocab_w2v)
with open('./common_word.pkl', "wb") as fp:
    pickle.dump(vocab, fp)
datefmt='%Y-%m-%d %H:%M:%S')  # tail of a logging.basicConfig(...) call whose start is above this view
full_data = get_analogy_data()


def cap(_list):
    """Capitalize every token in *_list*."""
    return [t.capitalize() for t in _list]


if __name__ == '__main__':
    # full_data['bats_cap'] = full_data['bats']
    # Attach word-embedding analogy predictions to every item of every split.
    for i in ['bats_cap', 'bats', 'sat', 'u2', 'u4', 'google']:
        val, test = full_data[i]
        for data in [test, val]:
            for model_type in ['fasttext', 'glove', 'w2v']:
                model = get_word_embedding_model(model_type)
                if i == 'bats_cap':
                    # capitalized variant: capitalize both stem and choices
                    # NOTE(review): this branch iterates `test` even when `data`
                    # is `val`, so the zip below can pair val items with test
                    # predictions — looks like a bug; confirm intent.
                    _pred = [
                        get_prediction_we(cap(o['stem']), [cap(m) for m in o['choice']], model, 'diff')
                        for o in test
                    ]
                else:
                    _pred = [
                        get_prediction_we(o['stem'], o['choice'], model, 'diff')
                        for o in data
                    ]
                # store the prediction on each item under 'pred/<model>'
                for d, p in zip(data, _pred):
                    d['pred/{}'.format(model_type)] = p
    with open('../results/analogy.prediction.json', 'w') as f:
        # (truncated: the body of this `with` — presumably a json dump of
        # full_data — is outside the visible chunk)
parser.add_argument('--model', help='word embedding model', type=str, default="fasttext")
parser.add_argument('--truecase', help='Truecasing', action='store_true')
return parser.parse_args()
# ^ the three lines above are the tail of get_options(); its `def` line and
#   the parser construction are above this view


def tc(string):
    """Truecase *string*: prepend a dummy token 'A ', truecase, then strip the prefix back off."""
    return truecase.get_true_case('A ' + string)[2:]


if __name__ == '__main__':
    opt = get_options()
    # base word embedding model named on the CLI
    model_word = get_word_embedding_model(opt.model)
    # paths/names for the matching RELATIVE pair embedding
    cache = '{}/relative_init.{}.txt'.format(opt.output_dir, opt.model)
    relative_model = 'relative_init.{}'.format(opt.model)
    if opt.truecase:
        # truecased variant uses its own cache file and model name suffix
        cache = '{}/relative_init.{}.truecase.txt'.format(opt.output_dir, opt.model)
        relative_model += '.truecase'
    model = get_word_embedding_model(relative_model)
    logging.info("concat with word embedding model")
    cache_concat = cache.replace('.txt', '.concat.txt')
    cache_concat_bin = cache_concat.replace('.txt', '.bin')
    if os.path.exists(cache_concat):
        # rebuild the concat cache from scratch
        # (the chunk appears to continue past this view)
        os.remove(cache_concat)
def test_analogy(model_type, add_relative: bool = False, add_pair2vec: bool = False, bi_direction: bool = False, only_pair_embedding: bool = False):
    """Evaluate word-analogy accuracy for one embedding configuration.

    Parameters
    ----------
    model_type : name of the base word embedding (via ``get_word_embedding_model``).
    add_relative : also load the matching ``relative_init.*`` pair embedding.
    add_pair2vec : also load the ``pair2vec`` pair embedding.
    bi_direction : passed through to ``get_prediction_we``.
    only_pair_embedding : skip the base embedding and rely on the pair
        embedding(s) alone (restricts features to 'concat').

    Returns
    -------
    list
        One result dict per (feature pattern, dataset) combination, with
        per-split OOV counts and accuracies plus a size-weighted overall
        accuracy.
    """
    model_re = None
    model_p2v = None
    model = None if only_pair_embedding else get_word_embedding_model(model_type)
    if add_relative:
        model_re = get_word_embedding_model('relative_init.{}'.format(model_type))
    if add_pair2vec:
        model_p2v = get_word_embedding_model('pair2vec')
    # sanity-check that at least one usable embedding was loaded
    if only_pair_embedding:
        assert model_p2v or model_re
    else:
        assert model
    # pair embeddings only support concatenation
    pattern = ['concat'] if only_pair_embedding else ['diff', 'concat', ('diff', 'dot'), ('concat', 'dot')]
    results = []
    for _pattern in pattern:
        for i, (val, test) in full_data.items():
            tmp_result = {
                'data': i,
                'model': model_type,
                'add_relative': add_relative,
                'add_pair2vec': add_pair2vec,
                'bi_direction': bi_direction,
                'only_pair_embedding': only_pair_embedding
            }
            for prefix, data in zip(['test', 'valid'], [test, val]):
                _pred = [
                    get_prediction_we(o['stem'], o['choice'], model, _pattern,
                                      relative_model=model_re,
                                      pair2vec_model=model_p2v,
                                      bi_direction=bi_direction)
                    for o in data
                ]
                tmp_result['oov_{}'.format(prefix)] = sum(p is None for p in _pred)
                # when OOV occurs, fall back to the precomputed PMI baseline prediction
                _pred = [p if p is not None else data[n]['pred/pmi'] for n, p in enumerate(_pred)]
                accuracy = sum(o['answer'] == p for o, p in zip(data, _pred)) / len(_pred)
                tmp_result['accuracy_{}'.format(prefix)] = accuracy
            # overall accuracy weighted by split size
            tmp_result['accuracy'] = (tmp_result['accuracy_test'] * len(test)
                                      + tmp_result['accuracy_valid'] * len(val)) / (len(val) + len(test))
            tmp_result['feature'] = _pattern
            results.append(tmp_result)
    return results
def test_analogy(model_type):
    """Score *model_type* on gensim's bundled ``questions-words.txt`` analogy benchmark.

    Returns a dict carrying the model name and its overall analogy accuracy.
    """
    embedding = get_word_embedding_model(model_type)
    questions = datapath('questions-words.txt')
    overall_accuracy = embedding.evaluate_word_analogies(questions)[0]
    return {'model_type': model_type, 'accuracy': overall_accuracy}