def __init__(self, input_wvectors, input_word2id, input_id2word, input_vocabulary,
             pair_file_path, kn_file_name, output_file_name, topn=20):
    # Load the word -> id mapping.
    word2id = dict()
    with codecs.open(input_word2id, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            word2id[fields[0]] = int(fields[1])

    # Load the id -> word mapping.
    id2word = dict()
    with codecs.open(input_id2word, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            id2word[int(fields[0])] = fields[1]

    # Load the vocabulary (one word id per line).
    vocabulary = []
    with codecs.open(input_vocabulary, 'r', encoding='utf-8') as f:
        for line in f:
            vocabulary.append(int(line.strip()))

    self.topn = topn

    # Build the k-nearest-neighbour index over the word vectors and cache it to disk.
    kneighbor = KNeighbor(input_wvectors, vocabulary, word2id, id2word)
    dump_to_pkl(kneighbor, kn_file_name)

    logging_set('NSselect.log')

    # Merge all pickled pair-count dictionaries found under pair_file_path.
    files = os.listdir(pair_file_path)
    pairs = dict()
    for file in tqdm(files):
        path = os.path.join(pair_file_path, file)
        # Skip sub-directories; only pickled pair files are merged.
        if not os.path.isdir(path):
            pair = load_from_pkl(path)
            logging.info("pair size: %d" % len(pair))
            if len(pairs) == 0:
                pairs = pair
            else:
                for key in pair.keys():
                    if key in pairs:
                        pairs[key] += pair[key]
                    else:
                        pairs[key] = pair[key]
            logging.info("current total pair size: %d" % len(pairs))

    logging.info("start calculating scores")
    score = self.select_new(pairs, kneighbor, self.topn)
    # score1 = self.select(pairs, kneighbor)
    logging.info("start saving")
    dump_to_pkl(score, output_file_name)
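# The constructor above relies on dump_to_pkl/load_from_pkl from utils, whose
# implementation is not shown here. A minimal sketch, assuming they are thin
# pickle wrappers, could look like this:
import pickle

def dump_to_pkl(obj, file_name):
    # Serialize an arbitrary Python object to disk.
    with open(file_name, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_from_pkl(file_name):
    # Load a previously pickled object back into memory.
    with open(file_name, 'rb') as f:
        return pickle.load(f)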
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Philly arguments parser")
    parser.add_argument('emb_file_name', type=str)
    parser.add_argument('--similarity_test_paths', type=str, default='data/240.txt|data/297.txt')
    parser.add_argument('--synset_paths', type=str, default='data/nsem3-adjusted.txt')
    parser.add_argument('--analogy_test_paths', type=str, default='data/analogy.txt')
    parser.add_argument('--log_path', type=str, default='evaluation.log')
    args, _ = parser.parse_known_args()

    logging_set(args.log_path)

    # Passing the literal string 'None' disables the corresponding evaluation.
    if args.similarity_test_paths == 'None':
        args.similarity_test_paths = None
    if args.synset_paths == 'None':
        args.synset_paths = None
    if args.analogy_test_paths == 'None':
        args.analogy_test_paths = None

    best_scores, save_flag = evaluation(args.emb_file_name, args.similarity_test_paths,
                                        args.synset_paths, args.analogy_test_paths)
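# logging_set is imported from utils and its implementation is not shown in
# this file. A minimal sketch, assuming it simply configures the root logger
# to write INFO-level messages to the given file, might be:
import logging

def logging_set(log_path):
    # Send INFO-level (and above) messages to the given log file.
    logging.basicConfig(
        filename=log_path,
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s')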
def select(self, pairs, kneighbor):
    # For every word, score each of its nearest neighbours by the average
    # ratio pairs[neighbour-substituted pair] / pairs[original pair], taken
    # over all pairs whose head word is that word.
    score = dict()
    for keyn in tqdm(kneighbor.keys()):
        score[keyn] = []
        for value in kneighbor[keyn]:
            s = 0
            i = 0
            for keyp in pairs.keys():
                if keyp[0] == keyn:
                    # Replace the head word of the pair with the neighbour.
                    replace = tuple([value] + list(keyp[1:]))
                    if replace in pairs:
                        s += pairs[replace] / pairs[keyp]
                    i += 1
            # Guard against words that never appear as the head of a pair.
            score[keyn].append(s / i if i > 0 else 0)
    return score


if __name__ == '__main__':
    logging_set('NSselect.log')
    ns = NSselect(input_wvectors=sys.argv[1],
                  input_word2id=sys.argv[2],
                  input_id2word=sys.argv[3],
                  input_vocabulary=sys.argv[4],
                  pair_file_path=sys.argv[5],
                  kn_file_name=sys.argv[6],
                  output_file_name=sys.argv[7])
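# Illustrative only: a tiny hand-made example of the scoring rule used in
# select() above, with made-up words and counts. For the word 'cat' and its
# neighbour 'dog', every pair headed by 'cat' is rewritten with 'dog' and the
# count ratio is averaged.
pairs = {('cat', 'sat'): 4, ('cat', 'ran'): 2, ('dog', 'sat'): 2}
neighbours = {'cat': ['dog']}

for word, cands in neighbours.items():
    for cand in cands:
        ratios = []
        for pair, count in pairs.items():
            if pair[0] == word:
                replaced = (cand,) + pair[1:]
                # ('dog', 'sat') exists with count 2 -> ratio 0.5;
                # ('dog', 'ran') is missing -> contributes 0.
                ratios.append(pairs.get(replaced, 0) / count)
        print(word, cand, sum(ratios) / len(ratios))  # cat dog 0.25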
import os
import gc
import logging

from tqdm import tqdm

from utils import load_from_pkl, dump_to_pkl, logging_set

logging_set('merge_pair.log')

path = 'data/pair'
files = os.listdir(path)[1:]
pairs = dict()
for idx, file in enumerate(tqdm(files)):
    if idx % 20 == 0:
        gc.collect()  # manually trigger garbage collection
    pair_file_path = os.path.join(path, file)
    # Skip sub-directories; only pickled pair files are merged.
    if not os.path.isdir(pair_file_path):
        pair = load_from_pkl(pair_file_path)
        logging.info("pair size: %d" % len(pair))
        if len(pairs) == 0:
            pairs = pair
        else:
            for key in pair.keys():
                if key in pairs:
                    pairs[key] += pair[key]
                else:
                    pairs[key] = pair[key]
        logging.info("current total pair size: %d" % len(pairs))

output_file_name = 'data/pairs.pkl'
dump_to_pkl(pairs, output_file_name)
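# Once data/pairs.pkl has been written, the merged counts can be inspected,
# for example to print the ten most frequent pairs. This is a usage sketch,
# not part of the merge script itself.
from utils import load_from_pkl

merged = load_from_pkl('data/pairs.pkl')
for pair, count in sorted(merged.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(pair, count)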