def main():
    os.makedirs(args.wiki_preprocess, exist_ok=True)

    dummy = DummyVocab()
    datasets = read_datasets(args.dataroot, dummy, dummy, ctx_window=999)
    summarize_datasets(datasets)

    # Index every mention surface form in a forward (prefix) trie and a
    # reversed (suffix) trie, using a trailing space as a word boundary.
    prefix_vocab = pygtrie.CharTrie()
    suffix_vocab = pygtrie.CharTrie()
    for _, dataset in tqdm(
            datasets.items(), dynamic_ncols=True, desc="Build Tries"):
        for mentions in dataset:
            for mention in mentions:
                mention_text = RE_PUNCT.sub(' ', mention.text.lower()).strip()
                prefix_vocab[mention_text + " "] = True
                suffix_vocab[mention_text[::-1] + " "] = True
    print(f'Forward trie size: {len(prefix_vocab)}')
    print(f'Backward trie size: {len(suffix_vocab)}')

    # Sort the wiki dump shards by their numeric suffix and scan them in
    # parallel, one worker task per shard.
    paths = list(glob.glob(os.path.join(args.wiki_dump, '*.xml-*')))
    paths = sorted(
        paths, key=lambda p: int(os.path.basename(p).split('-')[4][11:-4]))
    params = [(path, prefix_vocab, suffix_vocab) for path in paths]

    prefix_count = dict()
    suffix_count = dict()
    total_pages = 0
    with mp.Pool(processes=args.cpu) as pool, \
            tqdm(total=len(paths), dynamic_ncols=True) as pbar:
        for i, res in enumerate(pool.imap_unordered(process_stream, params)):
            prefix_count_, suffix_count_, page_counter = res
            total_pages += page_counter
            # Merge each worker's partial counts into the running totals.
            update_counts(prefix_count, prefix_count_)
            update_counts(suffix_count, suffix_count_)
            pbar.write(
                f'pages: {total_pages}, '
                f'# forward: {len(prefix_count)}/{len(prefix_vocab)}, '
                f'# backward: {len(suffix_count)}/{len(suffix_vocab)}')
            pbar.update()
            # Checkpoint the counts every ten shards.
            if i % 10 == 0:
                dump(prefix_count, suffix_count)
            del prefix_count_, suffix_count_

    dump(prefix_count, suffix_count)
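# The loop above relies on two helpers that are not part of this excerpt. A
# minimal sketch of what they are assumed to do: update_counts merges a
# worker's {mention: {entity: count}} dict into the running totals, and dump
# writes the one-JSON-object-per-line files (prefix_counts.txt /
# suffix_counts.txt) that the later snippets read back. `args` is the
# module-level argparse namespace used throughout these excerpts.
import json
import os


def update_counts(total, partial):
    for mention, entity_counts in partial.items():
        merged = total.setdefault(mention, {})
        for entity, count in entity_counts.items():
            merged[entity] = merged.get(entity, 0) + count


def dump(prefix_count, suffix_count):
    for filename, counts in [('prefix_counts.txt', prefix_count),
                             ('suffix_counts.txt', suffix_count)]:
        with open(os.path.join(args.wiki_preprocess, filename), 'w') as f:
            for mention, entity_counts in counts.items():
                f.write(json.dumps({'mention': mention,
                                    'counts': entity_counts},
                                   ensure_ascii=False) + '\n')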
def __init__(
        self,
        model,
        validation_size,
        test_size,
        data_dir='data/',
        log_dir='log/',
):
    self.model = model
    self.data_dir = data_dir
    self.log_dir = log_dir + '/' + model.name + '/'
    self.validation_size = validation_size
    self.test_size = test_size
    self.data = dataset.read_datasets(data_dir, validation_size, test_size)
    self.learning_rate = tf.Variable(
        float(model.learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * model.learning_rate_decay_factor)
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.dropout = tf.placeholder(tf.float32, name='keep_prob')
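# Sketch (assumption, not from the original trainer): how a decay op and a
# keep_prob placeholder like the ones built above are typically driven with
# the TF1 API; the concrete values here are illustrative only.
import tensorflow as tf  # requires TF1.x (or tf.compat.v1)

lr = tf.Variable(0.01, trainable=False)
decay_op = lr.assign(lr * 0.1)
keep_prob = tf.placeholder(tf.float32, name='keep_prob')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(decay_op)      # anneal once: 0.01 -> 0.001
    print(sess.run(lr))
    # keep_prob is fed per training step, e.g. feed_dict={keep_prob: 0.8}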
def __init__(self,
             name,
             datadir,
             validation_size=500,
             test_size=1,
             batch_size=100,
             learning_rate=0.01,
             learning_rate_decay_factor=.1,
             max_steps=1000,
             rnn_cell_size=64,
             num_rnn_layers=1,
             grad_clip=10,
             conv_filter_sizes=FilterSizes(16, 16, 16, 16),
             embedding_dims=dataset.EmbeddingSize(**{
                 'chars': 5,
                 'fonts': 3,
                 'fontsizes': 2,
                 'tokens': 10
             }),
             use_lstm=False,
             use_rnn_layer_norm=False,
             dropout_keep_prob=1.0):
    self.name = name
    self.data = dataset.read_datasets(datadir, validation_size, test_size)
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.learning_rate_decay_factor = learning_rate_decay_factor
    self.grad_clip = grad_clip
    self.max_steps = max_steps
    self.rnn_cell_size = rnn_cell_size
    self.num_rnn_layers = num_rnn_layers
    self.feature_vocab_size = self.data.feature_vocab_size
    self.token_vocab_size = self.data.token_vocab_size
    self.filters = conv_filter_sizes
    self.embedding_dims = embedding_dims
    self.use_lstm = use_lstm
    self.use_rnn_layer_norm = use_rnn_layer_norm
    self.dropout_keep_prob = dropout_keep_prob
    # Per-character feature width: concatenation of the char, font and
    # font-size embeddings.
    self.feature_dim = (embedding_dims.chars + embedding_dims.fonts +
                        embedding_dims.fontsizes)
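# Sketch of the two small containers the constructor above expects; the real
# definitions live in the model/dataset modules. EmbeddingSize's field names
# are taken from how it is used here (.chars/.fonts/.fontsizes/.tokens); the
# FilterSizes field names below are only placeholders for its four values.
from collections import namedtuple

FilterSizes = namedtuple('FilterSizes', ['f1', 'f2', 'f3', 'f4'])
EmbeddingSize = namedtuple('EmbeddingSize',
                           ['chars', 'fonts', 'fontsizes', 'tokens'])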
def main():
    persons = read_datasets()
    rgb_data = persons['P7']['G9'][1]['rgb']
    depth_png = persons['P7']['G9'][1]['depth_png']
    extract_hand(rgb_data, depth_png)
parser = argparse.ArgumentParser()
parser.add_argument('--dataroot', type=str, default='./data_2014_2021dca')
parser.add_argument('--wiki_preprocess', type=str,
                    default='./preprocess_2014in2021/')
parser.add_argument('--wiki2vec', type=str,
                    default='./wiki2vec/enwiki-20210420-vec-nolowercase')
args = parser.parse_args()

if __name__ == '__main__':
    os.makedirs(args.wiki_preprocess, exist_ok=True)

    dummy = DummyVocab()
    datasets = read_datasets(args.dataroot, dummy, dummy, ctx_window=999)
    summarize_datasets(datasets)

    word_vocab = set()
    entity_vocab = set()
    for name, dataset in tqdm(datasets.items(), leave=False,
                              dynamic_ncols=True,
                              desc="Create word_vocab and entity_vocab"):
        for mentions in dataset:
            for mention in mentions:
                for word in mention.lctx_tokens + mention.rctx_tokens:
                    word = word.strip()
                    if len(word) > 0:
                        word_vocab.add(word)
                for candidate in mention.candidates:
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

if args.mode != 'eval':
    os.makedirs(args.logdir)

voca_emb_dir = os.path.join(args.dataroot, 'embedding')
word_voca, word_embedding = load_voca_embs(
    voca_path=os.path.join(voca_emb_dir, 'word_vocab.txt'),
    embs_path=os.path.join(voca_emb_dir, 'word_embedding.npy'))
entity_voca, entity_embedding = load_voca_embs(
    voca_path=os.path.join(voca_emb_dir, 'entity_vocab.txt'),
    embs_path=os.path.join(voca_emb_dir, 'entity_embedding.npy'))

datasets = read_datasets(args.dataroot, word_voca, entity_voca,
                         ctx_window=args.ctx_window)
datasets = prerank_dataset(args.dataroot, datasets,
                           n_candidates=args.n_candidates,
                           n_topk=args.n_prerank_topk,
                           n_topk_p=args.n_prerank_topk_p,
                           ctx_window=args.prerank_ctx_window,
                           word_embedding=word_embedding,
                           entity_embedding=entity_embedding)
summarize_datasets(datasets)

ent_inlinks_dict = read_inlinks(args.dataroot, word_voca, entity_voca)

config = {
    'hid_dims': args.hid_dims,
parser.add_argument('--seed', type=int, help='random seed')


def load_config(config_file):
    with open(config_file, 'r') as f:
        args = parser.parse_args(
            reduce(lambda a, b: a + b,
                   map(lambda x: ('--' + x).strip().split(), f.readlines())))
    return args


opt = parser.parse_args()
if opt.options is not None:
    opt = load_config(opt.options)
if opt.seed is not None:
    np.random.seed(opt.seed)

# dataset
dataset = read_datasets(opt.data_path)
print(dataset)
train_provider = HNDataProvider(dataset.train, dataset.embeddings,
                                batch_size=opt.batch_size,
                                num_neg_samples=opt.num_neg_samples)
test_provider = HNDataProvider(dataset.test, dataset.embeddings,
                               batch_size=opt.batch_size,
                               num_neg_samples=opt.num_neg_samples)

# net
opt.dim_feature = [sum(dataset.train.nums_type) - n
                   for n in dataset.train.nums_type]
net = hypergraph_network(opt)
# # if need to load model
# load_model(net, opt)

# optimizer
optimizer = torch.optim.RMSprop(net.parameters(), lr=opt.learning_rate)

# loss
BCE_loss = nn.BCELoss()
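# Sketch: what load_config above expects. Each non-empty line of the options
# file is prefixed with '--' and whitespace-split, so the file is simply
# "name value" pairs, one flag per line (reduce comes from functools on
# Python 3). The flag names other than --seed are illustrative only.
from functools import reduce

lines = ['seed 42\n', 'learning_rate 0.001\n']
tokens = reduce(lambda a, b: a + b,
                map(lambda x: ('--' + x).strip().split(), lines))
assert tokens == ['--seed', '42', '--learning_rate', '0.001']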
if evaluation:
    np.savetxt('eval_train_features.csv', train_features, delimiter=",")
    np.savetxt('eval_train_labels.csv', train_labels, delimiter=",")
    np.savetxt('eval_test_features.csv', test_features, delimiter=",")
else:
    np.savetxt('train_features.csv', train_features, delimiter=",")
    np.savetxt('train_labels.csv', train_labels, delimiter=",")
    np.savetxt('test_features.csv', test_features, delimiter=",")

print('Learning model...')
lr_model = model.learn_model(train_features, train_labels)
probabilities = model.apply_model(test_features, lr_model)

print('Preparing solution...')
if evaluation:
    solution.prepare_solutions_for_evaluation(tweets_test,
                                              probabilities[:, 0])
else:
    solution.prepare_solutions(tweets_test, probabilities[:, 0])


if __name__ == "__main__":
    evaluation = False

    print('Loading datasets...')
    if evaluation:
        tweets_train, tweets_test = dataset.read_evaluation_datasets()
    else:
        tweets_train, tweets_test = dataset.read_datasets()

    run(tweets_train, tweets_test)
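# Sketch (assumption): the model module is not shown in this excerpt. Given
# the lr_model name and the probability column used above, learn_model and
# apply_model plausibly wrap a scikit-learn logistic regression roughly like
# this; the real implementation may differ.
from sklearn.linear_model import LogisticRegression


def learn_model(train_features, train_labels):
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_features, train_labels)
    return clf


def apply_model(test_features, clf):
    # predict_proba returns one column per class; column 0 (the first class
    # in clf.classes_) is the one indexed by the caller above.
    return clf.predict_proba(test_features)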
def main():
    # Load the mention -> {entity: count} statistics produced by the dump
    # scan into character tries keyed by the (space-terminated) mention text.
    path = os.path.join(args.wiki_preprocess, 'prefix_counts.txt')
    prefix_counts = pygtrie.CharTrie()
    with open(path, 'r') as f:
        for line in f:
            obj = json.loads(line)
            assert obj['mention'] not in prefix_counts
            prefix_counts[obj['mention']] = obj['counts']

    path = os.path.join(args.wiki_preprocess, 'suffix_counts.txt')
    suffix_counts = pygtrie.CharTrie()
    with open(path, 'r') as f:
        for line in f:
            obj = json.loads(line)
            assert obj['mention'] not in suffix_counts
            suffix_counts[obj['mention']] = obj['counts']

    dummy = DummyVocab()
    datasets = read_datasets(args.dataroot, dummy, dummy, ctx_window=999)
    summarize_datasets(datasets)

    # For every mention, aggregate counts from all trie entries that extend
    # the mention text (forward and reversed), then keep its 100 most
    # frequent candidate entities.
    entity_vocab = set()
    for name, dataset in datasets.items():
        for mentions in tqdm(dataset, dynamic_ncols=True, desc="Top 100"):
            for mention in mentions:
                counts = dict()
                mention_text = RE_PUNCT.sub(' ', mention.text.lower()).strip()
                try:
                    for _, part_counts in prefix_counts.iteritems(
                            prefix=mention_text + " "):
                        for entity, count in part_counts.items():
                            counts[entity] = counts.get(entity, 0) + count
                except KeyError:
                    pass
                mention_text = mention_text[::-1]
                try:
                    for _, part_counts in suffix_counts.iteritems(
                            prefix=mention_text + " "):
                        for entity, count in part_counts.items():
                            counts[entity] = counts.get(entity, 0) + count
                except KeyError:
                    pass
                candidates = [
                    k for k, v in sorted(list(counts.items()),
                                         key=lambda x: x[1], reverse=True)]
                for entity in candidates[:100]:
                    entity_vocab.add(entity)

    entity_vocab = [(entity,) for entity in sorted(list(entity_vocab))]
    print(f'# Entity: {len(entity_vocab)}')

    # Resolve every candidate entity in parallel and append the responses to
    # mapping.txt as they arrive.
    with mp.Pool(processes=args.cpu) as pool:
        with tqdm(total=len(entity_vocab), dynamic_ncols=True) as pbar:
            response_path = os.path.join(args.wiki_preprocess, 'mapping.txt')
            with open(response_path, 'w') as f:
                results = pool.imap_unordered(process_entity, entity_vocab)
                for i, response in enumerate(results):
                    if response is not None:
                        f.write(
                            json.dumps(response, ensure_ascii=False) + "\n")
                    pbar.update()
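# Sketch (assumption): process_entity is not shown in these excerpts. Based on
# how mapping.txt is parsed in the next snippet (obj['entity'] plus a
# MediaWiki-style obj['response'] with response['query']['search'][0]['title']),
# it plausibly queries the Wikipedia search API for each entity string; the
# endpoint and error handling here are illustrative.
import requests

WIKI_API = 'https://en.wikipedia.org/w/api.php'


def process_entity(entity_tuple):
    entity, = entity_tuple  # entity_vocab holds 1-tuples (see above)
    try:
        response = requests.get(
            WIKI_API,
            params={'action': 'query', 'list': 'search',
                    'srsearch': entity, 'format': 'json'},
            timeout=10).json()
    except requests.RequestException:
        return None
    return {'entity': entity, 'response': response}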
def main():
    os.makedirs(args.wiki_preprocess, exist_ok=True)

    # Map unaccented entity strings to the real Wikipedia titles returned by
    # the search API (only when the titles differ solely by diacritics).
    mapping = dict()
    path = os.path.join(args.wiki_preprocess, 'mapping.txt')
    with open(path, 'r') as f, tqdm(f, dynamic_ncols=True,
                                    desc='Mapping table') as pbar:
        for line in pbar:
            obj = json.loads(line)
            entity = obj['entity']
            try:
                response = obj['response']
                if len(response['query']['search']) > 0:
                    real_entity = response['query']['search'][0]['title']
                    if real_entity != entity:
                        if unidecode(real_entity) == entity:
                            mapping[entity] = real_entity
                            # pbar.write(f'{entity} -> {real_entity}')
                        # elif len(real_entity) == len(entity):
                        #     mapping[entity] = real_entity
            except KeyError:
                pass

    # Reload the prefix/suffix count tries, rewriting entity names through the
    # mapping and merging counts that collapse onto the same title.
    path = os.path.join(args.wiki_preprocess, 'prefix_counts.txt')
    prefix_counts = pygtrie.CharTrie()
    with open(path, 'r') as f:
        for line in tqdm(f, dynamic_ncols=True, desc="Prefix counts"):
            obj = json.loads(line)
            assert obj['mention'] not in prefix_counts
            counts = dict()
            for entity, count in obj['counts'].items():
                if entity in mapping:
                    entity = mapping[entity]
                counts[entity] = counts.get(entity, 0) + count
            prefix_counts[obj['mention']] = counts

    path = os.path.join(args.wiki_preprocess, 'suffix_counts.txt')
    suffix_counts = pygtrie.CharTrie()
    with open(path, 'r') as f:
        for line in tqdm(f, dynamic_ncols=True, desc="Suffix counts"):
            obj = json.loads(line)
            assert obj['mention'] not in suffix_counts
            counts = dict()
            for entity, count in obj['counts'].items():
                if entity in mapping:
                    entity = mapping[entity]
                counts[entity] = counts.get(entity, 0) + count
            suffix_counts[obj['mention']] = counts

    dummy = DummyVocab()
    datasets = read_datasets(args.dataroot, dummy, dummy, ctx_window=999)
    summarize_datasets(datasets)

    for name, dataset in datasets.items():
        top50_num_2014 = 0
        top100_num_2014 = 0
        topinf_num_2014 = 0
        top50_num = 0
        top100_num = 0
        topinf_num = 0
        mention_num = 0
        for mentions in dataset:
            for mention in mentions:
                counts = dict()
                mention_text = RE_PUNCT.sub(' ', mention.text.lower()).strip()
                try:
                    for prefix, part_counts in prefix_counts.iteritems(
                            prefix=mention_text + " "):
                        for entity, count in part_counts.items():
                            counts[entity] = counts.get(entity, 0) + count
                except KeyError:
                    pass
                mention_text = mention_text[::-1]
                try:
                    # counts = suffix_counts[mention_text]
                    for suffix, part_counts in suffix_counts.iteritems(
                            prefix=mention_text + " "):
                        for entity, count in part_counts.items():
                            counts[entity] = counts.get(entity, 0) + count
                except KeyError:
                    pass

                # Recall of the original (201402xx) candidate lists at
                # 50 / 100 / unbounded cut-offs.
                if mention.label_index >= 0:
                    index = mention.label_index
                else:
                    index = len(mention.candidates)
                if index < len(mention.candidates):
                    if index < 50:
                        top50_num_2014 += 1
                        top100_num_2014 += 1
                        topinf_num_2014 += 1
                    elif index < 100:
                        top100_num_2014 += 1
                        topinf_num_2014 += 1
                    else:
                        topinf_num_2014 += 1

                # Recall of the new count-based (20210420) candidates.
                candidates = sorted(list(counts.items()),
                                    key=lambda x: x[1], reverse=True)
                candidates_text = [k for k, _ in candidates]
                total_freq = sum([f for _, f in candidates])
                try:
                    index = candidates_text.index(mention.label)
                except ValueError:
                    index = len(candidates_text)
                if index < len(candidates_text):
                    if index < 50:
                        top50_num += 1
                        top100_num += 1
                        topinf_num += 1
                    elif index < 100:
                        top100_num += 1
                        topinf_num += 1
                    else:
                        topinf_num += 1

                # Replace the mention's candidates with the 50 most frequent
                # entities, using relative frequency as the p_e_m prior.
                new_candidates = []
                for text, freq in candidates[:50]:
                    new_candidates.append(
                        Candidate(text=text,
                                  type_names=['UNK'],
                                  p_e_m=freq / total_freq))
                mention.set_candidates(new_candidates)
                if index < len(candidates_text) and index < 50:
                    assert mention.label_index == index
                else:
                    assert mention.label_index == -1
                mention_num += 1

        print(f"{name:10s} {'20210420'} {'201402xx'}")
        print(f'Top50 : {top50_num / mention_num:8.4f} '
              f'{top50_num_2014 / mention_num:8.4f}')
        print(f'Top100 : {top100_num / mention_num:8.4f} '
              f'{top100_num_2014 / mention_num:8.4f}')
        print(f'TopInf : {topinf_num / mention_num:8.4f} '
              f'{topinf_num_2014 / mention_num:8.4f}')

    summarize_datasets(datasets)

    with open('./data_2021_2021dca/datasets.json', 'w') as f:
        for name in datasets.keys():
            datasets[name] = datasets[name].to_json()
        json.dump(datasets, f, ensure_ascii=False)
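# Sketch (assumption): the Candidate container and Mention.set_candidates are
# defined elsewhere. The field names follow the keyword arguments used above,
# and the asserts imply set_candidates recomputes label_index against the new
# candidate list (-1 when the gold label is absent); shown here as a free
# function rather than the real method.
from collections import namedtuple

Candidate = namedtuple('Candidate', ['text', 'type_names', 'p_e_m'])


def set_candidates(mention, candidates):
    mention.candidates = candidates
    texts = [c.text for c in candidates]
    mention.label_index = (texts.index(mention.label)
                           if mention.label in texts else -1)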