Example #1
def main():
    os.makedirs(args.wiki_preprocess, exist_ok=True)

    dummy = DummyVocab()
    datasets = read_datasets(args.dataroot, dummy, dummy, ctx_window=999)
    summarize_datasets(datasets)

    prefix_vocab = pygtrie.CharTrie()
    suffix_vocab = pygtrie.CharTrie()
    for _, dataset in tqdm(
            datasets.items(), dynamic_ncols=True, desc="Build Tries"):
        for mentions in dataset:
            for mention in mentions:
                mention_text = RE_PUNCT.sub(' ', mention.text.lower()).strip()
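                # Keys carry a trailing space; the suffix trie stores the
                # reversed mention, so suffix lookups are later done as
                # prefix lookups over reversed text (see Examples #9, #10).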
                prefix_vocab[mention_text + " "] = True
                suffix_vocab[mention_text[::-1] + " "] = True
    print(f'Forward trie size: {len(prefix_vocab)}')
    print(f'Backward trie size: {len(suffix_vocab)}')

    paths = list(glob.glob(os.path.join(args.wiki_dump, '*.xml-*')))
    paths = sorted(
        paths,
        key=lambda p: int(os.path.basename(p).split('-')[4][11:-4]))
    params = [(path, prefix_vocab, suffix_vocab) for path in paths]

    prefix_count = dict()
    suffix_count = dict()
    total_pages = 0
    with mp.Pool(processes=args.cpu) as pool, \
            tqdm(total=len(paths), dynamic_ncols=True) as pbar:
        for i, res in enumerate(pool.imap_unordered(process_stream, params)):
            prefix_count_, suffix_count_, page_counter = res
            total_pages += page_counter
            update_counts(prefix_count, prefix_count_)
            update_counts(suffix_count, suffix_count_)
            pbar.write(
                f'pages: {total_pages}, '
                f'# forward: {len(prefix_count)}/{len(prefix_vocab)}, '
                f'# backward: {len(suffix_count)}/{len(suffix_vocab)}')
            pbar.update()
            if i % 10 == 0:
                dump(prefix_count, suffix_count)
            del prefix_count_, suffix_count_
    dump(prefix_count, suffix_count)
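
The snippet relies on a project-local update_counts helper that is not shown. Judging from Example #9, which reads the dumped files back as mention -> {entity: count} maps, a minimal sketch could look like the following; the nested-dict layout is an assumption, not the project's actual implementation:

def update_counts(total, partial):
    """Merge one worker's per-mention entity counts into the running totals."""
    for mention, entity_counts in partial.items():
        bucket = total.setdefault(mention, {})
        for entity, count in entity_counts.items():
            bucket[entity] = bucket.get(entity, 0) + count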
Example #2
 def __init__(
     self,
     model,
     validation_size,
     test_size,
     data_dir='data/',
     log_dir='log/',
 ):
     self.model = model
     self.data_dir = data_dir
     self.log_dir = log_dir + '/' + model.name + '/'
     self.validation_size = validation_size
     self.test_size = test_size
     self.data = dataset.read_datasets(data_dir, validation_size, test_size)
     self.learning_rate = tf.Variable(float(model.learning_rate),
                                      trainable=False)
     self.learning_rate_decay_op = self.learning_rate.assign(
         self.learning_rate * model.learning_rate_decay_factor)
     self.global_step = tf.Variable(0, name='global_step', trainable=False)
     self.dropout = tf.placeholder(tf.float32, name='keep_prob')
Example #3
 def __init__(self,
              name,
              datadir,
              validation_size=500,
              test_size=1,
              batch_size=100,
              learning_rate=0.01,
              learning_rate_decay_factor=.1,
              max_steps=1000,
              rnn_cell_size=64,
              num_rnn_layers=1,
              grad_clip=10,
              conv_filter_sizes=FilterSizes(16, 16, 16, 16),
              embedding_dims=dataset.EmbeddingSize(**{
                  'chars': 5,
                  'fonts': 3,
                  'fontsizes': 2,
                  'tokens': 10
              }),
              use_lstm=False,
              use_rnn_layer_norm=False,
              dropout_keep_prob=1.0):
     self.name = name
     self.data = dataset.read_datasets(datadir, validation_size, test_size)
     self.batch_size = batch_size
     self.learning_rate = learning_rate
     self.learning_rate_decay_factor = learning_rate_decay_factor
     self.grad_clip = grad_clip
     self.max_steps = max_steps
     self.rnn_cell_size = rnn_cell_size
     self.num_rnn_layers = num_rnn_layers
     self.feature_vocab_size = self.data.feature_vocab_size
     self.token_vocab_size = self.data.token_vocab_size
     self.filters = conv_filter_sizes
     self.embedding_dims = embedding_dims
     self.use_lstm = use_lstm
     self.use_rnn_layer_norm = use_rnn_layer_norm
     self.dropout_keep_prob = dropout_keep_prob
     self.feature_dim = (embedding_dims.chars + embedding_dims.fonts +
                         embedding_dims.fontsizes)
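
FilterSizes and dataset.EmbeddingSize are not defined in this excerpt. Based purely on how they are called above (four positional filter sizes; keyword fields chars, fonts, fontsizes, tokens), a minimal stand-in could be the sketch below; the FilterSizes field names are hypothetical:

from collections import namedtuple

# Hypothetical definitions matching the call sites above; the real project
# may define these differently.
FilterSizes = namedtuple('FilterSizes', ['conv1', 'conv2', 'conv3', 'conv4'])
EmbeddingSize = namedtuple('EmbeddingSize',
                           ['chars', 'fonts', 'fontsizes', 'tokens'])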
Example #4
def main():
    persons = read_datasets()
    rgb_data = persons['P7']['G9'][1]['rgb']
    depth_png = persons['P7']['G9'][1]['depth_png']
    extract_hand(rgb_data, depth_png)
Example #5
parser = argparse.ArgumentParser()
parser.add_argument('--dataroot', type=str, default='./data_2014_2021dca')
parser.add_argument('--wiki_preprocess',
                    type=str,
                    default='./preprocess_2014in2021/')
parser.add_argument('--wiki2vec',
                    type=str,
                    default='./wiki2vec/enwiki-20210420-vec-nolowercase')
args = parser.parse_args()

if __name__ == '__main__':
    os.makedirs(args.wiki_preprocess, exist_ok=True)

    dummy = DummyVocab()
    datasets = read_datasets(args.dataroot, dummy, dummy, ctx_window=999)
    summarize_datasets(datasets)

    word_vocab = set()
    entity_vocab = set()
    for name, dataset in tqdm(datasets.items(),
                              leave=False,
                              dynamic_ncols=True,
                              desc="Create word_vocab and entity_vocab"):
        for mentions in dataset:
            for mention in mentions:
                for word in mention.lctx_tokens + mention.rctx_tokens:
                    word = word.strip()
                    if len(word) > 0:
                        word_vocab.add(word)
                for candidate in mention.candidates:
Example #6
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    if args.mode != 'eval':
        os.makedirs(args.logdir)

    voca_emb_dir = os.path.join(args.dataroot, 'embedding')
    word_voca, word_embedding = load_voca_embs(
        voca_path=os.path.join(voca_emb_dir, 'word_vocab.txt'),
        embs_path=os.path.join(voca_emb_dir, 'word_embedding.npy'))
    entity_voca, entity_embedding = load_voca_embs(
        voca_path=os.path.join(voca_emb_dir, 'entity_vocab.txt'),
        embs_path=os.path.join(voca_emb_dir, 'entity_embedding.npy'))
    datasets = read_datasets(args.dataroot,
                             word_voca,
                             entity_voca,
                             ctx_window=args.ctx_window)
    datasets = prerank_dataset(args.dataroot,
                               datasets,
                               n_candidates=args.n_candidates,
                               n_topk=args.n_prerank_topk,
                               n_topk_p=args.n_prerank_topk_p,
                               ctx_window=args.prerank_ctx_window,
                               word_embedding=word_embedding,
                               entity_embedding=entity_embedding)
    summarize_datasets(datasets)

    ent_inlinks_dict = read_inlinks(args.dataroot, word_voca, entity_voca)

    config = {
        'hid_dims': args.hid_dims,
Example #7
parser.add_argument('--seed', type=int, help='random seed')


def load_config(config_file):
    with open(config_file, 'r') as f:
        args = parser.parse_args(
            reduce(lambda a, b: a + b,
                   map(lambda x: ('--' + x).strip().split(), f.readlines())))
    return args

opt = parser.parse_args()
if opt.options is not None:
    opt = load_config(opt.options)
if opt.seed is not None:
    np.random.seed(opt.seed)

# dataset
dataset = read_datasets(opt.data_path)
print(dataset)
train_provider = HNDataProvider(dataset.train, dataset.embeddings,
                                batch_size=opt.batch_size,
                                num_neg_samples=opt.num_neg_samples)
test_provider = HNDataProvider(dataset.test, dataset.embeddings,
                               batch_size=opt.batch_size,
                               num_neg_samples=opt.num_neg_samples)

# net
opt.dim_feature = [sum(dataset.train.nums_type)-n for n in dataset.train.nums_type]
net = hypergraph_network(opt)
# # if need to load model
# load_model(net, opt)

# optimizer
optimizer = torch.optim.RMSprop(net.parameters(), lr=opt.learning_rate)

# loss
BCE_loss = nn.BCELoss()
Example #8
    if evaluation:
        np.savetxt('eval_train_features.csv', train_features, delimiter=",")
        np.savetxt('eval_train_labels.csv', train_labels, delimiter=",")
        np.savetxt('eval_test_features.csv', test_features, delimiter=",")
    else:
        np.savetxt('train_features.csv', train_features, delimiter=",")
        np.savetxt('train_labels.csv', train_labels, delimiter=",")
        np.savetxt('test_features.csv', test_features, delimiter=",")

    print('Learning model...')
    lr_model = model.learn_model(train_features, train_labels)
    probabilities = model.apply_model(test_features, lr_model)

    print('Preparing solution...')
    if evaluation:
        solution.prepare_solutions_for_evaluation(tweets_test, probabilities[:, 0])
    else:
        solution.prepare_solutions(tweets_test, probabilities[:, 0])


if __name__ == "__main__":
    evaluation = False

    print('Loading datasets...')
    if evaluation:
        tweets_train, tweets_test = dataset.read_evaluation_datasets()
    else:
        tweets_train, tweets_test = dataset.read_datasets()
    run(tweets_train, tweets_test)
Example #9
def main():
    path = os.path.join(args.wiki_preprocess, 'prefix_counts.txt')
    prefix_counts = pygtrie.CharTrie()
    with open(path, 'r') as f:
        for line in f:
            obj = json.loads(line)
            assert obj['mention'] not in prefix_counts
            prefix_counts[obj['mention']] = obj['counts']

    path = os.path.join(args.wiki_preprocess, 'suffix_counts.txt')
    suffix_counts = pygtrie.CharTrie()
    with open(path, 'r') as f:
        for line in f:
            obj = json.loads(line)
            assert obj['mention'] not in suffix_counts
            suffix_counts[obj['mention']] = obj['counts']

    dummy = DummyVocab()
    datasets = read_datasets(args.dataroot, dummy, dummy, ctx_window=999)
    summarize_datasets(datasets)

    entity_vocab = set()
    for name, dataset in datasets.items():
        for mentions in tqdm(dataset, dynamic_ncols=True, desc="Top 100"):
            for mention in mentions:
                counts = dict()

                mention_text = RE_PUNCT.sub(' ', mention.text.lower()).strip()
                try:
                    for _, part_counts in prefix_counts.iteritems(
                            prefix=mention_text + " "):
                        for entity, count in part_counts.items():
                            counts[entity] = counts.get(entity, 0) + count
                except KeyError:
                    pass

                mention_text = mention_text[::-1]
                try:
                    for _, part_counts in suffix_counts.iteritems(
                            prefix=mention_text + " "):
                        for entity, count in part_counts.items():
                            counts[entity] = counts.get(entity, 0) + count
                except KeyError:
                    pass

                candidates = [
                    k for k, v in
                    sorted(list(counts.items()),
                           key=lambda x: x[1], reverse=True)]
                for entity in candidates[:100]:
                    entity_vocab.add(entity)

    entity_vocab = [(entity,) for entity in sorted(list(entity_vocab))]
    print(f'# Entity: {len(entity_vocab)}')

    with mp.Pool(processes=args.cpu) as pool:
        with tqdm(total=len(entity_vocab), dynamic_ncols=True) as pbar:
            response_path = os.path.join(args.wiki_preprocess, 'mapping.txt')
            with open(response_path, 'w') as f:
                results = pool.imap_unordered(process_entity, entity_vocab)
                for i, response in enumerate(results):
                    if response is not None:
                        f.write(
                            json.dumps(response, ensure_ascii=False) + "\n")
                    pbar.update()
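
process_entity is project-local and not shown here. Example #10 parses its output as a dict with an 'entity' key and a 'response' whose layout matches a MediaWiki search result (response['query']['search'][0]['title']), so a minimal sketch, assuming the standard English Wikipedia search API and the requests library, might be:

import requests

WIKI_API = 'https://en.wikipedia.org/w/api.php'  # assumed endpoint

def process_entity(item):
    # item is a 1-tuple, matching the (entity,) rows built above.
    entity, = item
    try:
        response = requests.get(WIKI_API, params={
            'action': 'query',
            'list': 'search',
            'srsearch': entity,
            'format': 'json',
        }, timeout=10).json()
    except requests.RequestException:
        return None
    return {'entity': entity, 'response': response}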
Example #10
def main():
    os.makedirs(args.wiki_preprocess, exist_ok=True)
    mapping = dict()
    path = os.path.join(args.wiki_preprocess, 'mapping.txt')
    with open(path, 'r') as f, tqdm(f,
                                    dynamic_ncols=True,
                                    desc='Mapping table') as pbar:
        for line in pbar:
            obj = json.loads(line)
            entity = obj['entity']
            try:
                response = obj['response']
                if len(response['query']['search']) > 0:
                    real_entity = response['query']['search'][0]['title']
                    if real_entity != entity:
                        if unidecode(real_entity) == entity:
                            mapping[entity] = real_entity
                            # pbar.write(f'{entity} -> {real_entity}')
                        # elif len(real_entity) == len(entity):
                        #     mapping[entity] = real_entity
            except KeyError:
                pass

    path = os.path.join(args.wiki_preprocess, 'prefix_counts.txt')
    prefix_counts = pygtrie.CharTrie()
    with open(path, 'r') as f:
        for line in tqdm(f, dynamic_ncols=True, desc="Prefix counts"):
            obj = json.loads(line)
            assert obj['mention'] not in prefix_counts
            counts = dict()
            for entity, count in obj['counts'].items():
                if entity in mapping:
                    entity = mapping[entity]
                    # print('???')
                counts[entity] = counts.get(entity, 0) + count
            prefix_counts[obj['mention']] = counts

    path = os.path.join(args.wiki_preprocess, 'suffix_counts.txt')
    suffix_counts = pygtrie.CharTrie()
    with open(path, 'r') as f:
        for line in tqdm(f, dynamic_ncols=True, desc="Suffix counts"):
            obj = json.loads(line)
            assert obj['mention'] not in suffix_counts
            counts = dict()
            for entity, count in obj['counts'].items():
                if entity in mapping:
                    entity = mapping[entity]
                counts[entity] = counts.get(entity, 0) + count
            suffix_counts[obj['mention']] = counts

    dummy = DummyVocab()
    datasets = read_datasets(args.dataroot, dummy, dummy, ctx_window=999)
    summarize_datasets(datasets)

    for name, dataset in datasets.items():
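        # Recall counters: the *_2014 variants score the candidate lists
        # shipped with the dataset (201402xx dump), while the plain ones
        # score candidates rebuilt from the 20210420 prefix/suffix counts.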
        top50_num_2014 = 0
        top100_num_2014 = 0
        topinf_num_2014 = 0
        top50_num = 0
        top100_num = 0
        topinf_num = 0
        mention_num = 0
        for mentions in dataset:
            for mention in mentions:
                counts = dict()

                mention_text = RE_PUNCT.sub(' ', mention.text.lower()).strip()
                try:
                    for prefix, part_counts in prefix_counts.iteritems(
                            prefix=mention_text + " "):
                        for entity, count in part_counts.items():
                            counts[entity] = counts.get(entity, 0) + count
                except KeyError:
                    pass

                mention_text = mention_text[::-1]
                try:
                    # counts = suffix_counts[mention_text]
                    for suffix, part_counts in suffix_counts.iteritems(
                            prefix=mention_text + " "):
                        for entity, count in part_counts.items():
                            counts[entity] = counts.get(entity, 0) + count
                except KeyError:
                    pass

                if mention.label_index >= 0:
                    index = mention.label_index
                else:
                    index = len(mention.candidates)
                if index < len(mention.candidates):
                    if index < 50:
                        top50_num_2014 += 1
                        top100_num_2014 += 1
                        topinf_num_2014 += 1
                    elif index < 100:
                        top100_num_2014 += 1
                        topinf_num_2014 += 1
                    else:
                        topinf_num_2014 += 1

                candidates = sorted(list(counts.items()),
                                    key=lambda x: x[1],
                                    reverse=True)
                candidates_text = [k for k, _ in candidates]
                total_freq = sum([f for _, f in candidates])
                try:
                    index = candidates_text.index(mention.label)
                except ValueError:
                    index = len(candidates_text)

                if index < len(candidates_text):
                    if index < 50:
                        top50_num += 1
                        top100_num += 1
                        topinf_num += 1
                    elif index < 100:
                        top100_num += 1
                        topinf_num += 1
                    else:
                        topinf_num += 1

                new_candidates = []
                for text, freq in candidates[:50]:
                    new_candidates.append(
                        Candidate(text=text,
                                  type_names=['UNK'],
                                  p_e_m=freq / total_freq))
                mention.set_candidates(new_candidates)
                if index < len(candidates_text) and index < 50:
                    assert mention.label_index == index
                else:
                    assert mention.label_index == -1

                mention_num += 1
        print(f"{name:10s}  {'20210420'} {'201402xx'}")
        print(f'Top50     : {top50_num / mention_num:8.4f} '
              f'{top50_num_2014 / mention_num:8.4f}')
        print(f'Top100    : {top100_num / mention_num:8.4f} '
              f'{top100_num_2014 / mention_num:8.4f}')
        print(f'TopInf    : {topinf_num / mention_num:8.4f} '
              f'{topinf_num_2014 / mention_num:8.4f}')

    summarize_datasets(datasets)
    with open('./data_2021_2021dca/datasets.json', 'w') as f:
        for name in datasets.keys():
            datasets[name] = datasets[name].to_json()
        json.dump(datasets, f, ensure_ascii=False)