def get_all_topk_qlist(corpus_dir):
    # Collect all questions created in 2014 or later from the sharded corpus pickles.
    parallel_list = [
        "0-1000000", "1000000-2000000", "2000000-3000000", "3000000-4000000",
        "4000000-5000000", "5000000-6000000", "6000000-7000000",
        "7000000-8000000", "8000000-9000000", "9000000-10000000",
        "10000000-11000000", "11000000-12000000", "12000000-13000000",
        "13000000-14000000", "14000000-15000000", "15000000-16000000",
        "16000000-None"
    ]
    qlist = list()
    for p in parallel_list:
        target_corpus_fpath = corpus_dir + os.sep + "_2_corpus-without-Raretag-%s.pkl" % p
        for q in load_pickle(target_corpus_fpath):
            yy = int(q.creation_date.split('-')[0])
            if yy >= 2014:
                qlist.append(q)
    print("# all qlist = %s" % len(qlist), get_current_time())
    return qlist
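The snippets below call load_pickle and get_current_time without defining them; a minimal sketch of plausible implementations, assuming load_pickle returns a single pickled object and get_current_time returns a timestamp string:

import pickle
from datetime import datetime

def load_pickle(fpath):
    # Assumed helper: read and return one pickled object from fpath.
    with open(fpath, "rb") as f:
        return pickle.load(f)

def get_current_time():
    # Assumed helper: timestamp string used in progress prints.
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")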
Example #2
    ts_dir = dataset_dir + os.sep + "ts%s" % ts
    ts_parallel_dir = ts_dir + os.sep + "parallel"

    sample_K = "test100000"
    sample_K_dir = ts_dir + os.sep + "data-%s" % sample_K
    sample_K_train_dir = sample_K_dir + os.sep + "train"

    corpus_dir = os.path.join(sample_K_dir, "corpus")
    if not os.path.exists(corpus_dir):
        os.mkdir(corpus_dir)

    # use training data
    qlist = list()
    for f in os.listdir(sample_K_train_dir):
        fpath = os.path.join(sample_K_train_dir, f)
        qlist += load_pickle(fpath)
        print("# qlist = %s" % len(qlist))

    title_corpus_fpath = os.path.join(corpus_dir, "title_corpus.txt")
    if not os.path.exists(title_corpus_fpath):
        build_corpus(qlist, ["title"], title_corpus_fpath)
    else:
        print("title Corpus already exist.")

    desc_text_corpus_fpath = os.path.join(corpus_dir, "desc_text_corpus.txt")
    if not os.path.exists(desc_text_corpus_fpath):
        build_corpus(qlist, ["desc_text"], desc_text_corpus_fpath)
    else:
        print("desc_text Corpus already exist.")

    desc_code_corpus_fpath = os.path.join(corpus_dir, "desc_code_corpus.txt")
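build_corpus is not shown in these fragments; judging from the calls above, it writes one text line per question for the requested fields. A hypothetical sketch under that assumption:

def build_corpus(qlist, fields, out_fpath):
    # Hypothetical sketch: write one whitespace-joined line per question,
    # reading each requested field (e.g. "title", "desc_text") as an attribute.
    with open(out_fpath, "w", encoding="utf-8") as fout:
        for q in qlist:
            parts = [str(getattr(q, field)) for field in fields]
            fout.write(" ".join(parts) + "\n")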
Example #3
ts_dir = dataset_dir + os.sep + "ts%s" % ts
# sample_K dir
sample_K = "test100000"
sample_K_dir = ts_dir + os.sep + "data-%s" % sample_K
vocab_dir = os.path.join(sample_K_dir, "vocab")

app_name = "tagrcnn"
app_dir = os.path.join(sample_K_dir, "approach", app_name)
snapshot_dir = os.path.join(app_dir, "snapshot")
if not os.path.exists(snapshot_dir):
    print("snapshot %s not exist!" % snapshot_dir)
    exit()

# input files
text_vocab_fpath = os.path.join(vocab_dir, "title_desc_text_vocab.pkl")
text_vocab = load_pickle(text_vocab_fpath)
text_vocab = vocab_to_index_dict(vocab=text_vocab, ifpad=True)

tag_vocab_fpath = os.path.join(vocab_dir, "tag_vocab.pkl")
tag_vocab = load_pickle(tag_vocab_fpath)
tag_vocab = vocab_to_index_dict(vocab=tag_vocab, ifpad=False)

# basic path
test_dir = sample_K_dir + os.sep + "test"
print("Setting:\ntask : %s\ndataset : %s\nts : %s\n" % (task, dataset, ts))
snapshot_name = "04-01-19_14-41-13"
checkpoint_dir = os.path.join(app_dir, "snapshot", snapshot_name, "checkpoints")
if not os.path.exists(checkpoint_dir):
    print("check point dir %s not exist!" % checkpoint_dir)
    exit()
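vocab_to_index_dict is also an external helper; based on the ifpad flag used above, a plausible sketch is a token-to-index map that optionally reserves index 0 for padding (the "<PAD>" token name is an assumption):

def vocab_to_index_dict(vocab, ifpad):
    # Assumed behaviour: map each vocabulary token to an integer index,
    # reserving index 0 for a padding token when ifpad is True.
    index_dict = {"<PAD>": 0} if ifpad else {}
    offset = len(index_dict)
    for i, token in enumerate(vocab):
        index_dict[token] = i + offset
    return index_dict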
Example #4
    help='comma-separated kernel sizes to use for convolution')
############################################################################

args = parser.parse_args()
# initialization
# maximum sequence lengths (hard-coded; the commented line would load them from len.pkl instead)
# len_dict = load_pickle(len_dict_fpath)
len_dict = dict()
len_dict["max_title_len"] = 100
len_dict["max_desc_text_len"] = 1000
len_dict["max_desc_code_len"] = 1000
args.max_title_len = len_dict["max_title_len"]
args.max_desc_text_len = len_dict["max_desc_text_len"]
args.max_desc_code_len = len_dict["max_desc_code_len"]
# title vocab
title_vocab = load_pickle(title_vocab_fpath)
title_vocab = vocab_to_index_dict(vocab=title_vocab, ifpad=True)
args.title_embed_num = len(title_vocab)

# desc_text vocab
desc_text_vocab = load_pickle(desc_text_vocab_fpath)
desc_text_vocab = vocab_to_index_dict(vocab=desc_text_vocab, ifpad=True)
args.desc_text_embed_num = len(desc_text_vocab)

# desc_code_vocab
desc_code_vocab = load_pickle(desc_code_vocab_fpath)
desc_code_vocab = vocab_to_index_dict(vocab=desc_code_vocab, ifpad=True)
args.desc_code_embed_num = len(desc_code_vocab)

# tag vocab
tag_vocab = load_pickle(tag_vocab_fpath)
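The argparse option at the top of this example takes comma-separated kernel sizes as a single string; models in this style usually split it into a list of integers before building the convolution layers. A sketch of that conversion (the attribute name args.kernel_sizes is an assumption, not shown in the fragment):

def parse_kernel_sizes(kernel_sizes_str):
    # Assumed pattern: turn "2,3,4" into [2, 3, 4] for the convolution layers.
    return [int(k) for k in kernel_sizes_str.split(",")]

# e.g. args.kernel_sizes = parse_kernel_sizes(args.kernel_sizes)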
Example #5
# initialization: vocab and length file paths
len_dict_fpath = os.path.join(vocab_dir, "len.pkl")
title_vocab_fpath = os.path.join(vocab_dir, "title_vocab.pkl")
desc_text_vocab_fpath = os.path.join(vocab_dir, "desc_text_vocab.pkl")
desc_code_vocab_fpath = os.path.join(vocab_dir, "desc_code_vocab.pkl")
tag_vocab_fpath = os.path.join(vocab_dir, "tag_vocab.pkl")

# maximum sequence lengths (hard-coded; the commented line would load them from len.pkl instead)
# len_dict = load_pickle(len_dict_fpath)
len_dict = dict()
len_dict["max_title_len"] = 100
len_dict["max_desc_text_len"] = 1000
len_dict["max_desc_code_len"] = 1000

# title vocab
title_vocab = load_pickle(title_vocab_fpath)
title_vocab = vocab_to_index_dict(vocab=title_vocab, ifpad=True)

# desc_text vocab
desc_text_vocab = load_pickle(desc_text_vocab_fpath)
desc_text_vocab = vocab_to_index_dict(vocab=desc_text_vocab, ifpad=True)

# desc_code_vocab
desc_code_vocab = load_pickle(desc_code_vocab_fpath)
desc_code_vocab = vocab_to_index_dict(vocab=desc_code_vocab, ifpad=True)

# tag vocab
tag_vocab = load_pickle(tag_vocab_fpath)
tag_vocab = vocab_to_index_dict(vocab=tag_vocab, ifpad=False)

# predict
Example #6
# basic path
print("Setting:\ntask : %s\ndataset : %s\nts : %s\n" % (task, dataset, ts))
#################################################################################

# predict
test_dir = os.path.join(simple_K_dir, "test")
# get sample test data
sample_size = 20000
sample_cnt = 10
all_test_data = list()
sample_test_data_dir = os.path.join(simple_K_dir, "sample_test")

if not os.path.exists(sample_test_data_dir):
    os.mkdir(sample_test_data_dir)
elif len(os.listdir(sample_test_data_dir)) > 0:
    print("sample test data is not empty!")
    exit()

for f in sorted(os.listdir(test_dir)):
    test_data_fpath = os.path.join(test_dir, f)
    test_data = load_pickle(test_data_fpath)
    all_test_data += test_data

for i in range(sample_cnt):
    sample_test_data = random.sample(all_test_data, sample_size)

    sample_test_data_fpath = os.path.join(sample_test_data_dir, "%s_sampled_test_data_%s.pkl" % (i, sample_size))
    save_pickle(sample_test_data, sample_test_data_fpath)
    print("#sample test = %s" % len(sample_test_data))
Example #7
# sample_K dir
sample_K = "test100000"
sample_K_dir = ts_dir + os.sep + "data-%s" % sample_K
vocab_dir = os.path.join(sample_K_dir, "vocab")

app_name = "tagrcnn"
app_dir = os.path.join(sample_K_dir, "approach", app_name)
if not os.path.exists(app_dir):
    os.mkdir(app_dir)
snapshot_dir = os.path.join(app_dir, "snapshot")
if not os.path.exists(snapshot_dir):
    os.mkdir(snapshot_dir)

# input files
text_vocab_fpath = os.path.join(vocab_dir, "title_desc_text_vocab.pkl")
text_vocab = load_pickle(text_vocab_fpath)
text_vocab = vocab_to_index_dict(vocab=text_vocab, ifpad=True)

tag_vocab_fpath = os.path.join(vocab_dir, "tag_vocab.pkl")
tag_vocab = load_pickle(tag_vocab_fpath)
tag_vocab = vocab_to_index_dict(vocab=tag_vocab, ifpad=False)

# basic path
train_dir = sample_K_dir + os.sep + "train"
print("Setting:\ntask : %s\ndataset : %s\nts : %s\n" % (task, dataset, ts))
#################################################################################

# Parameters
# ==================================================
# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1,