Example #1
def load_args(snapshot_dir):
    ############################ model arguments settings ############################
    parser = argparse.ArgumentParser(description='Multi-label Classifier based on Multi-component')
    args = parser.parse_args()
    arg_dict = args.__dict__

    # load arguments from arg.json
    print("Processing snapshot %s" % (snapshot_dir), get_current_time())
    arg_json = load_json(os.path.join(snapshot_dir, "args.json"))
    for key, val in arg_json.items():
        try:
            if key == "device":
                val = -1
            if key != "model_selection":
                val = eval(val)
        except Exception as e:
            pass
        finally:
            arg_dict[key] = val

    arg_dict["train"] = None

    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("Loaded args.", get_current_time())

    return args
Example #2
def load_model(args, param_fpath):
    # model selection
    if args.model_selection == 'all':
        from main.tag_rec.approaches.post2vec.models.model_all import MultiComp

        model = MultiComp(args)
    elif args.model_selection == 'title':
        from main.tag_rec.approaches.post2vec.models.model_title import MultiComp

        model = MultiComp(args)
    elif args.model_selection == 'title_desc_text':
        from main.tag_rec.approaches.post2vec.models.model_title_desc_text import MultiComp

        model = MultiComp(args)
    else:
        print("No such model!")
        exit()

    print("Inited model %s use param %s." % (args.model_selection, param_fpath), get_current_time())

    model.load_state_dict(torch.load(param_fpath))

    if args.cuda:
        torch.cuda.set_device(-1)
        model = model.cuda()

    print("Loaded model.", get_current_time())

    return model
Example #3
def load_corpus_csv(corpus_fpath):
    """
    return SOQuestion list from corpus
    :param path_file:
    :return:
    """
    import pandas as pd

    print("Loading corpus %s" % corpus_fpath, get_current_time())
    data = list()
    line_count = 0
    for index, row in pd.read_csv(corpus_fpath).iterrows():
        # ["id","title","desc_text","desc_code","creation_date","tags"]
        qid = row["id"]
        title = ast.literal_eval(row["title"])
        desc_text = ast.literal_eval(row["desc_text"])
        desc_code = ast.literal_eval(row["desc_code"])
        creation_date = row["creation_date"]
        tags = ast.literal_eval(row["tags"])
        soq = Question(qid, title, desc_text, desc_code, creation_date, tags)
        data.append(soq)
        line_count += 1
        if line_count % 10000 == 0:
            print("Loaded %d instances..." % line_count, get_current_time())
    print('Processed {%s} lines.' % line_count, get_current_time())
    return data
Example #4
def build_len_dict(qlist):
    print("Building vocab...", get_current_time())

    # leng_fpath
    title_len_list = list()
    desc_text_len_list = list()
    desc_code_len_list = list()

    sent_num = 0
    for q in qlist:
        title_len_list.append(len(q.title))
        desc_text_len_list.append(len(q.desc_text))
        desc_code_len_list.append(len(q.desc_code))

        sent_num += 1
        if sent_num % 10000 == 0:
            print("Processing %s question..." % sent_num, get_current_time())

    # len
    len_dict = dict()
    len_dict["max_title_len"] = max(title_len_list) if max(title_len_list) < 100 else 100
    len_dict["max_desc_text_len"] = max(desc_text_len_list) if max(desc_text_len_list) < 1000 else 1000
    len_dict["max_desc_code_len"] = max(desc_code_len_list) if max(desc_code_len_list) < 1000 else 1000

    print("Processed %s questions." % sent_num, get_current_time())
    return len_dict
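A minimal usage sketch of the length capping above, using a hypothetical namedtuple stand-in for the project's Question objects (and assuming get_current_time from the same module is importable):

from collections import namedtuple

# Hypothetical stand-in object, for illustration only.
Q = namedtuple("Q", ["title", "desc_text", "desc_code"])
demo_qlist = [Q(["w"] * 12, ["w"] * 3000, ["w"] * 40)]
len_dict = build_len_dict(demo_qlist)
# -> {'max_title_len': 12, 'max_desc_text_len': 1000, 'max_desc_code_len': 40}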
Example #5
def train(train_iter, dev_iter, model, args, global_train_step):
    if args.cuda:
        model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    steps = 0
    model.train()
    for epoch in range(1, args.epochs + 1):
        print("\n#epoch %s" % epoch, get_current_time())
        for batch in train_iter:
            # features
            t = np.array(get_specific_comp_list("title", batch))
            dt = np.array(get_specific_comp_list("desc_text", batch))
            dc = np.array(get_specific_comp_list("desc_code", batch))
            # label
            target = get_specific_comp_list("tags", batch)

            t = torch.tensor(t).long()
            dt = torch.tensor(dt).long()
            dc = torch.tensor(dc).long()
            target = torch.tensor(target).float()

            if args.cuda:
                t, dt, dc, target = t.cuda(), dt.cuda(), dc.cuda(), target.cuda()

            optimizer.zero_grad()
            logit = model(t, dt, dc)
            # debug
            # print("logit.reshape(-1) shape %s"%logit.reshape(-1).shape)
            # print("logit.reshape(-1).[:10] %s"%logit.reshape(-1)[:10])
            # print("target.reshape(-1) shape %s"%target.reshape(-1).shape)
            # print("target.reshape(-1)[:10] %s"%target.reshape(-1)[:10])
            criterion = nn.BCELoss()
            loss = criterion(logit.reshape(-1), target.reshape(-1))

            loss.backward()
            optimizer.step()

            steps += 1
            global_train_step += 1
            if steps % args.log_interval == 0:
                sys.stdout.write('\rBatch[{}] - loss: {:.10f}'.format(steps, loss))
            if global_train_step % args.save_interval == 0:
                print("\nglobal_train_step {} - step {} - loss {:.10f}".format(global_train_step, steps, loss),
                      get_current_time())
                save(model, args.save_dir, 'snapshot', global_train_step)
    return model, global_train_step
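One caveat worth flagging for the loop above: nn.BCELoss expects probabilities in [0, 1], so the model's forward pass must already end with a sigmoid; if it returned raw logits, nn.BCEWithLogitsLoss would be the numerically safer equivalent. A minimal sketch with hypothetical tensors (not the repository's data):

import torch
import torch.nn as nn

logits = torch.randn(4, 10)                    # raw model scores
target = torch.randint(0, 2, (4, 10)).float()  # multi-hot labels

loss_a = nn.BCELoss()(torch.sigmoid(logits), target)  # what the code above assumes
loss_b = nn.BCEWithLogitsLoss()(logits, target)       # same value, more stable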
Example #6
def build_tag_vacab(tag):
    print("Building vocabulary...", get_current_time())
    new_tags = list()
    for t in tag:
        new_tags += t
    new_tags = sorted(list(set(new_tags)))
    return new_tags
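A minimal usage sketch (assuming get_current_time from the same module is importable): each inner list holds one question's tags, and the result is a sorted, de-duplicated tag list.

tags_per_question = [["python", "pandas"], ["python", "numpy"]]
tag_vocab = build_tag_vacab(tags_per_question)
# -> ['numpy', 'pandas', 'python']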
Example #7
def build_w2v_model(corpus_fpath, model_fpath):
    print('training %s' % corpus_fpath, get_current_time())
    # size is the dimensionality of the feature vectors.
    # window is the maximum distance between the current and predicted word within a sentence.
    # min_count = ignore all words with total frequency lower than this.
    # workers = use this many worker threads to train the model (=faster training with multicore machines).
    sentences = LineSentence(corpus_fpath)
    model = Word2Vec(sentences, size=200, workers=10, min_count=1)

    model.save(model_fpath)

    # vocab = dict()
    # wlist = model.wv.index2word
    # for i in range(len(wlist)):
    #     vocab[wlist[i]] = i
    #
    # save_pickle(vocab, vocab_fpath)
    print('end time : ', get_current_time())
Example #8
def build_tag_vocab(qlist):
    print("Building vocab...", get_current_time())

    tag_vocab = set()

    sent_num = 0
    for q in qlist:

        # tags
        for t in q.tags:
            if t not in tag_vocab:
                tag_vocab.add(t)

        sent_num += 1
        if sent_num % 10000 == 0:
            print("Processing %s question..." % sent_num, get_current_time())

    print("Processed %s questions." % sent_num, get_current_time())
    return tag_vocab
Example #9
def build_corpus(all_fpath, rare_tags, corpus_fpath):
    import pandas as pd
    print("Building raw corpus and doing some pre processing...")
    cnt = 0
    filter_cnt = 0
    df = pd.read_csv(all_fpath)
    q_list = list()
    for idx, row in df.iterrows():
        try:
            qid = row['id']
            title = ast.literal_eval(row['title'])
            desc_text = ast.literal_eval(row['desc_text'])
            desc_code = ast.literal_eval(row['desc_code'])
            creation_date = row['creation_date']
            tags = ast.literal_eval(row['tags'])
            # remove rare tags
            clean_tags = list(set(tags) - set(rare_tags))

            if len(clean_tags) == 0:
                filter_cnt += 1
                continue
            try:
                q_list.append(
                    Question(qid, title, desc_text, desc_code, creation_date,
                             clean_tags))
                cnt += 1
            except Exception as e:
                print("Skip id=%s" % qid)
                print("Error msg: %s" % e)

            if cnt % 10000 == 0:
                print(
                    "Writing %d instances, filter %d instances..." %
                    (cnt, filter_cnt), get_current_time())
        except Exception as e:
            print("Skip qid %s because %s" % (qid, e))
            filter_cnt += 1

    save_pickle(q_list, corpus_fpath)
    print("Write %s lines successfully." % cnt)
    print("Corpus building sucessfully! %s" % corpus_fpath,
          get_current_time() + '\n')
Example #10
def load_tag_cnt(tag_cnt_dict_fpath):
    print("loading tag cnt dict...", get_current_time())
    tag_cnt_dict = {}
    with open(tag_cnt_dict_fpath) as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        header = next(reader)
        for row in reader:
            tag = row[0]
            cnt = row[1]
            tag_cnt_dict[tag] = int(cnt)
    return tag_cnt_dict
Example #11
def build_vocab(text):
    print("Building vocabulary...", get_current_time())
    new_text = list()
    for t in text:
        new_text += t
    new_text = sorted(list(set(new_text)))
    dictionary = dict()
    for i in range(len(new_text)):
        dictionary[new_text[i]] = i
    dictionary['<PAD>'] = len(new_text)
    return dictionary
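A minimal usage sketch (again assuming get_current_time is importable): token lists are flattened, de-duplicated, sorted, indexed from 0, and '<PAD>' is appended as the last index.

texts = [["how", "to", "sort"], ["sort", "a", "list"]]
vocab = build_vocab(texts)
# -> {'a': 0, 'how': 1, 'list': 2, 'sort': 3, 'to': 4, '<PAD>': 5}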
Example #12
def build_corpus(qlist, comp_list, corpus_fpath):
    corpus_f = open(corpus_fpath, "w")
    skip_cnt = 0

    cnt = 0
    print("processing %s" % fpath, get_current_time())
    for q in qlist:
        try:
            for comp in comp_list:
                str_tmp = ' '.join(q.get_comp_by_name(comp)).strip()
                if str_tmp.strip() == '':
                    continue
                corpus_f.write(str_tmp + '\n')
            cnt += 1
            if cnt % 50000 == 0:
                print("Processed %s questions." % cnt, get_current_time())
        except Exception as e:
            skip_cnt += 1
            print("Skip %s because %s" % (skip_cnt, e))

    corpus_f.close()
    print("corpus %s building finished." % corpus_fpath)
Example #13
def build_tf_idf_vocab(comp_list, qlist):
    min_count = len(qlist) / 100000
    if min_count > 50:
        min_count = 50
    print("comp list %s, min count %s" % (comp_list, min_count))
    word_dict = dict()
    sent_num = 0
    print("Computing tf-idf...", get_current_time())
    for q in qlist:
        sent_num += 1
        cur_word_set = set()
        comp_word_list = list()
        for comp in comp_list:
            comp_word_list += q.get_comp_by_name(comp)
        for w in comp_word_list:
            if w not in cur_word_set:
                cur_word_set.add(w)
                if w not in word_dict:
                    word_dict[w] = {"tf": 1, "idf": 1}
                else:
                    word_dict[w]["tf"] += 1
                    word_dict[w]["idf"] += 1
            else:
                word_dict[w]["tf"] += 1
        if sent_num % 10000 == 0:
            print("Processed %s questions component %s." % (sent_num, comp_list), get_current_time())

    for w in word_dict.copy().keys():
        if word_dict[w]["tf"] >= min_count:
            tf = word_dict[w]["tf"]
            word_dict[w]["idf"] = math.log(sent_num / float(word_dict[w]["idf"]))
            idf = word_dict[w]["idf"]
            word_dict[w]["tfidf"] = tf * idf
        else:
            word_dict.pop(w)

    print("# %s dict = %s" % (comp_list, len(word_dict)))
    return word_dict
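The weighting applied above is the classic tf-idf with corpus-level term counts: for a word w that occurs tf(w) times in total across df(w) of the N questions, tfidf(w) = tf(w) * log(N / df(w)), and words whose tf(w) falls below min_count are dropped. A small worked example with made-up statistics:

import math

N, tf, df = 100000, 500, 2000          # hypothetical corpus statistics
tfidf = tf * math.log(N / float(df))   # 500 * log(50) ~= 1956.0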
Example #14
def identify_rare_tags(tag_dict, rare_tags_fpath, commom_tags_fpath, ts):
    """
    a tag to be rare if its number of appearances is less than or equal to a predefined threshold ts.
    :param tag_dict:
    :param ts:
    :return:
    """
    rare_tags = []
    common_tags = []
    for t in tag_dict:
        if tag_dict[t] <= ts:
            rare_tags.append(t)
        else:
            common_tags.append(t)
    header = ["tag"]
    write_list_to_csv(rare_tags, rare_tags_fpath, header)
    write_list_to_csv(common_tags, commom_tags_fpath, header)
    print("#rare tags : %s" % len(rare_tags), get_current_time() + '\n')
Example #15
def get_all_topk_qlist(corpus_dir):
    parallel_list = [
        "0-1000000", "1000000-2000000", "2000000-3000000", "3000000-4000000",
        "4000000-5000000", "5000000-6000000", "6000000-7000000",
        "7000000-8000000", "8000000-9000000", "9000000-10000000",
        "10000000-11000000", "11000000-12000000", "12000000-13000000",
        "13000000-14000000", "14000000-15000000", "15000000-16000000",
        "16000000-None"
    ]
    qlist = list()
    for p in parallel_list:
        target_corpus_fpath = corpus_dir + os.sep + "_2_corpus-without-Raretag-%s.pkl" % p
        for q in load_pickle(target_corpus_fpath):
            yy = int(q.creation_date.split('-')[0])
            if yy >= 2014:
                qlist.append(q)
    print("# all qlist = %s" % len(qlist), get_current_time())
    return qlist
Example #16
def extract_by_id_list(id_list, table_name, rare_tags):
    q_list = list()
    con = mdb.connect('localhost', 'root', 'root', db_name)
    cur = con.cursor()
    batch_size = 500
    total_batch = int(len(id_list) / batch_size) + 1
    count = 0
    for batch_idx in range(total_batch):
        if count % 10000 == 0:
            print('reading %s question from Table %s' % (count, table_name), get_current_time())
        count += batch_size

        batch_sql = "SELECT * FROM %s WHERE PostTypeId = 1 AND Id IN(" % (table_name)
        if batch_idx < total_batch - 1:
            for i in range(batch_size):
                idx = batch_idx * batch_size + i
                batch_sql += ('%s,' % id_list[idx])
        elif batch_idx == total_batch - 1:
            for idx in range(batch_idx * batch_size, len(id_list)):
                batch_sql += ('%s,' % id_list[idx])
        # drop the trailing ',' before closing the parenthesis
        batch_sql = batch_sql[:-1] + ')'
        try:
            cur.execute(batch_sql)
            results = cur.fetchall()
            for row in results:
                # id,title,body
                id = row[0]
                title = row[11]
                desc = row[6]
                tags = row[12].replace('<', ' ').replace('>', ' ').strip().split()
                clean_tags = list(set(tags) - set(rare_tags))

                raw_desc_text, desc_code = separate_text_code(desc)
                clean_desc_text = clean_html_tags(raw_desc_text)

                q = Question(id=id, title=title, desc_text=clean_desc_text, desc_code=desc_code, tags=clean_tags)
                q_list.append(q)
        except Exception as e:
            print(e)
    cur.close()
    con.close()
    return q_list
Example #17
 def train_step(x_batch, y_batch):
     """
     A single training step
     """
     # global total_recall_5, total_recall_10
     sequence_length = [len(sample) for sample in x_batch]
     feed_dict = {
         rcnn.X: x_batch,
         rcnn.y: y_batch,
         rcnn.sequence_length: sequence_length,
         rcnn.dropout_keep_prob: FLAGS.dropout_keep_prob
     }
     _, step, loss = sess.run([train_op, global_step, rcnn.loss],
                              feed_dict)
     print("x_batch len %s" % len(x_batch))
     print("y_batch len %s" % len(y_batch))
     print("x_batch[0] len %s" % len(x_batch[0]))
     print("y_batch[0] len %s" % len(y_batch[0]))
     time_str = datetime.datetime.now().isoformat()
     if math.isnan(loss):
         print("train step Loss is nan!", get_current_time())
         exit()
     # print("pre loss %s" % pre_loss, get_current_time())
     print("{}: step {}, loss {:g}".format(time_str, step, loss))
Example #18
 def dev_step(x_batch, y_batch, writer=None, test=False):
     """
     Evaluates model on a dev set
     """
     sequence_length = [
         len(np.nonzero(sample)[0]) for sample in x_batch
     ]
     feed_dict = {
         rcnn.X: x_batch,
         rcnn.y: y_batch,
         rcnn.sequence_length: sequence_length,
         # rcnn.max_sequence_length: max_sequence_length,
         rcnn.dropout_keep_prob: 1.0
     }
     step, loss = sess.run([global_step, rcnn.loss], feed_dict)
     time_str = datetime.datetime.now().isoformat()
     if math.isnan(loss):
         print("dev step Loss is nan! Exit!", get_current_time())
         # exit(0)
     print("{}: step {}, loss {:g}".format(time_str, step, loss))
Example #19
# input
# "id", "tags"
# id-tags-all.csv
id_tags_csv_fpath = dataset_dir + os.sep + "id-tags-all.csv"

# output
# _0_tag-count-all.csv
# "tag", "count"
tag_cnt_all_csv_fpath = dataset_dir + os.sep + "_0_tag-count-all.csv"
tag_cnt_dict = {}
tag_cnt_header = ["tag", "count"]

# tag cnt
row_num = 0
with open(id_tags_csv_fpath, 'r') as id_tags_file:
    rd = csv.reader(id_tags_file, escapechar='\\')
    for row in rd:
        tags = row[1].replace('<', ' ').replace('>', ' ').strip().split()
        row_num += 1
        for t in tags:
            if t in tag_cnt_dict:
                tag_cnt_dict[t] += 1
            else:
                tag_cnt_dict[t] = 1
        if row_num % 10000 == 0:
            print("Processing line %s" % row_num, get_current_time())

write_dict_to_csv(tag_cnt_dict, tag_cnt_all_csv_fpath, tag_cnt_header)
print("# Tags = %s" % len(tag_cnt_dict))
Example #20
        os.mkdir(parallel_dir)
    print("Setting:\ntask : %s\ndataset : %s\n" % (task, dataset))
    st_row_num = 13000000
    et_row_num = 14000000
    print("start line num = %s, end line num = %s" % (st_row_num, et_row_num))

    # input
    # "id", "title", "desc", "creation_date", "tags"
    all_raw_csv_fpath = dataset_dir + os.sep + 'all-with-Raretag.csv'

    # output
    # _0_all-clean-with-Raretag.csv
    # "id", "title", "desc_text", "desc_code", "creation_date", "tags"
    all_clean_csv_fpath = parallel_dir + os.sep + '_0_all-clean-with-Raretag-%s-%s.csv' % (st_row_num, et_row_num)

    print("Preprocessing corpus %s" % all_raw_csv_fpath, get_current_time())
    with open(all_raw_csv_fpath, 'r', encoding='utf-8', errors='surrogatepass') as all:
        rd = csv.reader(all, escapechar='\\')
        row_num = st_row_num
        cnt = 0
        corpus_header = ["id", "title", "desc_text", "desc_code", "creation_date", "tags"]
        with open(all_clean_csv_fpath, 'w') as out:
            wr = csv.writer(out)
            wr.writerow(corpus_header)
            for row in islice(rd, st_row_num, et_row_num):
                row_num += 1
                qid = row[0]
                title = row[1]
                desc = row[2]
                creation_date = row[3]
                tags = row[4].replace('<', ' ').replace('>', ' ').strip().split()

Example #21
if __name__ == '__main__':
    task = 'tagRec'
    dataset = "SO-05-Sep-2018"
    dataset_dir = data_dir + os.sep + task + os.sep + dataset
    parallel_dir = dataset_dir + os.sep + "parallel"
    st_row_num = 16000000
    et_row_num = None
    # ts
    ts = 50
    ts_dir = dataset_dir + os.sep + "ts%s" % ts
    ts_corpus_dir = ts_dir + os.sep + "corpus"
    if not os.path.exists(ts_corpus_dir):
        os.mkdir(ts_corpus_dir)
    print("start line num = %s, end line num = %s" % (st_row_num, et_row_num))

    # Input:
    target_corpus_fpath = parallel_dir + os.sep + "_0_all-clean-with-Raretag-%s-%s.csv" % (
        st_row_num, et_row_num)
    rare_tags_fpath = ts_dir + os.sep + "_1_rareTags.csv"

    # Output:
    corpus_fpath = ts_corpus_dir + os.sep + "_2_corpus-without-Raretag-%s-%s.pkl" % (
        st_row_num, et_row_num)

    rare_tags = load_tags(rare_tags_fpath)
    build_corpus(target_corpus_fpath, rare_tags, corpus_fpath)

    print('Done.', get_current_time())
Example #22
model = load_model(args, param_fpath)

if args.model_selection == "all":
    from main.tag_rec.approaches.post2vec.models.model_all import eval
elif args.model_selection == "title":
    from main.tag_rec.approaches.post2vec.models.model_title import eval
elif args.model_selection == "title_desc_text":
    from main.tag_rec.approaches.post2vec.models.model_title_desc_text import eval

print("Loading test data...")
# get sample test data
sample_test_data_dir = os.path.join(sample_K_dir, "sample_test")
for f in os.listdir(sample_test_data_dir):
    sample_test_data_fpath = os.path.join(sample_test_data_dir, f)
    sample_test_data = load_pickle(sample_test_data_fpath)
    print("#test data = %s loaded!" % len(sample_test_data), get_current_time())

    processed_test_data = padding_and_indexing_qlist(sample_test_data, len_dict, title_vocab, desc_text_vocab,
                                                     desc_code_vocab, tag_vocab)

    print("random mini batch", get_current_time())
    batches_test = random_mini_batch(processed_test_data, args.batch_size)

    pre, rc, f1, cnt = eval(batches_test, model, args, topk_list)

    pre[:] = [x / cnt for x in pre]
    rc[:] = [x / cnt for x in rc]
    f1[:] = [x / cnt for x in f1]

    print("# test : %s" % cnt)
    print("Precision\t\t%s" % ("\t".join(str(x) for x in pre)))
Example #23
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        device_count={"GPU": 1},
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        print("input_x...", get_current_time())
        # input_y = graph.get_operation_by_name("input_y").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name(
            "dropout_keep_prob").outputs[0]
        print("dropout_keep_prob...", get_current_time())

        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name(
            "output/predictions").outputs[0]
        print("prediction...", get_current_time())

        # prepare test data
        sample_test_data_dir = os.path.join(sample_K_dir, "sample_test")
        test_data_cnt = 0
        res_str = ''
        for f in os.listdir(sample_test_data_dir):
Example #24
    model.load_state_dict(torch.load(args.snapshot))

if args.cuda:
    torch.cuda.set_device(args.device)
    model = model.cuda()

args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

save_args(args)
#################################################################################

try:
    global_train_step = 0
    train_cnt = 0
    for f in sorted(os.listdir(train_dir)):
        print("\n\n# train file = %s" % train_cnt, get_current_time())
        train_cnt += 1
        train_data_fpath = os.path.join(train_dir, f)
        train_data = load_pickle(train_data_fpath)
        print("padding and indexing train", get_current_time())
        processed_train_data = padding_and_indexing_qlist(
            train_data, len_dict, title_vocab, desc_text_vocab,
            desc_code_vocab, tag_vocab)
        print("random mini batch train", get_current_time())
        batches_train = random_mini_batch(processed_train_data,
                                          args.batch_size)
        print("Start train %s..." % f, get_current_time())
        model, global_train_step = train(train_iter=batches_train,
                                         dev_iter=None,
                                         model=model,
                                         args=args,
Example #25
            }
            step, loss = sess.run([global_step, rcnn.loss], feed_dict)
            time_str = datetime.datetime.now().isoformat()
            if math.isnan(loss):
                print("dev step Loss is nan! Exit!", get_current_time())
                # exit(0)
            print("{}: step {}, loss {:g}".format(time_str, step, loss))

        # Load data
        print("Loading data...")

        f_cnt = 0
        for f in os.listdir(train_dir):

            fpath = os.path.join(train_dir, f)
            print("Processing #%s %s" % (f_cnt, f), get_current_time())
            f_cnt += 1
            train_data = load_pickle(fpath)

            x, y = load_data_and_labels(qlist=train_data,
                                        text_vocab=text_vocab,
                                        max_len=FLAGS.max_len,
                                        tag_vocab=tag_vocab)

            shuffle_indices = np.random.permutation(np.arange(len(y)))
            x_shuffled = x[shuffle_indices]
            y_shuffled = y[shuffle_indices]

            # Split train/test set
            # TODO: This is very crude, should use cross-validation
            dev_sample_index = -1 * int(
Example #26
    ts_dir = dataset_dir + os.sep + "ts%s" % ts
    ts_corpus_dir = ts_dir + os.sep + "corpus"
    sample_k = "test100000"
    sample_k_dir = ts_dir + os.sep + "data-%s" % sample_k
    if not os.path.exists(sample_k_dir):
        os.mkdir(sample_k_dir)
    print("Setting:\ntask : %s\ndataset : %s\nts : %s\nsample k : %s" %
          (task, dataset, ts, sample_k))

    all_qlist = get_all_topk_qlist(ts_corpus_dir)
    if sample_k == "all" or sample_k == "test100000":
        qlist = all_qlist
    else:
        qlist = sample(all_qlist, sample_k)

    print("Sorting...", get_current_time())
    sorted_qlist = sorted(qlist, key=operator.attrgetter('creation_date'))

    # training:test=X:100000
    size_of_test = 100000
    train_data = sorted_qlist[:int(len(sorted_qlist) - size_of_test)]
    test_data = sorted_qlist[int(len(sorted_qlist) - size_of_test):]

    print("#train = %s, #test = %s" % (len(train_data), len(test_data)))

    print("shuffling...", get_current_time())
    shuffle(train_data)
    shuffle(test_data)

    train_dir = sample_k_dir + os.sep + "train"
    if not os.path.exists(train_dir):
Example #27
def save_pickle(data, fpath):
    print("Saving %s..." % fpath, get_current_time())
    with open(fpath, 'wb') as handle:
        cPickle.dump(data, handle)
    print("Saved.", get_current_time())
Example #28
def load_pickle(fpath):
    print("Loading %s..." % fpath, get_current_time())
    with open(fpath, 'rb') as handle:
        data = cPickle.load(handle)
    print("Loaded.", get_current_time())
    return data
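A minimal round-trip sketch with the two helpers above (assuming the module-level cPickle and get_current_time imports are in place; the path is only an example):

data = {"tags": ["python", "pandas"]}
save_pickle(data, "/tmp/demo.pkl")
restored = load_pickle("/tmp/demo.pkl")
assert restored == data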
Example #29
    def __init__(self, sequence_length, num_classes, vocab_size,
                 embedding_size, filter_sizes, num_filters, l2_reg_lambda):
        # Placeholders for input, output and dropout

        # with tf.device('/device:GPU:0'):
        self.input_x = tf.placeholder(tf.int32, [None, None], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes],
                                      name="input_y")

        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")

        # Keeping track of l2 regularization loss (optional)
        l2_loss = tf.constant(0.0)
        # print("sequence_length:",sequence_length)
        # Embedding layer tf.device('/cpu:0') ,

        with tf.name_scope("embedding"):
            print("embedding_textcnn...", get_current_time())
            self.W = tf.Variable(tf.random_uniform(
                [vocab_size, embedding_size], -1.0, 1.0),
                                 name="W")
            print(self.input_x.shape)
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(
                self.embedded_chars, -1)
        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer

                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1),
                                name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]),
                                name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    # self.embedded_chars_x_flat,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity
                # relu to tanh
                h = tf.nn.tanh(tf.nn.bias_add(conv, b), name="tanh")
                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        # A dropout layer stochastically “disables” a fraction of its neurons.
        # This prevents neurons from co-adapting and forces them to learn individually useful features.
        # The fraction of neurons we keep enabled is defined by the dropout_keep_prob input to our network.
        # We set this to something like 0.5 during training, and to 1 (disable dropout) during evaluation.

        with tf.name_scope("dropout"):
            print("dropout_textcnn...", get_current_time())
            self.h_drop = tf.nn.dropout(self.h_pool_flat,
                                        self.dropout_keep_prob,
                                        name="post2vec")

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            print("output_textcnn...", get_current_time())
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            # sigmoid converts the scores into probabilities in [0, 1]
            self.predictions = tf.nn.sigmoid(self.scores, name="predictions")

        # CalculateMean cross-entropy loss
        with tf.name_scope("loss"):
            print("loss_textcnn...", get_current_time())
            cross_entropy = -tf.reduce_sum(
                (self.input_y * tf.log(self.predictions + 1e-9)) +
                (1 - self.input_y) * tf.log(1 - self.predictions + 1e-9),
                name="xentropy")
            # tf.nn.softmax_cross_entropy_with_logits is for multi-class (single-label) targets, not multi-label ones
            # losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy + l2_reg_lambda * l2_loss)
Example #30
        input_x = graph.get_operation_by_name("input_X").outputs[0]
        input_sequence_length = graph.get_operation_by_name("input_sequence_length").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

        # post2vec
        post2vec = graph.get_tensor_by_name("dropout/post2vec/mul:0")

        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]

        # prepare test data
        sample_test_data_dir = os.path.join(sample_K_dir, "sample_test")
        test_data_cnt = 0
        res_str = ''
        for f in os.listdir(sample_test_data_dir):
            print("Processing #%s test data %s..." % (test_data_cnt, f), get_current_time())
            test_data_cnt += 1

            sample_test_data_fpath = os.path.join(sample_test_data_dir, f)
            sample_test_data = load_pickle(sample_test_data_fpath)

            print("# test data = %s" % len(sample_test_data), get_current_time())
            x_test, y_test = load_data_and_labels(qlist=sample_test_data, text_vocab=text_vocab, max_len=FLAGS.max_len,
                                                  tag_vocab=tag_vocab)

            # Generate batches for one epoch
            batches = batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = list()