Example #1
def load():
    # load cached user results and user info; fall back to empty dicts
    global g_userRes, g_userResF, g_userInfoF, g_userInfo
    ret = util.loadPickle(g_userResF)
    if ret is None:
        return {}, {}
    ret2 = util.loadPickle(g_userInfoF)
    if ret2 is None:
        return ret, {}
    return ret, ret2
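
Every example on this page calls util.loadPickle, but the helper itself is never shown. A minimal sketch of what it presumably does, assuming it wraps pickle.load and returns None when the file is missing or unreadable (consistent with the None checks in the examples):

import pickle

def loadPickle(path):
    # hypothetical reconstruction of util.loadPickle: return the unpickled
    # object, or None if the file cannot be read or unpickled
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except (IOError, pickle.UnpicklingError):
        return None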
Example #2
def load():
    # like Example #1, but the second value is a "done" list rather than a dict
    global g_userRes, g_userResF, g_userDoneF, g_userDone
    ret = util.loadPickle(g_userResF)
    if ret is None:
        return {}, []
    ret2 = util.loadPickle(g_userDoneF)
    if ret2 is None:
        return ret, []
    return ret, ret2
Example #5
def loadData(self):
    # load the two pickled datasets and flatten them into 2-D arrays
    data_A = util.loadPickle(self.dataset_A)[0]
    data_B = util.loadPickle(self.dataset_B)[0]
    data_A = np.concatenate([x.flatten() for x in data_A])
    data_B = np.concatenate([x.flatten() for x in data_B])
    data_A = np.reshape(data_A, (self.shape[0], -1))
    data_B = np.reshape(data_B, (self.shape[0], -1))
    self.data_A = data_A
    self.data_B = data_B
    # usable length is bounded by the shorter dataset, then reduced by
    # one sample for every self.shape[1] samples
    self.data_len = min(data_A.shape[1], data_B.shape[1])
    self.data_len = self.data_len - (self.data_len // self.shape[1])
Example #6
def loadRules(DIR, FILENAME, ruleset, method, ITER):
    base = DIR + '/' + FILENAME + '/' + ruleset + '/' + ITER
    if method == 'A' or method == 'B':
        rules = loadPickle(base + '/rules_' + method + '.pkl')
    elif method == 'both':
        rules_A = loadPickle(base + '/rules_A.pkl')
        rules_B = loadPickle(base + '/rules_B.pkl')
        rules = rules_A + rules_B
    else:
        # the original printed "no method" and then fell through to an
        # undefined `rules`; raise instead
        raise ValueError('unknown method: %s' % method)
    return rules
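
A hypothetical call, assuming the DIR/FILENAME/ruleset/ITER/rules_X.pkl layout above (all argument values are placeholders):

rules = loadRules('results', 'run1', 'ruleset1', 'both', '3')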
Example #7
def sample_save(self, epoch, batch_i):
    # load cached normalization statistics for speaker A
    _, coded_sps_mean_A, coded_sps_std_A, coded_sps_max_A, _, _ = util.loadPickle('./cache36_suzuki.pkl')
    wave = util.loadWave('./datasets/suzuki/a01.wav')
    pwav = util.wavePadding(wave)
    # WORLD vocoder features: f0, spectral envelope, aperiodicity
    f0, sp, ap = util.worldDecompose(pwav)
    coded_sp = util.worldEncodeSpectralEnvelop(sp)
    coded_sp_t = coded_sp.T
    # normalize, then cut into 6 segments of 128 frames x 36 coefficients
    coded_sp_norm = (coded_sp_t - coded_sps_mean_A) / coded_sps_max_A
    coded_sp_norm = coded_sp_norm[:, :128 * 6]
    coded_sp_norm = coded_sp_norm.reshape(6, 36, 128)
    dist = self.g_AB.predict(coded_sp_norm)
    dist = dist.reshape((36, 128 * 6))
    util.savePickle(f'./predict/log_a01_{epoch}_{batch_i}.pkl', dist)
Example #8
def countIdf(cWcDict):
    # inverse document frequency, treating each circle as a "document"
    idf = {}
    D = len(cWcDict)
    for c in cWcDict:
        for w in cWcDict[c]:
            if w not in idf:
                cnt = 0
                for cc in cWcDict:
                    if w in cWcDict[cc]:
                        cnt += 1
                idf[w] = math.log(float(D) / cnt, 2)
    return idf
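
For example, with D = 4 circles and a word that appears in 2 of them, countIdf assigns idf = log2(4/2) = 1; a word that appears in every circle gets idf = 0.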

if __name__ == '__main__':
    # load pickled per-user word counts and the circle definitions
    wbWc = util.loadPickle('wbWc.pickle')
    infoWc = util.loadPickle('infoWc.pickle')
    cList = loadCirecle('report/circles', 10)
    cInfoWcDict = {}
    cWbWcDict = {}
    for c in cList:
        cKey = " ".join(c)
        cInfoWcDict[cKey] = {}
        cWbWcDict[cKey] = {}
        for u in c:
            addDict(cInfoWcDict[cKey], infoWc[u])
            addDict(cWbWcDict[cKey], wbWc[u])

    anaCircle(cInfoWcDict, cWbWcDict)

Example #9
def loadRules(DIR, FILENAME, ruleset, method, ITER):
    fullpath_rules_A = DIR + '/' + FILENAME + '/' + ruleset + '/' + ITER + '/rules_A.pkl'
    fullpath_rules_B = DIR + '/' + FILENAME + '/' + ruleset + '/' + ITER + '/rules_B.pkl'
    rules_A = loadPickle(fullpath_rules_A)
    rules_B = loadPickle(fullpath_rules_B)
    return (rules_A, rules_B)
Example #10
def load():
    global g_userRes, g_userResF
    ret = util.loadPickle(g_userResF)
    if ret is None:
        return {}
    return ret
Example #11
import util
import numpy as np
from scipy.sparse import csr_matrix


def termVectorFromCSR(row_offsets, indices, data):
    # yield (doc_id, term indices, term counts) for each row of a CSR matrix
    for doc_id, (start, end) in enumerate(zip(row_offsets, row_offsets[1:])):
        yield doc_id, indices[start:end], data[start:end]


term_map = util.loadPickle("vocab.pkl")
doc_map = util.loadPickle("doc_index_map.pkl")

no_of_terms = len(term_map)
no_of_docs = len(doc_map)

# Creating a Compressed Row Sparse Format of the Term-Document Matrix
ROW_OFFSETS = [0]
COLUMN_INDICES = []
VALUES = []
for doc, vector in util.scrollIndex():
    prev_offset = ROW_OFFSETS[-1]
    ROW_OFFSETS.append(prev_offset + len(vector))
    for term, count in vector.iteritems():
        COLUMN_INDICES.append(term_map[term])
        VALUES.append(count['tf'])

print ROW_OFFSETS
print COLUMN_INDICES
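
The csr_matrix import above is otherwise unused; assuming the goal was to materialize the term-document matrix, the three arrays map directly onto scipy's CSR constructor:

term_doc = csr_matrix((VALUES, COLUMN_INDICES, ROW_OFFSETS),
                      shape=(no_of_docs, no_of_terms))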
Example #12
def loadM(fName):
    return util.loadPickle(fName)
Example #13
def main(_):

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    #os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)


    # ========= Download Dataset json =========
    # You can change this code to load dataset in your own way

    #dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    #dev_filename = os.path.basename(FLAGS.dev_path)
    #_, _, _ = prepare_dev(dev_dirname, dev_filename, vocab)

    # ========= Process input json =========
    # for codalab
    prefix = os.path.join("data", "squad")

    # writes dev.answer, dev.context, dev.question, dev.span
    dev_path = FLAGS.dev_path
    dev_filename = FLAGS.dev_path.split("/")[-1]
    if FLAGS.download:
        dev_data = data_from_json(os.path.join(prefix, dev_filename))
    else:
        dev_data = data_from_json(dev_filename)
    dev_num_questions, dev_num_answers = read_write_dataset(dev_data, 'dev', prefix="")
    print("Processed {} questions and {} answers in dev".format(dev_num_questions, dev_num_answers))

    # writes dev.ids.context, dev.ids.question
    vocab_path = pjoin(os.path.join("data", "squad"), "vocab.dat")
    dev_deposit_path = pjoin(os.path.join("", ""), "dev") #pjoin(os.path.join("data", "squad"), "dev")
    x_dis_path = dev_deposit_path + ".ids.context"
    y_ids_path = dev_deposit_path + ".ids.question"
    data_to_token_ids(dev_deposit_path + ".context", x_dis_path, vocab_path)
    data_to_token_ids(dev_deposit_path + ".question", y_ids_path, vocab_path)

    # load data sets
    #Q_test, P_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, Q_len_test, P_len_test = load_data(os.path.join("data", "squad"), "dev") # for our purposes this is as test set.
    Q_test, P_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, Q_len_test, P_len_test = load_data_home(dev_deposit_path) # for our purposes this is as test set.

    question_uuid_data = []
    with open(dev_deposit_path + ".quid") as f:
        for line in f:
            question_uuid_data.append(line)

    # pad the data at load-time. So, we don't need to do any masking later!!!
    # ref: https://keras.io/preprocessing/sequence/
    # if len < maxlen, pad with specified val
    # elif len > maxlen, truncate
    QMAXLEN = FLAGS.QMAXLEN
    PMAXLEN = FLAGS.PMAXLEN
    Q_test = pad_sequences(Q_test, maxlen=QMAXLEN, value=PAD_ID, padding='post')
    P_test = pad_sequences(P_test, maxlen=PMAXLEN, value=PAD_ID, padding='post')
    A_start_test = pad_sequences(A_start_test, maxlen=PMAXLEN, value=0, padding='post')
    A_end_test = pad_sequences(A_end_test, maxlen=PMAXLEN, value=0, padding='post')
    test_data = zip(P_test, Q_test, P_len_test, Q_len_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model

    """models = [ 'MPCM', 'COATT', 'COATT_fixed', 'COATT_mix','COATT_fixed_mix', 'COATT_fixed_200_mix'] # 'COATT_fixed_200', leave out to save time
    predictions_start = {}; predictions_end = {}
    with open("preds_dev.txt", "a") as f:
        f.write("model" + "," + "pred_raw" + "," + "a_raw")
        for model in models:
            FLAGS.model_type = model
            FLAGS.train_dir = "train/ensemble_train_" + model
            train_dir = "train/ensemble_train_" + model
            # define sizes etc. for different models.
            if model == 'COATT_fixed_200' or model == 'COATT_fixed_200_mix' :
                FLAGS.embedding_size = 200
                FLAGS.lstm_units = 200
            elif model == "MPCM_p100":
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 100
            else:
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 50
            with tf.Graph().as_default():
                with tf.Session() as sess:
                    embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' + str(FLAGS.embedding_size) + '.npz')
                    pretrained_embeddings = embeddings['glove']

                    qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))

                    initialize_model(sess, qa, train_dir)

                    # get predicted start-end indices
                    a_s_l = []
                    a_e_l = []

                    f1 = exact_match = total = 0; answers = {}; prob_start = {}; prob_end = {}; p_raw_mapping= {}
                    prog = Progbar(target=1 + int(len(test_data) / FLAGS.batch_size))
                    for i, batch in enumerate(minibatches(test_data, FLAGS.batch_size, shuffle = False)):
                        batch_test =  batch[:4]
                        (ys, ye) = qa.predict_on_batch(sess, *batch_test)
                        a_s = (np.argmax(ys, axis=1))
                        a_e = (np.argmax(ye, axis=1))

                        a_s_l = a_s_l + list(a_s)
                        a_e_l = a_e_l + list(a_e)

                        print(len(a_s))
                        for j in range(len(a_s)):
                            p_raw = batch[7][j]
                            a_raw = batch[8][j]
                            s = a_s[j]
                            e = a_e[j]

                            pred_raw = ' '.join(p_raw.split()[s:e + 1])
                            p_raw_mapping[batch[9][j].strip("\n")] = p_raw
                            #answers[batch[9][j].strip("\n")] = pred_raw.strip("\n")
                            prob_start[batch[9][j].strip("\n")] = ys[j]
                            prob_end[batch[9][j].strip("\n")] = ye[j]
                            f.write(model + "," + pred_raw + "," + a_raw )
                        prog.update(i + 1, [("processed", i + 1)])

            predictions_start[model] = prob_start
            predictions_end[model] = prob_end
    f.close()



    # save
    dropPickle(predictions_start, "preds_start.pkl")
    dropPickle(predictions_end, "preds_end.pkl")
    dropPickle(p_raw_mapping, "p_raw_mapping.pkl")"""
    predictions_start = loadPickle("preds_start.pkl")
    predictions_end = loadPickle("preds_end.pkl")
    p_raw_mapping = loadPickle("p_raw_mapping.pkl")


    models = ['COATT_fixed_200']
    #predictions_start = {}; predictions_end = {}
    with open("preds_dev.txt", "a") as f:
        f.write("model" + "," + "pred_raw" + "," + "a_raw")
        for model in models:
            FLAGS.model_type = model
            FLAGS.train_dir = "train/ensemble_train_" + model
            train_dir = "train/ensemble_train_" + model
            if model == 'COATT_fixed_200' or model == 'COATT_fixed_200_mix' :
                FLAGS.embedding_size = 200
                FLAGS.lstm_units = 200
            elif model == "MPCM_p100":
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 100
            else:
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 50
            with tf.Graph().as_default():
                with tf.Session() as sess:
                    embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' + str(FLAGS.embedding_size) + '.npz')
                    pretrained_embeddings = embeddings['glove']

                    qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))

                    initialize_model(sess, qa, train_dir)

                    # get predicted start-end indices
                    a_s_l = []
                    a_e_l = []

                    f1 = exact_match = total = 0; answers = {}; prob_start = {}; prob_end = {}; p_raw_mapping= {}
                    prog = Progbar(target=1 + int(len(test_data) / FLAGS.batch_size))
                    for i, batch in enumerate(minibatches(test_data, FLAGS.batch_size, shuffle = False)):
                        batch_test =  batch[:4]
                        (ys, ye) = qa.predict_on_batch(sess, *batch_test)
                        a_s = (np.argmax(ys, axis=1))
                        a_e = (np.argmax(ye, axis=1))

                        a_s_l = a_s_l + list(a_s)
                        a_e_l = a_e_l + list(a_e)

                        print(len(a_s))
                        for j in range(len(a_s)):
                            p_raw = batch[7][j]
                            a_raw = batch[8][j]
                            s = a_s[j]
                            e = a_e[j]
                            print(s, e)  # debug output; comment this out
                            pred_raw = ' '.join(p_raw.split()[s:e + 1])
                            p_raw_mapping[batch[9][j].strip("\n")] = p_raw
                            #answers[batch[9][j].strip("\n")] = pred_raw.strip("\n")
                            prob_start[batch[9][j].strip("\n")] = ys[j]
                            prob_end[batch[9][j].strip("\n")] = ye[j]
                            f.write(model + "," + pred_raw + "," + a_raw )
                        prog.update(i + 1, [("processed", i + 1)])

            predictions_start[model] = prob_start
            predictions_end[model] = prob_end
    f.close()

    dropPickle(predictions_start, "preds_start.pkl")
    dropPickle(predictions_end, "preds_end.pkl")
    dropPickle(p_raw_mapping, "p_raw_mapping.pkl")

    # combine the predictions of the two models (while making independent start, end predictions)
    """answers = {}
    for qkey in predictions_start['MPCM'].keys():
        ys = predictions_start['MPCM'][qkey]*predictions_start['COATT'][qkey]*predictions_start['COATT_fixed'][qkey]
        ye = predictions_end['MPCM'][qkey]*predictions_end['COATT'][qkey]*predictions_end['COATT_fixed'][qkey]
        s = (np.argmax(ys))
        arr = ye.copy()
        arr[0:s] = 0
        e = (np.argmax(arr))
        #e = (np.argmax(ye))
        pred_raw = ' '.join(p_raw_mapping[qkey].split()[s:e + 1])
        answers[qkey] = pred_raw.strip("\n")"""
    # predict the span with max predicted probability (joint prediction rather than independently predicting start and end indices)
    answers = {}
    for qkey in predictions_start['MPCM'].keys():
        ys = predictions_start['MPCM'][qkey]*predictions_start['COATT'][qkey]*predictions_start['COATT_fixed'][qkey]\
             *predictions_start['COATT_mix'][qkey]*predictions_start['COATT_fixed_mix'][qkey]\
             *predictions_start['COATT_fixed_200_mix'][qkey]*predictions_start['COATT_fixed_200'][qkey] #to save time
        ye = predictions_end['MPCM'][qkey]*predictions_end['COATT'][qkey]*predictions_end['COATT_fixed'][qkey]\
             *predictions_end['COATT_mix'][qkey]*predictions_end['COATT_fixed_mix'][qkey]\
             *predictions_end['COATT_fixed_200_mix'][qkey]*predictions_end['COATT_fixed_200'][qkey] #to save time

        s = 0; e = 0; prodmax = 0
        for si in range(0, len(ys)):
            for ei in range(si, len(ye)):
                prod = ys[si]*ye[ei]
                if prod > prodmax:
                    s = si
                    e = ei
                    prodmax = prod
        print(s, e, prodmax)
        pred_raw = ' '.join(p_raw_mapping[qkey].split()[s:e + 1])
        print(pred_raw)
        answers[qkey] = pred_raw.strip("\n")

    # write predictions to a json file in the root dir
    with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
        f.write(unicode(json.dumps(answers, ensure_ascii=False)))
Example #14
def loadInfo(fn):
    return util.loadPickle(fn)
Example #15
# -*- coding: utf-8 -*-
import sys
import json
from operator import add
reload(sys)
sys.setdefaultencoding('utf-8')

from config import sc
from util import loadPickle

date = sys.argv[1]

gender = loadPickle("/home/hadoop/chen.cheng/moa/gender24.pkl")
b = sc.broadcast(gender)
data = sc.textFile("hdfs://antispam/user/hadoop/output/chencheng/crux/data/rawData/%s-24/" % (date))
data.cache()

out_male = data.map(lambda x : json.loads(x)).filter(lambda x: x[0][0] and x[0][1])\
    .filter(lambda x: b.value[int(x[0][0])] == "M").map(lambda x: json.dumps(x))

out_male.saveAsTextFile("hdfs://antispam/user/hadoop/output/chencheng/crux/data/male/%s/"% (date))

out_female = data.map(lambda x : json.loads(x)).filter(lambda x: x[0][0] and x[0][1])\
    .filter(lambda x: b.value[int(x[0][0])] == "F")\
        .map(lambda x: json.dumps(x))
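
# The female partition is filtered but never written out; presumably a save
# symmetric to the male one was intended (output path assumed):
out_female.saveAsTextFile("hdfs://antispam/user/hadoop/output/chencheng/crux/data/female/%s/" % (date))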

'''
with open('/home/hadoop/chen.cheng/Chronos/momoid', 'w') as f:
    for item in out:
        f.write("%s\n" %(item ) )
'''