Example #1
def load_data(file_name):
    print("Start loading file[%s]" % file_name)
    # the with block closes the file automatically, so no explicit close() is needed
    with open(file_name, "rb") as F:
        data = pickle.load(F)
    print("End loading")
    return data
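As a usage sketch (not part of the original snippet), load_data pairs naturally with a pickle.dump call; the file name and object below are invented for illustration:

import pickle

def save_data(file_name, data):
    # counterpart sketch: write any picklable object in binary mode
    with open(file_name, "wb") as f:
        pickle.dump(data, f)

save_data("model.pkl", {"weights": [0.1, 0.2]})
restored = load_data("model.pkl")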
Example #2
def save_rev_ids_for_predictions(y_pred, y_test, z, model_name):
    correct_minority = []
    incorrect_minority = []
    total = 0
    correct = 0
    incorrect = 0
    for i in range(0, len(y_test)):
        # if minority class in reality
        if y_test[i] == 0:
            total += 1
            # if correctly predicted minority class
            if y_pred[i] == y_test[i]:
                correct += 1
                correct_minority.append(z[i])
            else:
                incorrect += 1
                incorrect_minority.append(z[i])
    # save the IDs in UTF-8 text files, one per line
    with open('correct_' + model_name + '.txt', 'w', encoding='utf-8') as f:
        for item in correct_minority:
            f.write("%s\n" % item)
    with open('incorrect_' + model_name + '.txt', 'w', encoding='utf-8') as f:
        for item in incorrect_minority:
            f.write("%s\n" % item)
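A minimal call sketch (the labels, predictions, and IDs below are invented for illustration):

y_test = [0, 1, 0, 0]
y_pred = [0, 1, 1, 0]
z = ["rev_a", "rev_b", "rev_c", "rev_d"]
save_rev_ids_for_predictions(y_pred, y_test, z, "svm")
# writes correct_svm.txt (rev_a, rev_d) and incorrect_svm.txt (rev_c)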
Example #3
    def __init__(self, list_path):
        # build a mapping from utterance ID to its integer value,
        # one "<uttid> <value>" pair per line
        f = open(list_path, 'r')
        self.uttid_dic = {}
        for ln in f:
            uttid, value = ln.split(' ', 1)
            self.uttid_dic[uttid] = int(value)

        f.close()
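The list file is assumed to hold one "<utterance-id> <integer>" pair per line, for example (made-up values):

utt_0001 1380
utt_0002 1542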
Example #4
def load_word_embeddings(directory, file, dictionary):
    # read vectors in the "word v1 v2 ... vn" text format,
    # keeping only words that appear in `dictionary`
    embeddings_index = {}
    f = open(os.path.join(directory, file))
    for line in f:
        word, vec = line.split(' ', 1)
        if word in dictionary:
            embeddings_index[word] = np.array(list(map(float, vec.split())))
    f.close()
    return embeddings_index
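A hedged usage sketch: the loader expects GloVe-style text lines of the form "word v1 v2 ... vn" and keeps only words present in the given vocabulary. The tiny file below is written purely for the demonstration:

import os
import numpy as np

os.makedirs("data", exist_ok=True)
with open(os.path.join("data", "tiny_vectors.txt"), "w") as f:
    f.write("cat 0.1 0.2 0.3\n")
    f.write("dog 0.4 0.5 0.6\n")

embeddings = load_word_embeddings("data", "tiny_vectors.txt", {"cat"})
print(embeddings)  # only "cat" is kept, as a NumPy float array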
Example #5
    def __init__(self, train_dist_path, test_dist_path, dis_phone_num, prob_trans_path):
        f = open(train_dist_path,'r')

        # train_prob
        ln=f.readline()
        temp_prob_str = ln.split(' ')[3:-1]
        self.train_prob = torch.FloatTensor(list(map(float, temp_prob_str)))
        # var
        ln=f.readline()
        temp_var_str = ln.split(' ')[3:-1]
        self.train_var = torch.FloatTensor(list(map(float, temp_var_str)))
        f.close()

        # test_prob
        f = open(test_dist_path,'r')
        ln=f.readline()
        temp_prob_str = ln.split(' ')[3:-1]
        self.test_prob = torch.FloatTensor(list(map(float, temp_prob_str)))
        # var
        ln=f.readline()
        temp_var_str = ln.split(' ')[3:-1]
        self.test_var = torch.FloatTensor(list(map(float, temp_var_str)))
        f.close()

        # ratio
        length=len(self.test_prob)
        self.prob_ratio = torch.min(self.test_prob/self.train_prob, torch.ones(length))
        print('self.prob_ratio:')
        print(self.prob_ratio)

        assert max(self.prob_ratio) <= 1, 'The prob ratio is larger than 1!?'
        assert min(self.test_var) > 0, 'The min of test var is <= 0!?'
        self.std_var_ratio = torch.sqrt(self.test_var/self.train_var)
        

        print('self.std_var_ratio:')    
        print(self.std_var_ratio)
        print('self.test_var.sum():')    
        print(self.test_var.sum())

        # The first 4 are zero unvoiced parts
        self.phone_prob = torch.FloatTensor([0,0,0,0, 0.0202, 0.0522, 0.0917, 0.0153, 0.0088, 0.0483, 0.0130, 0.0048, 0.0290, 0.0212, 0.0249, 0.0177, 0.0240, 0.0146, 0.0093, 0.0194, 0.0490, 0.0457, 0.0050, 0.0296, 0.0367, 0.0407, 0.0530, 0.0114, 0.0416, 0.0011, 0.0124, 0.0302, 0.0457, 0.0073, 0.0571, 0.0064, 0.0047, 0.0249, 0.0123, 0.0191, 0.0287, 0.0230, 0.0002])
        self.prob_sum = self.phone_prob.sum()
        phone_num = len(self.phone_prob)

        assert dis_phone_num >= 0, 'The dis_phone_num needs to be non-negative!'
        self.dis_phone_num = dis_phone_num 

        threshold_prob = self.prob_sum/((phone_num-dis_phone_num)*0.8)
        self.prob_ratio_upper = torch.max(self.phone_prob/threshold_prob, 0.2*torch.ones(phone_num))

        # load the GMM params
        temp_mat = sio.loadmat(prob_trans_path)
        self.mu_ratio = torch.from_numpy(temp_mat['mu_ratio']).float()
        # weight cumulation sum
        self.comp_wcum = torch.from_numpy(temp_mat['comp_wcum']).float()
Example #6
    def memory_read(self):
        # return the pickled memory list; create an empty file if none exists yet
        E = list()
        try:
            F = open(self.file_name, 'rb')
            E = pickle.load(F)
            F.close()
        except (OSError, EOFError, pickle.UnpicklingError):
            # no usable memory yet: create an empty file so later reads/writes succeed
            F = open(self.file_name, 'wb')
            F.close()
        return E
Example #7
def test(net, testset, testloader, criterian, batch_size, n_class, log_file):
    '''Testing the network
    '''
    net.eval()

    testloss, testacc = 0., 0.
    class_correct = list(0. for i in range(10))
    class_total = list(0. for i in range(10))

    for (img, label) in testloader:
        img = var(img).cuda()
        label = var(label).cuda()
        #forward pass
        output = net(img)
        #loss
        loss = criterian(output, label)
        testloss += loss.data[0]
        #prediction
        _, predict = torch.max(output, 1)
        num_correct = (predict == label).sum()
        testacc += num_correct.data[0]
        #
        c = (predict == label).squeeze()
        for i in range(label.size(0)):  # use the actual batch size: the last batch may be smaller
            l = label[i].data[0]
            class_correct[l] += c[i].data[0]
            class_total[l] += 1
        #end_for
    #end_for
    testloss /= len(testset)
    testacc /= len(testset)

    f = open(log_file, 'a')
    f.write('\n-------------------\n')

    print("Test: Loss: %.5f, Acc: %.2f %%" % (testloss, 100 * testacc))
    f.write("Test: Loss: %.5f, Acc: %.2f %%\n" % (testloss, 100 * testacc))
    for i in range(10):
        print('Accuracy of %5d : %2d %%' %
              (i, 100 * class_correct[i] / class_total[i]))
        f.write('Accuracy of %5d : %2d %%\n' %
                (i, 100 * class_correct[i] / class_total[i]))
    #end_for

    f.close()
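The example uses the pre-0.4 PyTorch API (Variable wrappers and loss.data[0]). On PyTorch 0.4 and later, scalar tensors are read with .item(); a minimal sketch of the same accumulation in the modern API (not the original code) would be:

with torch.no_grad():
    for img, label in testloader:
        img, label = img.cuda(), label.cuda()
        output = net(img)
        testloss += criterian(output, label).item()
        testacc += (output.argmax(dim=1) == label).sum().item()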
Example #8
def save_bert_file(dict,
                   output,
                   dataset_name,
                   model_name,
                   hyper_num,
                   oov_num,
                   f_info_out,
                   include_oov=True):
    logger.info("save info...")
    f_info_out.write(
        f'{model_name}\t{dataset_name}\t{len(dict)}\t{oov_num}\t{hyper_num}\t{include_oov}\n'
    )
    logger.info("save json...")
    dname = os.path.splitext(dataset_name)[0]
    fjson = json.dumps(dict, ensure_ascii=False)
    f = open(os.path.join(output, model_name.replace("/", "-"),
                          dname + ".json"),
             mode="w",
             encoding="utf-8")
    f.write(fjson)
    f.close()
Example #9
def predict_perf(model, test_loader, predict_type, ans_dict, beer_index):
    model.eval()
    #run predict
    iter_bar = tqdm(test_loader)
    all_predict = {}
    softmax = nn.Softmax(dim=0)
    cnt_true = 0
    cnt = 0
    out_ans = {}
    for x, ids, answers, attn_mask, token_type_ids in iter_bar:
        batch_size = x.size(0)
        model.zero_grad()
        with torch.no_grad():
            sent = model(x.to(device), attn_mask.to(device),
                         token_type_ids.to(device), predict_type)
            for i in range(x.size(0)):
                if predict_type == "MOVIE":  #binary
                    if sent[i] > 0.5:
                        ans = 1
                    else:
                        ans = 0
                    if int(ids[i]) <= 1000:
                        #if answers[i] > 0:
                        ans_ = 1
                    else:
                        ans_ = 0
                else:
                    ans = round(sent[i].item(), 1)
                    ans_ = ans_dict[ids[i]][beer_index]
                if ans // 0.2 == ans_ // 0.2:
                    cnt_true += 1
                out_ans[ids[i]] = ans
                cnt += 1
    with open('perf_predict.pickle', "wb") as F:
        pickle.dump(out_ans, F)
    print("Performance:", cnt_true / cnt)
Example #10
def predict(model, test_loader, predict_name, tokenizer, predict_type):
    model.eval()
    #run predict
    iter_bar = tqdm(test_loader)
    all_predict = {}
    softmax = nn.Softmax(dim=0)
    no_ans = 0
    total_ans_len = 0
    prev_last = -1
    for x, ids, answers, attn_mask, token_type_ids in iter_bar:
        batch_size = x.size(0)
        model.zero_grad()
        with torch.no_grad():
            out_ans_start, out_ans_end = model(x.to(device),
                                               attn_mask.to(device),
                                               token_type_ids.to(device),
                                               "SQUAD")
            for i in range(batch_size):
                c_len = int(torch.sum(token_type_ids[i]).item())
                si = softmax(out_ans_start[i][:c_len])
                ei = softmax(out_ans_end[i][:c_len])
                if predict_type == "BEER":
                    ans_start = torch.argmax(si)
                    ans_end = torch.argmax(ei)
                    if ans_start == 0 or ans_end == 0:
                        new_ans = ""
                    else:
                        si[0] = 0
                        ei[0] = 0
                        if ans_end < ans_start:
                            if si[ans_start] > ei[ans_end]:
                                ei[ans_end] = 0
                                ans_end = ans_start + 1 + torch.argmax(
                                    ei[ans_start + 1:])
                            else:
                                si[ans_start] = 0
                                ans_start = torch.argmax(si[:ans_end])
                        prev_start = ans_start
                        prev_end = ans_end
                        while (ans_end >= ans_start
                               and (ans_end.item() - ans_start.item() + 1) /
                               torch.sum(token_type_ids[i]).item() < want_len):
                            s_big = si[ans_start]
                            e_big = ei[ans_end]
                            si[ans_start] = 0
                            ei[ans_end] = 0
                            ans_start_2 = torch.argmax(si)
                            ans_end_2 = torch.argmax(ei)
                            s_sec = si[ans_start_2]
                            e_sec = ei[ans_end_2]
                            if s_big - s_sec > e_big - e_sec:
                                if ans_end_2 > ans_end and (ans_end.item(
                                ) - ans_start.item() + 1) / torch.sum(
                                        token_type_ids[i]).item() < want_len:
                                    ans_end = ans_end_2
                                if ans_start > ans_start_2 and (ans_end.item(
                                ) - ans_start.item() + 1) / torch.sum(
                                        token_type_ids[i]).item() < want_len:
                                    ans_start = ans_start_2
                            else:
                                if ans_start > ans_start_2 and (ans_end.item(
                                ) - ans_start.item() + 1) / torch.sum(
                                        token_type_ids[i]).item() < want_len:
                                    ans_start = ans_start_2
                                if ans_end_2 > ans_end and (ans_end.item(
                                ) - ans_start.item() + 1) / torch.sum(
                                        token_type_ids[i]).item() < want_len:
                                    ans_end = ans_end_2
                            if ans_end == prev_end and ans_start == prev_start:
                                si[ans_start_2] = 0
                                ei[ans_end_2] = 0
                            prev_start = ans_start
                            prev_end = ans_end
                    new_ans = tokenizer.decode(
                        (x[i][ans_start:ans_end + 1])[:40])

                elif predict_type == "SQUAD" or predict_type == "MOVIE":
                    if predict_type == "MOVIE":
                        si[0] = 0
                        ei[0] = 0
                    ans_start = torch.argmax(si)
                    ans_end = torch.argmax(ei)
                    if ans_start == 0 or ans_end == 0:
                        new_ans = ""
                    else:
                        new_ans = tokenizer.decode(
                            (x[i][ans_start:ans_end + 1])[:40])

                total_ans_len += min(ans_end - ans_start + 1, 40)
                #print("------------- ans_start: %d ---------------" % ans_start)
                #print("------------- ans_end: %d ---------------" % ans_end)
                #print(new_ans)
                if new_ans == "":
                    no_ans += 1
                new_ans = new_ans.replace("[UNK]",
                                          "").replace("[CLS]",
                                                      "").replace("[SEP]", "")
                if predict_type == "MOVIE":
                    if not (ids[i] in all_predict):
                        all_predict[ids[i]] = [""]
                    if new_ans != "":
                        if prev_last == ids[i]:
                            all_predict[ids[i]][-1] += (" " + new_ans)
                        else:
                            all_predict[ids[i]].append(new_ans)
                        if new_ans != "" and ans_end + 1 >= len(
                                ei) - 1:  #get last in sentence
                            prev_last = ids[i]
                        else:
                            prev_last = -1
                else:
                    all_predict[ids[i]] = new_ans

        iter_bar.set_description("Run iter")

    with open(predict_name, "w") as F:
        json.dump(all_predict, F)
    print("NO_ans: ", no_ans)
    print("AVG_len: ", total_ans_len / len(all_predict))
Example #11
def write_to_file(predictions):
    # the with block closes the file automatically
    with open('test.pred', 'w') as f:
        for p in predictions:
            f.write(str(p) + '\n')
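A trivial usage sketch (the prediction values are made up):

write_to_file([0, 1, 1, 0])
# test.pred now contains one prediction per line: 0, 1, 1, 0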
Example #12
def main():
    data_2 = '2_qubit_crit_data.npz'
    data_4 = '4_qubit_crit_data.npz'
    data_6 = '6_qubit_crit_data.npz'
    data_7 = '7_qubit_crit_data.npz'
    data_10 = '10_qubit_crit_data.npz'

    data_11 = '11_qubit_crit_data.npz'
    data_12 = '12_qubit_crit_data.npz'

    training_n_sizes = [2, 4, 7]
    validation_n_sizes = [6, 2, 4, 7]

    training_data_2 = get_dataset(data_2, 2, 10000)
    training_data_4 = get_dataset(data_4, 4, 10000)
    training_data_7 = get_dataset(data_7, 7, 10000)

    training_data_2, test_data_2 = random_split(training_data_2, [9000, 1000])
    training_data_4, test_data_4 = random_split(training_data_4, [9000, 1000])
    training_data_7, test_data_7 = random_split(training_data_7, [9000, 1000])

    training_data_2, val_data_2 = random_split(training_data_2, [8000, 1000])
    training_data_4, val_data_4 = random_split(training_data_4, [8000, 1000])
    training_data_7, val_data_7 = random_split(training_data_7, [8000, 1000])
    training_data_10 = get_dataset(data_10, 10, 10000)

    datasets = [training_data_2, training_data_4, training_data_7]

    training_loaders = [
        DataLoader(x, batch_size=32, shuffle=True, num_workers=20)
        for x in datasets
    ]

    val_data_6 = get_dataset(data_6, 6, 10000)
    val_data_6, test_data_6 = random_split(val_data_6, [9000, 1000])
    #val_data_11 = get_dataset(data_11,11,100)
    #val_data_12 = get_dataset(data_12,12,100)

    val_datasets = [val_data_6, val_data_2, val_data_4, val_data_7]
    val_loaders = [
        DataLoader(x, batch_size=10000, num_workers=20) for x in val_datasets
    ]

    test_loader_10 = DataLoader(training_data_10,
                                batch_size=10000,
                                num_workers=20)
    test_datasets = [test_data_2, test_data_4, test_data_6, test_data_7]
    test_loaders = [
        DataLoader(x, batch_size=1000, num_workers=20) for x in test_datasets
    ]
    test_loaders.append(test_loader_10)

    for mps_size in range(2, 11):
        err_name = "{}_dump_errors.p".format(mps_size)
        model_name = "{}_site_model.pt".format(mps_size)

        model, tot_err, val_err, t_errs, val_errs = mps_fit(
            mps_size, training_loaders, training_n_sizes, val_loaders,
            validation_n_sizes)
        loss_func = nn.MSELoss()
        model.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        test_sizes = [2, 4, 6, 7, 10]
        test_errors = {}
        with torch.no_grad():
            for j, loader in enumerate(test_loaders):
                sys_size = test_sizes[j]
                temp = 0
                for i, (fields, wf) in enumerate(loader):
                    fields = fields.to(device)
                    gs = model(fields.to(device), sys_size)
                    loss = loss_func(gs, wf.to(device))
                    print(loss.item() * (2**sys_size))
                    print(gs[0])
                    print(gs[-1])
                    temp += loss.item()

                test_errors[sys_size] = temp / len(loader)  # average of the accumulated batch losses
        test_error_file = open("{}_test_err_.p".format(mps_size), 'wb')
        pickle.dump(test_errors, test_error_file)
        test_error_file.close()

        errs = [tot_err, val_err, t_errs, val_errs]
        torch.save(model.state_dict(), model_name)
        f = open(err_name, 'wb')
        pickle.dump(errs, f)
        f.close()
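For later analysis, each pickled error dictionary can be read back like this (a sketch; mps_size 2 is just an example value):

import pickle

with open("2_test_err_.p", "rb") as f:
    test_errors = pickle.load(f)
print(test_errors)  # {system_size: mean test loss}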
Example #13
def run_experiment(testChrom, test_num, state_count, cross_fold, INITIAL_T,
                   INITIAL_E, INITIAL_T0):

    # initialize the string which will eventually be a formatted data row in a CSV file
    data_string = testChrom[0:-1] + ", " + str(state_count) + ", " + str(
        test_num) + ", " + str(cross_fold)

    # initialize list which will be the list of observations in the training set
    observations = []

    # initialize observation count
    count = 0

    # flag for determining if we've reached the correct point in the input file for the chromosome
    start = True

    # open file with nucleosome vector sequences
    with open("yeast_vector_sequence_for_HMM.txt") as f:
        # for each line in the file
        for line in f:
            # if we've reached the beginning of the chromosome we're interested in
            if testChrom in line:
                #set flag indicating to start reading in the file
                start = True
            # if it's the right kind of line, and the start flag has been set to true, start processing
            if "K36me" not in line and start and "chr" not in line and (
                    "0" in line or "1" in line) and "{" not in line:
                # increment the observation count
                count = count + 1

                # get portion of the line which has the nucleosome vector
                line = line[0:51]

                # add the observation to the list of observations
                observations.append(line.replace(",", ""))

            # if we've reached the end of the chromosome in the file
            if start and testChrom not in line and "chr" in line:
                # stop processing; we're past the chromosome of interest
                start = False

    # the with block above closes the file automatically

    # is actually 10 because the test seq/validation seq are flipped half the time
    NUMBER_OF_FOLDS = 5

    half_string = "fullchromosome"

    flip_test_seq = False

    adjusted_cross_fold = cross_fold

    if cross_fold >= 5:
        flip_test_seq = True
        adjusted_cross_fold = adjusted_cross_fold - 5

    chunks = list(
        get_chunks(observations, int(len(observations) / NUMBER_OF_FOLDS)))

    while len(chunks) > NUMBER_OF_FOLDS:
        chunks[-2] = chunks[-2] + chunks[-1]
        del chunks[-1]

    held_observations = chunks[adjusted_cross_fold]
    final_test_observations = []
    observations = []

    for i in range(len(chunks)):
        if i != adjusted_cross_fold:
            observations.extend(chunks[i])

    if not flip_test_seq:
        final_test_observations = held_observations[len(held_observations) //
                                                    2:]
        held_observations = held_observations[:len(held_observations) // 2]
    else:
        final_test_observations = held_observations[:len(held_observations) //
                                                    2]
        held_observations = held_observations[len(held_observations) // 2:]

    representedHistones = {}
    representedHistones[12] = True
    representedHistones[8] = True
    representedHistones[3] = True
    representedHistones[17] = True

    # S = the number of states
    S = state_count

    # initialize the model, using the initial matrices passed in as arguments
    model = HiddenMarkovModel(INITIAL_T,
                              INITIAL_E,
                              INITIAL_T0,
                              representedHistones,
                              maxStep=200,
                              epsilon=0.1)

    # calculate the time I start running
    start_time = datetime.datetime.now()

    test_index = 12

    # run Baum-Welch EM on the training data, and save the learned matrices
    trans0, transition, emission, final_likelihood, c = model.Baum_Welch_EM(
        observations, held_observations, test_index)

    # print the initial and final matrices, in case I need them later or want to test something
    torch.set_printoptions(profile="full")
    print("Initial Matrices")
    print("INITIAL_T")
    print(INITIAL_T)
    print("INITIAL_E")
    print(INITIAL_E)
    print("INITIAL_T0")
    print(INITIAL_T0)

    print("Final Matricies")
    print("transition")
    print(transition)
    print("emission")
    print(emission)
    print("trans0")
    print(trans0)

    # add the final matrices to a list of all of the best matrices for this state number
    ts_for_experiment.append(transition)
    t0s_for_experiment.append(trans0)
    es_for_experiment.append(emission)

    # calculate and print the total time it took to run
    total_time = datetime.datetime.now() - start_time
    print("total time: " + str(total_time))

    # add the total time and final training seq log likelihood to the data string
    data_string = data_string + ", " + str(total_time) + ", " + str(
        final_likelihood)

    # create a list representing the lengths of the various test sequences I want
    test_sequence_lengths = [5, 10, 20, 100, 0]  # 0 for all

    # dictionary representing the testing sequence log likelihoods for each test sequence length
    heldBackLikelihoods = dict()

    # for each of these lengths
    for length in test_sequence_lengths:
        # initialize an empty test sequence
        test_seq = []

        # the 0 length is used to represent the whole 10% of the genome
        if length == 0:
            # so set the test_seq and the test_genome_data variables to be the
            # entire dataset held back from the training set
            test_seq = final_test_observations
        else:
            # otherwise, set the variables to a subset of the held back training data
            test_seq = final_test_observations[0:length]

        # print the length we're using
        print("test seq length: " + str(len(test_seq)))

        # get the log likelihood of the test sequence, given the relevant genome data
        held_back_likelihood = model.get_likelihood_of_seq(test_seq)

        # print the log likelihood
        print("final test observations held_back_likelihood")
        print(held_back_likelihood)

        # add the LL to the data string
        data_string = data_string + ", " + str(held_back_likelihood.detach())

        # add the LL for the test sequence length to the dict
        heldBackLikelihoods[length] = held_back_likelihood

    # use a very similar loop to generate some stats for these predictions
    # this very similar loop is separate because of how I like to format the data
    for length in test_sequence_lengths:
        # initialize an empty test sequence
        test_seq = []

        # the 0 length is used to represent the whole 10% of the genome
        if length == 0:
            # so set the test_seq and the test_genome_data variables to be the
            # entire dataset held back from the training set
            test_seq = final_test_observations
        else:
            # otherwise, set the variables to a subset of the held back training data
            test_seq = final_test_observations[0:length]

        # predict an emission sequence, using the guess_sequence_max method, and print the associated stats
        print("Logreg Guess Statistics:")

    data_string = data_string + "\n"
    all_results.append(data_string)

    # print the final data string
    print(data_string)

    # return the LL of the longest test seq
    return heldBackLikelihoods[0], model.T, model.E, model.T0
Example #14
def run_experiment(testChrom, test_num, state_count, random, cross_fold):
    test_index = 12

    # initialize the string which will eventually be a formatted data row in a CSV file
    data_string = testChrom[0:-1] + ", " + str(state_count) + ", " + str(test_num) + ", " + str(cross_fold)  

    # initialize list which will be the list of observations in the training set
    observations = []
    # initialize observation count
    count = 0
    # initialize list which will represent the genome data for the observation sequence
    observation_genome_data = []

    # flag for determining if we've reached the correct point in the input file for the chromosome
    start = True 

    # initialize values which will represent the average A/T/C/G counts
    avgA = 0
    avgT = 0
    avgC = 0
    avgG = 0

    # open file with nucleosome vector sequences
    with open("yeast_vector_sequence_for_HMM.txt") as f:
        # for each line in the file
        for line in f:
            # if we've reached the beginning of the chromosome we're interested in
            if testChrom in line:
                #set flag indicating to start reading in the file
                start = True
                
            # if it's the right kind of line, and the start flag has been set to true, start processing
            if "K36me" not in line and start and "chr" not in line and ("0" in line or "1" in line) and "{" not in line:
                # increment the observation count
                count = count + 1

                # generate an array representing the A/T/C/G counts for this nucleosome
                genomeData = line[52:]
                genomeData = genomeData.strip()
                genomeData = genomeData.split(",")
                genomeData = list(map(int, genomeData))

                # update the total A/T/C/G counts
                # this will be used to compute the average A/T/C/G counts after this loop
                avgA = avgA + genomeData[0]
                avgT = avgT + genomeData[1]
                avgC = avgC + genomeData[2]
                avgG = avgG + genomeData[3]

                # get portion of the line which has the nucleosome vector
                line = line[0:51]

                # add the genome data to the list of genome data entries
                observation_genome_data.append(genomeData)

                # add the observation to the list of observations
                observations.append(line.replace(",", ""))

            # if we've reached the end of the chromosome in the file
            if start and testChrom not in line and "chr" in line:  # and "chr2" not in line:
                # stop processing; we're past the chromosome of interest
                start = False

    # the with block above closes the file automatically

    # compute the average A/T/C/G counts for this chromosome
    avgA = avgA / len(observations)
    avgT = avgT / len(observations)
    avgC = avgC / len(observations)
    avgG = avgG / len(observations)


    NUMBER_OF_FOLDS = 10

    chunks = get_chunks(observations, int(len(observations)/NUMBER_OF_FOLDS), NUMBER_OF_FOLDS)
    genome_data_chunks = get_chunks(observation_genome_data, int(len(observation_genome_data)/NUMBER_OF_FOLDS), NUMBER_OF_FOLDS)

    final_test_index = cross_fold + 1
    if(final_test_index == len(chunks)):
      final_test_index = 0

    held_observations = chunks[cross_fold]
    held_observations_genome_data = genome_data_chunks[cross_fold]
    final_test_observations = chunks[final_test_index]
    final_test_observations_genome_data = genome_data_chunks[final_test_index]
    observations = []
    observation_genome_data = []

    for i in range(len(chunks)):
        if i != final_test_index:
            observations.extend(chunks[i])
            observation_genome_data.extend(genome_data_chunks[i])

    # S = the number of states
    S = state_count

    # if I've set the random flag when setting up this experiment, then I want to
    # generate random initial matrices, instead of using the ones we learned with the standard HMM
    if random:

        # get random values summing to 1 for each row, using the dirichlet distribution
        INITIAL_T = np.random.dirichlet(np.ones(S), 1)
        for i in range(S - 1):
            INITIAL_T = np.vstack([INITIAL_T, np.random.dirichlet(np.ones(S), 1)])

        # get random values summing to 1 for each row, using the dirichlet distribution
        INITIAL_E = np.random.dirichlet(np.ones(S), 1)
        for i in range(25):
            INITIAL_E = np.vstack([INITIAL_E, np.random.dirichlet(np.ones(S), 1)])

        # get random values summing to 1 for each row (in this case only 1 row), using 
        # the dirichlet distribution
        INITIAL_T0 = np.random.dirichlet(np.ones(S), 1)

    # if we don't want random initial matrices, then we use the ones learned in the standard HMM
    else:
      INITIAL_T, INITIAL_E, INITIAL_T0 = get_premade_matricies(S, cross_fold)


    representedHistones = {}
    representedHistones[12] = True
    representedHistones[8] = True
    representedHistones[3] = True
    representedHistones[17] = True

    # initialize the model, using the initial matrices generated above
    model = HiddenMarkovModel(INITIAL_T, INITIAL_E, INITIAL_T0, representedHistones, maxStep=50, epsilon = 0.001)

    model.final_test = final_test_observations
    model.final_test_data = final_test_observations_genome_data


    # calculate the time I start running
    start_time = datetime.datetime.now()

    # start the learning of the genomic weights, and save the learned matricies
    for c in range(len(chunks)):
      
      if c != final_test_index and c != cross_fold:
        chunk = chunks[c]
        data_chunk = genome_data_chunks[c]

        print("For sequence " + str(c) + " with test sequence " + str(final_test_index)+ " and with validation sequence " + str(cross_fold))

        model.learn_genomic_weights(chunk, data_chunk, avgA, avgT, avgC, avgG, held_observations, held_observations_genome_data, test_index)

    # print the learned genomic weights
    torch.set_printoptions(profile="full")
    print("Best Genomic Weights")
    print(model.genomic_weights)

    # calculate and print how much time it took to run
    total_time = datetime.datetime.now() - start_time
    print("total time: " + str(total_time))

    # add the total time and final log likelihood to the formatted CSV row
    data_string = data_string + ", " + str(total_time) + ", " + str(model.scale_plot[-1].detach().numpy())

    # create a list representing the lengths of the various test sequences I want
    test_sequence_lengths = [5, 10, 20, 100, 0] # 0 for all

    # for each of these lengths
    for length in test_sequence_lengths:

        # initialize an empty test sequence
        test_seq = []

        # the 0 length is used to represent the whole 10% of the genome
        if length == 0:
            # so set the test_seq and the test_genome_data variables to be the
            # entire dataset held back from the training set
            test_seq = final_test_observations
            test_genome_data = final_test_observations_genome_data
        else:
            # otherwise, set the variables to a subset of the held back training data 
            test_seq = final_test_observations[0:length]
            test_genome_data = final_test_observations_genome_data[0:length]

        # print the length we're using
        print("test seq length: " + str(len(test_seq)))

        # get the log likelihood of the test sequence, given the relevant genome data
        held_back_likelihood = model.get_likelihood_of_seq(test_seq, test_genome_data)

        # print the log likelihood
        print("held_back_likelihood")
        print(held_back_likelihood)

        # add the log likelihood to the formatted string
        data_string = data_string + ", " + str(held_back_likelihood.detach())


    # use a very similar loop to generate some stats for these predictions
    # this very similar loop is separate because of how I like to format the data
    for length in test_sequence_lengths:

        # initialize an empty test sequence
        test_seq = []

        # the 0 length is used to represent the whole 10% of the genome
        if length == 0:
            # so set the test_seq and the test_genome_data variables to be the
            # entire dataset held back from the training set
            test_seq = final_test_observations
            test_genome_data = final_test_observations_genome_data
        else:
            # otherwise, set the variables to a subset of the held back training data 
            test_seq = final_test_observations[0:length]
            test_genome_data = final_test_observations_genome_data[0:length]

    # print the final data string
    data_string = data_string + "\n"
    print(data_string)

    # if it's made it this far, then the experiment was successful
    return model.genomic_weights
Example #15
def save_word_embeddings(directory, file, embeddings_index):
    # write one "word v1 v2 ... vn" line per embedding
    f = open(os.path.join(directory, file), 'w')
    for word, vec in embeddings_index.items():
        f.write(word + ' ' + ' '.join(str(x) for x in vec) + '\n')
    f.close()
Example #16
    def memory_write(self, memory_new):
        # overwrite the memory file with the pickled new contents
        F = open(self.file_name, 'wb')
        pickle.dump(memory_new, F)
        F.close()
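memory_write is the counterpart of memory_read in Example #6. As a self-contained sketch, both methods could live on a small persistence helper like the one below (the class name Memory and the file name are assumptions, not from the original code):

import pickle

class Memory:
    def __init__(self, file_name):
        self.file_name = file_name

    def memory_read(self):
        # return the stored list, or an empty list if nothing usable was written yet
        try:
            with open(self.file_name, 'rb') as f:
                return pickle.load(f)
        except (OSError, EOFError):
            return []

    def memory_write(self, memory_new):
        with open(self.file_name, 'wb') as f:
            pickle.dump(memory_new, f)

m = Memory("memory.pkl")
m.memory_write([1, 2, 3])
print(m.memory_read())  # [1, 2, 3]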
Example #17
def Dict2File(Dict, filename):
    # append the dictionary's string representation to the file
    F = open(filename, 'a+')
    F.write(str(Dict))
    F.close()
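Because the file stores the dictionary's repr, one way to read a single saved entry back is ast.literal_eval (a sketch; it assumes exactly one dictionary has been appended to the file, and the file name is invented):

import ast

Dict2File({"accuracy": 0.91}, "results.txt")
with open("results.txt") as f:
    restored = ast.literal_eval(f.read())
print(restored["accuracy"])  # 0.91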