Example #1
    def write_performance(self, model_type, metrics, train_time):
        def get_header_string():
            header = [
                'acc', 'mr_f', 'mrr_f', 'mr_p', 'mrr_p', 'pre', 'rec', 'f1',
                'S', 'D', 'B', 'E', 'l_rate', 'regu', 'drop',
                'data_set', 't_sec', 'model_type'
            ]
            return format_list_to_string(header, '\t')

        def get_perf_string(metrics, train_time):
            content = []
            content.extend(metrics[:-1])
            content.extend([self.n_sense, self.dim, self.batch_size, self.n_epoch, self.learning_rate,\
                            self.regu_strength, self.dropout, self.dataset, train_time, model_type])
            return format_list_to_string(content, '\t')

        # quantitative analysis
        header_string = get_header_string()
        perf_string = get_perf_string(metrics, train_time)
        print header_string + '\n' + perf_string
        file_header = read_first_line(self.perf_file)
        with open(self.perf_file, 'a') as fout:
            if file_header != header_string:
                fout.write(header_string + '\n')
            fout.write(perf_string + '\n')
        # error analysis
        error_indicator = metrics[-1]
        error_indicator_file = self.instance_analysis_path + str(model_type) + '.txt'
        ensure_directory_exist(error_indicator_file)
        with open(error_indicator_file, 'w') as fout:
            for element in error_indicator:
                fout.write(str(element) + '\n')
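
Most of the examples on this page call three helpers whose definitions are not shown: ensure_directory_exist, format_list_to_string, and read_first_line. The sketch below is inferred from the call sites only, so the real implementations may differ in detail.

import os

def ensure_directory_exist(file_name):
    # Create the parent directory of file_name if it does not exist yet.
    directory = os.path.dirname(file_name)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)

def format_list_to_string(elements, sep):
    # Join the elements with sep, flattening nested lists recursively
    # (Example #3 passes nested [description, score] pairs).
    parts = []
    for e in elements:
        if isinstance(e, (list, tuple)):
            parts.append(format_list_to_string(e, sep))
        else:
            parts.append(str(e))
    return sep.join(parts)

def read_first_line(file_name):
    # Return the first line of the file without its trailing newline,
    # or None if the file does not exist yet.
    if not os.path.exists(file_name):
        return None
    with open(file_name, 'r') as fin:
        return fin.readline().rstrip('\n')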
Example #2
def write_to_file(instances, x_vocab, y_vocab, output_file):
    ensure_directory_exist(output_file)
    with open(output_file, 'w') as fout:
        for instance in instances:
            instance_string = format_instance_to_string(
                instance, x_vocab, y_vocab)
            fout.write(instance_string + '\n')
Example #3
    def write_one_case(self, idx, neighbor_ids, scores):
        neighbor_info = [idx]
        description = self.vocab.get_description(idx)
        neighbor_info.append(description)
        for (neighbor_idx, score) in zip(neighbor_ids, scores):
            description = self.vocab.get_description(neighbor_idx)
            neighbor_info.append([description, score])
        neighbor_info = format_list_to_string(neighbor_info, ' ')
        # print neighbor_info
        ensure_directory_exist(self.case_output_file)
        with open(self.case_output_file, 'a') as fp:
            fp.write(neighbor_info + '\n')
Example #4
def write_performance(pd, model_type, metrics, train_time):
    header = ['time', 'acc', 'mr_f', 'mrr_f', 'mr_p', 'mrr_p',\
              'n_sense', 'dim', 'n_epoch', 'data_dir', 'model_type']
    content = [train_time, metrics[0], metrics[1], metrics[2], metrics[3], metrics[4],\
               pd['n_sense'], pd['embedding_dim'], pd['n_epoch'], pd['data_dir'], model_type]
    header_string = format_list_to_string(header, '\t')
    content_string = format_list_to_string(content, '\t')
    print header_string + '\n' + content_string
    # write to file
    perf_file = pd['performance_file']
    ensure_directory_exist(perf_file)
    file_header = read_first_line(perf_file)
    with open(perf_file, 'a') as fout:
        if file_header != header_string:
            fout.write(header_string + '\n')
        fout.write(content_string + '\n')
Example #5
    def __init__(self, opt):
        self.opt = opt
        self.n_sense = opt['n_sense']
        self.dim = opt['embedding_dim']
        self.n_epoch = opt['n_epoch']
        self.dataset = opt['data_dir'].split('/')[-2]
        self.batch_size = opt['batch_size']
        self.learning_rate = opt['learning_rate']
        self.regu_strength = opt['regu_strength']
        self.dropout = opt['dropout']
        self.perf_file = opt['data_dir'] + 'output/performance.txt'
        self.instance_analysis_path = opt['data_dir'] + 'instance/'
        ensure_directory_exist(self.perf_file)
        self.test_data_file = opt['data_dir'] + 'input/test.txt'
        self.x_vocab_file = opt['data_dir'] + 'input/words.txt'
        self.criterion = nn.NLLLoss()
Example #6
    def find_duplicate_sense_words(self):
        if self.n_sense != 2:
            print 'Similar sense detection is only supported for n_sense = 2.'
            return
        print 'Start detecting duplicate senses for %s words.' % self.n_words
        words = []
        for i in xrange(self.n_words):
            idx = i * self.n_sense + 1
            vec_a = self.emb_matrix[idx]
            vec_b = self.emb_matrix[idx + 1]
            if self.is_similar(vec_a, vec_b, mode='norm'):
                word = self.vocab.get_description(idx)
                words.append(word)
        ensure_directory_exist(self.dup_sense_file)
        with open(self.dup_sense_file, 'w') as fout:
            for w in words:
                fout.write(w + '\n')
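
is_similar is not defined in this snippet; the mode='norm' argument suggests the two sense vectors are compared after length-normalization, i.e. by cosine similarity. A hypothetical sketch, with an arbitrary threshold:

import numpy as np

def is_similar(vec_a, vec_b, mode='norm', threshold=0.9):
    # Hypothetical: treat two sense vectors as duplicates when their
    # cosine similarity exceeds the threshold. The real implementation
    # and threshold are not shown in the source.
    if mode != 'norm':
        raise ValueError('only mode="norm" is sketched here')
    a = np.asarray(vec_a, dtype=float)
    b = np.asarray(vec_b, dtype=float)
    cos = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return cos >= threshold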
Example #7
    def write_cluster_members(self, clus, cluster_file, parent_dir):
        n_cluster = clus.n_cluster
        clusters = clus.clusters  # a dict: cluster id -> member keyword ids
        with open(cluster_file, 'w') as fout:
            for clus_id in range(n_cluster):
                members = clusters[clus_id]
                for keyword_id in members:
                    keyword = self.keywords[keyword_id]
                    fout.write(str(clus_id) + '\t' + keyword + '\n')
        # write the seed keywords for each cluster's sub-folder
        clus_centers = clus.center_ids
        for clus_id, center_keyword_id in clus_centers:
            center_keyword = self.keywords[center_keyword_id]
            output_file = parent_dir + center_keyword + '/seed_keywords.txt'
            ensure_directory_exist(output_file)
            members = clusters[clus_id]
            with open(output_file, 'w') as fout:
                for keyword_id in members:
                    keyword = self.keywords[keyword_id]
                    fout.write(keyword + '\n')
Example #8
    def write_document_membership(self, clus, output_file, parent_dir):
        n_cluster = clus.n_cluster
        keyword_membership = clus.membership  # an array with the cluster membership of each keyword
        cluster_document_map = defaultdict(list)  # key: cluster id, value: list of document ids
        with open(output_file, 'w') as fout:
            for idx, doc in zip(self.original_doc_ids, self.documents):
                doc_membership = self.get_doc_membership(n_cluster, doc, keyword_membership)
                cluster_id = self.assign_document(doc_membership)
                cluster_document_map[cluster_id].append(idx)
                fout.write(str(idx) + '\t' + str(cluster_id) + '\n')
        # write the document ids for each cluster's sub-folder
        clus_centers = clus.center_ids
        for clus_id, center_keyword_id in clus_centers:
            center_keyword = self.keywords[center_keyword_id]
            member_file = parent_dir + center_keyword + '/doc_ids.txt'
            ensure_directory_exist(member_file)
            doc_ids = cluster_document_map[clus_id]
            with open(member_file, 'w') as fout:
                for doc_id in doc_ids:
                    fout.write(str(doc_id) + '\n')
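
get_doc_membership and assign_document are not shown either. Given that keyword_membership maps each keyword id to a cluster id, a plausible reading is: score each cluster by how many of the document's keywords belong to it, then assign the document to the best-scoring cluster. A hypothetical sketch, written as plain functions and assuming doc is the list of keyword ids occurring in the document:

def get_doc_membership(n_cluster, doc, keyword_membership):
    # Count, per cluster, how many of the document's keywords fall into it.
    membership = [0] * n_cluster
    for keyword_id in doc:
        membership[keyword_membership[keyword_id]] += 1
    return membership

def assign_document(doc_membership):
    # Assign the document to the cluster covering most of its keywords.
    return doc_membership.index(max(doc_membership))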
Example #9
def train_neg(train_data, model, criterion, optimizer, model_type, pd):
    forward_time, backward_time = 0, 0
    n_epoch = pd['n_epoch']
    train_log_file = pd['train_log_file']
    ensure_directory_exist(train_log_file)
    with open(train_log_file, 'a') as fout:
        # train
        for epoch in xrange(n_epoch):
            running_loss = 0.0
            for i in xrange(len(train_data)):
                # get the input
                inputs, labels = train_data[i]
                inputs = Variable(torch.LongTensor(inputs))
                noise_labels = train_data.sample_negatives(5, labels[0])
                labels = labels + noise_labels  # build a new list so the cached labels are not mutated every epoch
                labels = Variable(torch.LongTensor(labels))

                f_start_time = time.time()
                output = model(inputs, labels)
                f_end_time = time.time()
                forward_time += f_end_time - f_start_time

                loss = criterion(output)
                # zero the parameter gradients
                optimizer.zero_grad()
                # backward + optimize

                b_start_time = time.time()
                loss.backward()
                optimizer.step()
                b_end_time = time.time()
                backward_time += (b_end_time - b_start_time)

                # print statistics
                running_loss += loss.data[0]
                if (i + 1) % 2000 == 0:
                    print('%20s [%d, %5d]  training loss: %.3f' % (model_type, epoch+1, i+1, running_loss/2000))
                    fout.write('%20s [%d, %5d]  training loss: %.3f\n' % (model_type, epoch+1, i+1, running_loss/2000))
                    running_loss = 0.0
    print 'forward time:', forward_time
    print 'backward time:', backward_time
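
This loop is written against Python 2 and pre-0.4 PyTorch: inputs are wrapped in Variable and the scalar loss is read via loss.data[0]. On PyTorch 0.4 and later, tensors are used directly and loss.item() extracts the scalar. A sketch of the equivalent inner step under that assumption, not the author's code:

inputs = torch.LongTensor(inputs)    # no Variable wrapper needed
labels = torch.LongTensor(labels + noise_labels)
output = model(inputs, labels)
loss = criterion(output)
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()          # replaces loss.data[0]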
Example #10
    def save_model(self, model, model_type):
        model_name = self.get_model_name(model_type)
        file_name = self.model_path + model_name
        ensure_directory_exist(file_name)
        torch.save(model.state_dict(), file_name)