Example #1
0
def write_netfile(wtoi, data_set, word_file_path, idx_file_path):
    d = {}
    for doc in data_set:
        words = list(set(doc))
        for i in range(len(words)-1):
            for j in range( i+1 , len(words)):
                pair_1 = (words[i], words[j])
                pair_2 = (words[j], words[i])
                if pair_1 in d:
                    d[pair_1] += 1
                    d[pair_2] += 1
                else:
                    d[pair_1] = 1
                    d[pair_2] = 1
    trace('{} pairs (edges) collected.'.format(len(d)), file=config.log_file)
    with open(word_file_path, 'w') as writer1 , open(idx_file_path, 'w') as writer2:
        for pair_set , value in d.items():
            if value<5:
                continue
            w1,w2 = pair_set
            writer1.write('{} {} {}\n'.format(w1,w2,value))
            writer2.write('{} {} {}\n'.format(wtoi[w1],wtoi[w2],value))
            #w.write('{} {} {}\n'.format(w2,w1,value))
        # both files are closed automatically when the 'with' block exits
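
A minimal, standalone sketch of the same pair-counting idea. The toy docs and wtoi below are hypothetical; the real function also logs via trace() and drops edges whose weight is below 5 before writing.

from collections import Counter
from itertools import combinations

docs = [['graph', 'node', 'edge'], ['graph', 'edge', 'weight']]
wtoi = {'graph': 0, 'node': 1, 'edge': 2, 'weight': 3}

counts = Counter()
for doc in docs:
    words = list(set(doc))                 # count each pair at most once per document
    for w1, w2 in combinations(words, 2):
        counts[(w1, w2)] += 1              # keep both directions,
        counts[(w2, w1)] += 1              # as write_netfile does

for (w1, w2), weight in counts.items():
    print(w1, w2, weight)                  # word-level edge line
    print(wtoi[w1], wtoi[w2], weight)      # index-level edge line
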
Example #2
0
def train_model(model, optimizer, loss_func, train_data_iter, valid_data_iter,
                config):

    trainer = Trainer(model, loss_func, optimizer, config)

    for epoch in range(1, config.epochs + 1):
        train_iter = iter(train_data_iter)
        valid_iter = iter(valid_data_iter)

        # train
        train_stats = trainer.train(train_iter, epoch,
                                    train_data_iter.num_batches)

        print('')
        trace('Epoch %d, Train acc: %g, ppl: %g' %
              (epoch, train_stats.accuracy(), train_stats.ppl()))

        # validate
        valid_stats = trainer.validate(valid_iter)
        trace('Epoch %d, Valid acc: %g, ppl: %g' %
              (epoch, valid_stats.accuracy(), valid_stats.ppl()))

        # # log
        # train_stats.log("train", config.model_name, optimizer.lr)
        # valid_stats.log("valid", config.model_name, optimizer.lr)

        # update the learning rate
        trainer.lr_step(valid_stats.ppl(), epoch)

        # dump a checkpoint if needed.
        trainer.dump_checkpoint(epoch, config, train_stats)
Example #3
0
 def make_vocab(self):
     if self.cur_counter is None:
         return
     for i, w in enumerate(self.cur_counter.keys()):
         self.itow[i] = w
         self.wtoi[w] = i
     trace('made vocab size {}'.format(len(self.itow)), self.config.log_file)
Example #4
0
 def update_dataset(self):
     new_times = []
     new_docs = []
     old_c = len(self.times)
     for time, doc in zip(self.times, self.docs):
         new_doc = [word for word in doc.split() if word in self.vocab]
         if len(new_doc) < self.config.min_num_words:
             continue
         new_times.append(time)
         new_docs.append(new_doc)
     self.times = new_times
     self.docs = new_docs
     new_c = len(self.times)
     trace('update dataset from {} to {}'.format(old_c, new_c), self.config.log_file)  
Example #5
0
def build_twnp(assignments, centroids, nodes_i2w, word_emb, wtoi):
    num_topics = len(centroids)
    num_words = len(wtoi)
    trace('tw_np shape: {} x {}, non-zero values (word_emb): {}'.format(
        num_topics, num_words, len(word_emb)), file=config.log_file)
    tw_np = np.zeros((num_topics, num_words))
    tw_np = neg_init(tw_np, -99)  # fill with a large negative default score
    print('initialized tw_np with negative default values')
    
    for ass_idx, node_indices in assignments.items():
        for node_idx in node_indices:
            word = nodes_i2w[node_idx] 
            word_idx = wtoi[word]
                
            dist = distance(centroids[ass_idx], word_emb[word])
            tw_np[ass_idx][word_idx] = -dist
    return tw_np
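
A small numpy sketch of the same matrix-filling step, assuming Euclidean distance in place of the distance() helper and np.full in place of neg_init; the assignments, centroids and embeddings below are toy values.

import numpy as np

wtoi = {'graph': 0, 'node': 1, 'edge': 2}
nodes_i2w = {0: 'graph', 1: 'node', 2: 'edge'}
word_emb = {'graph': np.array([0.1, 0.2]),
            'node': np.array([0.3, 0.1]),
            'edge': np.array([0.0, 0.5])}
centroids = [np.array([0.2, 0.2]), np.array([0.0, 0.4])]
assignments = {0: [0, 1], 1: [2]}              # topic index -> node indices

tw_np = np.full((len(centroids), len(wtoi)), -99.0)  # default: very low score
for topic_idx, node_indices in assignments.items():
    for node_idx in node_indices:
        word = nodes_i2w[node_idx]
        dist = np.linalg.norm(centroids[topic_idx] - word_emb[word])
        tw_np[topic_idx, wtoi[word]] = -dist   # closer word -> higher score
print(tw_np)
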
Example #6
0
    def update_lr(self, ppl, epoch):
        """
        Decay learning rate if val perf does not improve
        or we hit the start_decay_at limit.
        """

        if self.start_decay_at is not None and epoch >= self.start_decay_at:
            self.start_decay = True
        # if self.last_ppl is not None and ppl > self.last_ppl:
        #     self.start_decay = True

        if self.start_decay:
            self.lr = self.lr * self.lr_decay_rate
            trace("Decaying learning rate to %g" % self.lr)

        self.last_ppl = ppl
        if self.method != 'sparseadam':
            self.optimizer.param_groups[0]['lr'] = self.lr
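
A toy illustration of the schedule above: once epoch reaches start_decay_at, the rate is multiplied by lr_decay_rate every epoch (the numbers here are hypothetical).

lr, lr_decay_rate, start_decay_at = 1e-3, 0.5, 8
for epoch in range(1, 13):
    if epoch >= start_decay_at:
        lr *= lr_decay_rate
    print('epoch {:2d}  lr {:g}'.format(epoch, lr))
# epochs 1-7 keep lr = 0.001; epoch 8 halves it to 0.0005, and so on
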
Example #7
0
def main():
    # Load checkpoint if we resume from a previous training.

    args, parser = parse_args("train")
    config = read_config(args, parser, args.config)
    trace(format_config(config))
    train_data_iter = DataBatchIterator(config=config,
                                        is_train=True,
                                        dataset="train",
                                        batch_size=config.batch_size,
                                        shuffle=True)
    train_data_iter.load()

    src_vocab = train_data_iter.src_vocab
    trg_vocab = train_data_iter.trg_vocab

    check_save_path(config.save_vocab)
    torch.save(src_vocab, config.save_vocab + "." + config.src_lang)
    torch.save(trg_vocab, config.save_vocab + "." + config.trg_lang)
    valid_data_iter = DataBatchIterator(config=config,
                                        is_train=True,
                                        dataset="dev",
                                        batch_size=config.valid_batch_size)
    valid_data_iter.set_vocab(src_vocab, trg_vocab)
    valid_data_iter.load()

    # Build model.
    model = model_factory(config, src_vocab, trg_vocab)
    # if len(config.gpu_ids) > 1:
    #     trace('Multi gpu training: ', config.gpu_ids)
    #     model = nn.DataParallel(model, device_ids=config.gpu_ids, dim=1)

    trace(model)

    # Build optimizer.
    optimizer = build_optimizer(model, config)

    padding_idx = trg_vocab.stoi[PAD_WORD]
    # Build loss functions for training set and validation set.
    loss_func = NMTLoss(config, padding_idx)
    # Do training.
    train_model(model, optimizer, loss_func, train_data_iter, valid_data_iter,
                config)
Example #8
0
 def seg(self, unit=None, time_period=None):
     
     # group document indices into time buckets according to the configured unit and period
     d = {}
     for t_i, time in enumerate(self.times):
         new_datetime = trans_datetime(cur_date=time, unit=self.config.unit,
                                       time_period=self.config.time_period,start_date=self.start_date)
         if new_datetime in d:
             d[new_datetime].append(t_i)
         else:
             d[new_datetime] = [t_i]
     #d = list(filter(lambda x:len(x[1])>self.config.min_num_docs, d.items()))
     d = sorted(d.items(), key=lambda x:x[0])
     #print(d)
     self.sorted_idx = []
     self.time_slices = []
     self.times_tag = []
     for i in range(len(d)-1):
         if len(d[i][1])<self.config.min_num_docs:
             continue
         self.time_slices.append(len(d[i][1]))
         self.sorted_idx += d[i][1]
         self.times_tag.append(str(d[i][0]))
     # cut the last time slice for the test set
     i = -1
     cut = 2000
     if len(d[i][1])<cut:
         self.time_slices.append(len(d[i][1]))
         self.sorted_idx += d[i][1]
         self.times_tag.append(str(d[i][0])+'_test')
     else:
         self.time_slices.append(len(d[i][1][:-cut]))
         self.sorted_idx += d[i][1][:-cut]
         self.times_tag.append(str(d[i][0]))
         
         self.time_slices.append(len(d[i][1][-cut:]))
         self.sorted_idx += d[i][1][-cut:]
         self.times_tag.append(str(d[i][0])+'_test')          
         
     trace('seg documents({}) to {} slices [{}] based on [{}]'.format(
             sum(self.time_slices), len(self.time_slices),
             ','.join([str(i) for i in self.time_slices]),
             ','.join(self.times_tag)),
             self.config.log_file)
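
A standalone sketch of the group-sort-slice pattern used by seg(), with a simple (year, month) key standing in for trans_datetime, a toy min_num_docs, and the test-set cut of the last slice omitted.

from datetime import datetime

times = [datetime(2020, 1, 5), datetime(2020, 1, 9), datetime(2020, 2, 1),
         datetime(2020, 2, 2), datetime(2020, 2, 3)]
min_num_docs = 2

buckets = {}
for t_i, time in enumerate(times):
    key = (time.year, time.month)          # stand-in for trans_datetime
    buckets.setdefault(key, []).append(t_i)

sorted_idx, time_slices, times_tag = [], [], []
for key, indices in sorted(buckets.items()):
    if len(indices) < min_num_docs:
        continue                           # drop sparse time slices
    sorted_idx += indices
    time_slices.append(len(indices))
    times_tag.append(str(key))
print(time_slices, times_tag)              # [2, 3] ['(2020, 1)', '(2020, 2)']
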
Example #9
0
def read_config(args, args_parser, config_file=None):

    if config_file is None:
        return args_parser.parse_args()
    if not os.path.isfile(config_file):
        trace('Cannot find the configuration file: '
              '{} does not exist! Please check.'.format(config_file))
        sys.exit(1)
    config = SafeConfigParser()  # deprecated alias of ConfigParser (removed in Python 3.12)
    config.read(config_file)
    for section in config.sections():
        default = get_correct_args(config, config.items(section), section)
        args_parser.set_defaults(
            **{
                k: v
                for k, v in filter(lambda x: hasattr(args, x[0]),
                                   default.items())
            })

    args = args_parser.parse_args()
    return args
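
A minimal, standalone sketch of the same pattern: values from an INI section override the argparse defaults, and only keys the parser already knows about are applied. The option names and INI content are hypothetical, and the type casting done by get_correct_args is skipped.

import argparse
from configparser import ConfigParser

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--lr', type=float, default=0.001)
args = parser.parse_args([])               # stands in for the earlier parse in read_config

cfg = ConfigParser()
cfg.read_string('[train]\nbatch_size = 64\nlr = 0.0005\n')

for section in cfg.sections():
    defaults = dict(cfg.items(section))    # values are plain strings here
    overrides = {k: v for k, v in defaults.items() if hasattr(args, k)}
    parser.set_defaults(**overrides)       # get_correct_args would also cast the types

args = parser.parse_args([])
print(args.batch_size, args.lr)            # 64 0.0005 (strings, since no casting is done)
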
Example #10
0
def main():
    trace('---train topics---', config.log_file)
    model = DtmModel(dtm_path,
                     corpus=gensim_data.corpus,
                     id2word=gensim_data.dictionary,
                     time_slices=train_set.time_slices[:-1],
                     num_topics=config.z_dim,
                     lda_sequence_min_iter=50,
                     lda_sequence_max_iter=config.epochs)
    trace('---model trained---', config.log_file)
    #
    sample_topic = model.dtm_coherence(time=0, num_words=10)
    trace('sample topic is like: {}'.format(' '.join(sample_topic[0])),
          config.log_file)

    #
    tw_nps = model.show_topics(num_topics=config.z_dim,
                               times=-1,
                               num_words=train_set.vocab_size(),
                               formatted=False)

    for t in range(T):
        # topics in time t
        tw_np = tw_nps[t * config.z_dim:(t + 1) * config.z_dim]

        tw_np = get_topic_np(tw_np, config.z_dim,
                             gensim_data.dictionary.token2id)
        tw_tensor = torch.from_numpy(tw_np)
        tw_list_t = get_tw_list(tw_tensor, gensim_data.dictionary)

        # coh
        cohs_t = get_cohs(tw_list_t)
        p = ppl(gensim_data.test, tw_tensor)

        TWmatrix.append(tw_np)
        TWlist.append(tw_list_t)
        COHs.append(cohs_t)
        PPLs.append(p)

        avg_COHs.append((sum(cohs_t) / len(cohs_t)))

        seg = '---------- topics in time {}/{} ----------'.format(t + 1, T)
        display_topics(tw_list=tw_list_t,
                       cohs=cohs_t,
                       head='topics',
                       seg=seg,
                       file=config.topic_file)
        trace('topic result(coherence) written.', file=config.log_file)

    p_file = os.path.join(config.output_path, 'ppl.jpg')
    draw_ppl(PPLs, title='perplexities over time', file=p_file)
    a_file = os.path.join(config.output_path, 'avg_coh.jpg')
    draw_ppl(avg_COHs, title='avg coherence over time', file=a_file)
Example #11
0
    def log(self, sent_number):
        """
        Log translation to stdout.
        """
        output = '\nINPUT {}: {}\n'.format(sent_number,
                                           " ".join(self.src_sent))

        best_pred = self.pred_sents[0]
        best_score = self.pred_scores[0]
        pred_sent = ' '.join(best_pred)
        output += 'PRED {}: {}\n'.format(sent_number, pred_sent)
        trace("PRED SCORE: {:.4f}".format(best_score))

        if self.gold_sent is not None:
            trg_sent = ' '.join(self.gold_sent)
            output += 'GOLD {}: {}\n'.format(sent_number, trg_sent)
            # output += ("GOLD SCORE: {:.4f}".format(self.gold_score))
            trace("GOLD SCORE: {:.4f}".format(self.gold_score))
        if len(self.pred_sents) > 1:
            trace('\nBEST HYP:')
            for score, sent in zip(self.pred_scores, self.pred_sents):
                output += "[{:.4f}] {}\n".format(score, sent)

        return output
Example #12
0
def main():
    args, parser = parse_args("translate")
    config = read_config(args, parser, args.config)
    config.batch_size = 1
    test_data_iter = DataBatchIterator(config=config,
                                       is_train=False,
                                       dataset="test",
                                       batch_size=config.batch_size)

    src_vocab = torch.load(config.save_vocab + "." + config.src_lang)
    trg_vocab = torch.load(config.save_vocab + "." + config.trg_lang)

    test_data_iter.set_vocab(src_vocab, trg_vocab)
    test_data_iter.load()

    checkpoint = torch.load(config.save_model + ".pt")
    # Load the model.
    model = model_factory(config,
                          src_vocab,
                          trg_vocab,
                          train_mode=False,
                          checkpoint=checkpoint)
    if config.verbose:
        trace(model)
    # File to write sentences to.
    pred_file = codecs.open(config.output + ".pred.txt", 'w', 'utf-8')
    ref_file = codecs.open(config.output + ".ref.txt", 'w', 'utf-8')
    src_file = codecs.open(config.output + ".src.txt", 'w', 'utf-8')
    # Sort batch by decreasing lengths of sentence required by pytorch.
    # sort=False means "Use dataset's sortkey instead of iterator's".

    # Translator
    scorer = GNMTGlobalScorer(config.alpha, config.beta,
                              config.coverage_penalty, config.length_penalty)
    translator = BatchTranslator(model,
                                 config,
                                 trg_vocab,
                                 global_scorer=scorer)

    data_iter = iter(test_data_iter)

    builder = TranslationBuilder(src_vocab, trg_vocab, config)

    # Statistics
    counter = count(1)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0

    pred_list = []
    gold_list = []
    for batch in data_iter:
        outputs = translator.translate_batch(batch)
        batch_trans = builder.from_batch_translator_output(outputs)

        for trans in batch_trans:
            pred_score_total += trans.pred_scores[0]
            pred_words_total += len(trans.pred_sents[0])
            pred_list.append(trans.pred_sents[0])

            gold_score_total += trans.gold_score
            gold_words_total += len(trans.gold_sent) + 1
            gold_list.append(trans.gold_sent)

            k_best_preds = [
                " ".join(pred) for pred in trans.pred_sents[:config.k_best]
            ]
            # print(" ".join(trans.gold_sent))
            pred_file.write('\n'.join(k_best_preds) + "\n")
            ref_file.write(" ".join(trans.gold_sent) + '\n')
            src_file.write(" ".join(trans.src_sent) + '\n')
            if config.verbose:
                sent_number = next(counter)
                output = trans.log(sent_number)
                os.write(1, output.encode('utf-8'))

                report_score('PRED', pred_score_total, pred_words_total)
                report_score('GOLD', gold_score_total, gold_words_total)
            if config.plot_attn:
                plot_attn(trans.src_sent, trans.pred_sents[0],
                          trans.attns[0].cpu())
            #break
        #break
    report_bleu(gold_list, pred_list)
    report_rouge(gold_list, pred_list)
Example #13
0
#        PPLs.append(p)
#    seg = '---------- topics in time {}/{} ----------'.format(t+1, T)
#    display_topics(tw_list=tw_list_t, cohs=None, coh_name='(ppl: {})'.format('%.4f' % p),
#                   head='topics', seg=seg,
#                   file = config.topic_file)

if __name__ == '__main__':

    # configuration
    args, parser = parse_args()
    config_file = 'config/tryconfig.ini'
    global config
    config = read_config(args, parser, config_file)

    s = 'Start running m_DTM.py \n {}\n'.format(str(config))
    trace(s, file=config.log_file, write='w')

    global dtm_path
    dtm_path = os.path.join(project_path, 'dtm/dtm/main')

    # make dataset
    train_set = Dataset(config)
    global T
    T = train_set.T()

    global TWmatrix, TWlist, COHs, PPLs, avg_COHs
    TWmatrix = []
    TWlist = []
    COHs = []
    PPLs = []
    avg_COHs = []
Example #14
0
def main(t, T, train_data, widget, whole_wtoi, whole_itow):
    
    # Create dict of distribution when opening file
#    edge_dist_dict, node_dist_dict, weights, nodedegrees, maxindex = makeDist(
#        config.graph_path, config.negativepower)
#    edges_alias_sampler = VoseAlias(edge_dist_dict)
#    nodes_alias_sampler = VoseAlias(node_dist_dict)     
    
    # choose graph type
    #model = choose_graph(config)
    model = config.graph
    print('model: ', model)
    # build input for graph embedding
    if model == 'TopicMap':
        graph_file = os.path.join(config.output_path, 'temp_graph_file_{}.txt'.format(t))
    elif model in ('LINEs', 'PyGCN', 'MyGCN'):
        graph_file = 'temp_graph_file_{}.txt'.format(t)
    
        
    geinput = build_geinput(model, train_data, whole_wtoi, graph_file)
    
    if model == 'TopicMap':
        centroids = 0
        git_path = os.path.abspath(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))
        topicmap_path = os.path.join(git_path, 'topicmapping/bin/topicmap')
        emb_dir = os.path.join(config.output_path, 'result_{}'.format(t))
        cmd = '{} -f {} -t 10 -o {}'.format(
                topicmap_path, graph_file, emb_dir)
        print(cmd)
        subprocess.call(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        #centroids = subp.stdout.readlines() 
        write_topicmap(config.topic_file, config.output_path, T)

    elif model == 'LINEs' or model == 'PyGCN':
        # get embedding
        emb_file = 'temp_emb_file.txt'
        run_cmd(config, model, graph_file, emb_file, geinput)
        nodes, nodes_i2w, word_emb = read_emb(emb_file)
        # clustering
        # only reuse the incoming widget as the cluster initialization when FLAG is set
        if not FLAG:
            widget = None
        centroids = cluster_ge(t, nodes, nodes_i2w, word_emb, emb_file, wtoi=whole_wtoi, itow=whole_itow, init_c=widget)
        os.remove(emb_file)
        
    elif model == 'MyGCN':
        A_matrix, X_matrix, cur_wtoi, cur_itow = geinput
        if FLAG:
            print('new shape of X_matrix is: {}'.format(X_matrix.shape))
            init_weight_11 = np.random.random((len(cur_wtoi), config.h_dim))
            a = 0  # words whose embedding is inherited from MemoryVD
            b = 0  # words left with their random initialization
            for word, cur_w_i in cur_wtoi.items():
                memory_w_v = MemoryVD[whole_wtoi[word]]
                if np.all(memory_w_v==0):
                    b+=1
                else:
                    a+=1
                    init_weight_11[cur_w_i] = memory_w_v
            s = "inherit {} word embeddings and random {}".format(a,b)
            trace(s, file=config.log_file)
            w12, w21 = widget
            init_weight = (init_weight_11, w12, w21)
        else:
#            init_weight = np.random.random((len(whole_wtoi), config.h_dim))
#            print(init_weight)
            init_weight = None
            
        N_, F_ = X_matrix.shape  # number of nodes, input feature dim
        D_ = config.h_dim        # hidden embedding dim
        print(N_, F_, D_)
            
        model_gcn = GCN_model(config, n_dim=N_, d_dim=D_, f_dim=F_,
                              init_weight_np=init_weight)
#        print(model_gcn.weight_11.data)
        model_gcn.to(config.device)
        optimizer = torch.optim.Adam(model_gcn.parameters(), lr=config.lr)
        model_gcn.train()
        for epoch in range(config.epochs):
            b = 5                   # 2*b sampled sub-batches per epoch
            k = math.ceil(N_ / b)   # rows per sub-batch
            for  batch in range(2*b):
                cur_a, chosen_idx = sample_from_matrix(A_matrix, k=k)
                cur_x, _ = sample_from_matrix(X_matrix, k=k, chosen_idx=chosen_idx)
                optimizer.zero_grad()
                cur_inputs = (torch.tensor(cur_a).to(config.device), torch.tensor(cur_x).to(config.device))
#                print(cur_inputs[0].dtype)
#                print(model_gcn.weight_11.data.dtype)
                rec, loss = model_gcn(cur_inputs)
                loss.backward()
                optimizer.step()
            cur_inputs = (torch.tensor(A_matrix).to(config.device), torch.tensor(X_matrix).to(config.device))
            optimizer.zero_grad()
            rec, loss = model_gcn(cur_inputs)
            loss.backward()
            optimizer.step()
            if epoch%10==0:
                s = "epoch:{}, loss:{}".format(epoch, loss)
                trace(s, file=config.log_file)
                
        # update memory VD (node/word embedding)
        nd_matrix, w12, w21 = model_gcn.get_widget()
#        print('nd_matrix\n', nd_matrix) 
        c = 0  # all-zero vectors skipped
        d = 0  # embeddings copied back into MemoryVD
        for cur_w_i, word in cur_itow.items():
            memory_w_i = whole_wtoi[word]
            cur_v = nd_matrix[cur_w_i]
            if np.all(cur_v==0):
                c += 1
            else:
                MemoryVD[memory_w_i] = cur_v
                d += 1
        trace('updated {} words from cur to memory ({} all-zero vectors skipped)'.format(d, c))
            
            
        # write emb_file
        emb_file = 'temp_emb_file.txt'
        with open(emb_file, 'w') as writer:
            writer.write('{} {}\n'.format(N_, config.h_dim))  # header: node count and embedding dim
            for row_i, row in enumerate(nd_matrix):
                word = cur_itow[row_i]
                vector_str = [str(item) for item in row]
                s = "{} {}\n".format(word, ' '.join(vector_str))
                writer.write(s)  
            
        # clustering
        nodes, nodes_i2w, word_emb = read_emb(emb_file)
        centroids = cluster_ge(t, nodes, nodes_i2w, word_emb, emb_file, wtoi=whole_wtoi, itow=whole_itow, init_c=widget)
        return (w12, w21)
    #os.remove(graph_file)
    return centroids
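
A standalone sketch of the "inherit embeddings from memory" initialization used in the MyGCN branch above; MemoryVD, whole_wtoi, cur_wtoi and h_dim are toy stand-ins here.

import numpy as np

h_dim = 4
whole_wtoi = {'alpha': 0, 'beta': 1, 'gamma': 2}
cur_wtoi = {'beta': 0, 'gamma': 1}
MemoryVD = np.zeros((len(whole_wtoi), h_dim))
MemoryVD[1] = np.ones(h_dim)               # 'beta' already has a learned embedding

init_weight = np.random.random((len(cur_wtoi), h_dim))
inherited, random_init = 0, 0
for word, cur_i in cur_wtoi.items():
    memory_vec = MemoryVD[whole_wtoi[word]]
    if np.all(memory_vec == 0):
        random_init += 1                   # nothing learned yet, keep random init
    else:
        init_weight[cur_i] = memory_vec
        inherited += 1
print('inherit {} word embeddings and random {}'.format(inherited, random_init))
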
Example #15
0
        centroids = cluster_ge(t, nodes, nodes_i2w, word_emb, emb_file, wtoi=whole_wtoi, itow=whole_itow, init_c=widget)
        return (w12, w21)
    #os.remove(graph_file)
    return centroids
    
    

if __name__ == '__main__':
    
    # configuration
    args, parser = parse_args()
    config_file = 'config/tryconfig.ini'
    global config
    config = read_config(args, parser, config_file)
    s = 'Start running ges.py \n {}\n'.format(str(config))
    trace(s,file=config.log_file, write='w')
    global T
    train_set = Dataset(config)
    T = train_set.T()
    global TWmatrix, TWlist, COHs, PPLs, avg_COHs
    TWmatrix = []
    TWlist = []
    COHs = []
    PPLs = []
    avg_COHs = []
    
    name = Name(flag=config.flag, config=config,
                model_name=config.graph, data_name=config.train_file, time_slices=train_set.time_slices[:-1])
    result = Result(info=name, TWmatrix=TWmatrix, itow=train_set.itow,
                    twlist=TWlist, COHs=COHs, PPLs=PPLs)
    result_file = os.path.join(config.output_path, 'result')

    if config.graph == 'MyGCN':
        global MemoryVD, V
Example #16
0
 def read(self, file, dataname='arxiv', prepcs=False):
     if dataname == 'arxiv':
         with open(file, 'r') as reader:
             if prepcs:
                 pass
             else:
                 line_counter = 0
                 valid_counter = 0
                 for line in reader:
                     line_counter += 1
                     tokens = line.strip().split('\t')
                     time = tokens[0]
                     text = tokens[-1]
                     time = to_datetime(time, dataname='arxiv')
                     if time is None:
                         continue
                     if time < self.set_start or time > self.set_end:
                         continue
                     text, wc = cleanStr(text, self.config.deli)
                     if wc < self.config.min_num_words:
                         continue
                     self.times.append(time)
                     self.docs.append(text)  
                     self.counter.update(text.strip().split())
                     valid_counter += 1
                     if time < self.start_date:
                         self.start_date = time
                     elif time > self.end_date:
                         self.end_date = time                    
         # reader is closed automatically by the 'with' block
         info = 'extract {}/{} lines from {}'.format(valid_counter, line_counter, file)
         trace(info, self.config.log_file)
         
     elif dataname == 'care':
         with open(file, 'r') as reader:
             if prepcs:
                 pass
             else:
                 line_counter = 0
                 valid_counter = 0
                 for line in reader:
                     line_counter += 1
                     tokens = line.strip().split('\t')
                     time = tokens[0]
                     if len(tokens) == 2:
                         text = tokens[-1]
                     elif len(tokens) == 3:
                         text = tokens[1] + ' ' + tokens[2]
                     else:
                         continue
                     time = to_datetime(time, dataname='care')
                     if time is None:
                         continue
                     if time < self.set_start or time > self.set_end:
                         continue
                     text, wc = cleanStr(text)
                     if wc < self.config.min_num_words:
                         continue
                     self.times.append(time)
                     self.docs.append(text)  
                     self.counter.update(text.strip().split())
                     valid_counter += 1
                     if time < self.start_date:
                         self.start_date = time
                     elif time > self.end_date:
                         self.end_date = time    
         # reader is closed automatically by the 'with' block
         info = 'extract {}/{} lines from {}'.format(valid_counter, line_counter, file)
         trace(info, self.config.log_file)           
         
     else:
         info = 'unrecognized dataname argument: {}'.format(dataname)
         trace(info, self.config.log_file)