import sys
from PyRouge.Rouge import Rouge

rouge = Rouge.Rouge()


def evaluate(ref_file, pred_file):
    print(ref_file)
    print(pred_file)

    all_pred = []
    all_ref = []

    with open(pred_file, 'r', encoding='utf-8') as pred_reader, \
            open(ref_file, 'r', encoding='utf-8') as ref_reader:
        for src_line, tgt_line in zip(pred_reader, ref_reader):
            src_line = src_line.strip()
            tgt_line = tgt_line.strip()
            sp = src_line.split("\t")
            pred = sp[1]
            tgt_sents = tgt_line.split("##SENT##")
            all_pred.append(pred)
            all_ref.append(' '.join(tgt_sents))

    score = rouge.compute_rouge(all_ref, all_pred)
    print(score)


def main():
    pred_file = r'foo'  # placeholder path
    ref_file = r'bar'   # placeholder path
    evaluate(ref_file, pred_file)


if __name__ == '__main__':
    main()
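The parsing logic above implies a specific input layout: each prediction line is TAB-separated with the summary text in the second field, and each reference line joins its sentences with the literal marker ##SENT##. A minimal sketch with hypothetical toy files:

# Hypothetical toy inputs matching the format evaluate() parses above:
# prediction lines are "<id>\t<summary>", reference lines join sentences with ##SENT##.
with open('toy.pred.txt', 'w', encoding='utf-8') as f:
    f.write('0\tthe cat sat on the mat\n')
with open('toy.ref.txt', 'w', encoding='utf-8') as f:
    f.write('a cat sat on a mat ##SENT## it was a sunny day\n')

evaluate('toy.ref.txt', 'toy.pred.txt')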
Example #2
File: train.py  Project: szxSpark/KGFEs2s
    }
    save_model_path = 'model'
    if opt.save_path:
        if not os.path.exists(opt.save_path):
            os.makedirs(opt.save_path)
        save_model_path = opt.save_path + os.path.sep + save_model_path
    if metric is not None:
        torch.save(checkpoint,
                   '{0}_devRouge_{1}_{2}_e{3}.pt'.format(save_model_path, round(metric[0], 4), round(metric[1], 4),
                                                         epoch))
    else:
        torch.save(checkpoint, '{0}_e{1}.pt'.format(save_model_path, epoch))

evalModelCount = 0
totalBatchCount = 0
rouge_calculator = Rouge.Rouge()

def evalModel(translator, evalData):
    global evalModelCount
    global rouge_calculator
    evalModelCount += 1
    ofn = 'dev.out.{0}.txt'.format(evalModelCount)
    if opt.save_path:
        ofn = os.path.join(opt.save_path, ofn)
    predict, gold = [], []
    processed_data, raw_data = evalData
    # (list(Dataset), list((src_batch, tgt_batch)))
    for batch, raw_batch in tqdm(zip(processed_data, raw_data)):
        # (wrap(srcBatch), lengths), (wrap(tgtBatch), ), indices

        src, tgt, indices = batch[0]
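As a quick illustration of the checkpoint naming scheme used above (the numbers are made up):

# Made-up values showing the two checkpoint filename formats used above.
print('{0}_devRouge_{1}_{2}_e{3}.pt'.format('model', round(0.38119, 4), round(0.17461, 4), 3))
# -> model_devRouge_0.3812_0.1746_e3.pt
print('{0}_e{1}.pt'.format('model', 3))
# -> model_e3.pt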
Example #3
                break
        data = summarizer.buildData(
            src_batch, src_raw, tgt_raw, None, None
        )  # call the summarizer's buildData, which pairs the buffered src and tgt batches
        dataset.append(data)  # and append the result so all batches form a single dataset
        src_batch, tgt_batch = [], []
        src_raw, tgt_raw = [], []
    srcF.close()
    tgtF.close()
    return dataset


evalModelCount = 0
totalBatchCount = 0
rouge_calculator = Rouge.Rouge(
    use_ngram_buf=True
)  # initialize the ROUGE calculator with its n-gram buffer (cache) enabled


def evalModel(
    model, summarizer, evalData
):  # evaluate the model by scoring its predicted summaries against the gold summaries
    global evalModelCount
    global rouge_calculator
    evalModelCount += 1
    predict, gold = [], []
    predict_id = []
    dataset = evalData
    for data in dataset:
        """
        input: (wrap(srcBatch), lengths, doc_lengths), (src_raw,), \
Example #4
    local_sort_sentence, get_fetch_idx
from config import CONFIG as conf
from data_loader import get_train_dev_test_data, read_oracle, read_target_txt
import torch.nn as nn
from PyRouge.Rouge import Rouge
from rouge_score import compute_rouge_score, rouge_eval

batch_size = conf['batch_size']
device = conf['device']
model_path = conf['model_path']
random_seed = conf['random_seed']
exp_name = conf['exp_name']
model_to_load = conf['load_model_path']
mask_pro = conf['mask_pro']
loss_margin = conf['loss_margin']
rouge_calculator = Rouge.Rouge(use_ngram_buf=True)


def compute_score(outs, pool_sent_embeds, masks):
    cos = nn.CosineSimilarity(dim=-1)
    #print(outs)
    #print(pool_sent_embeds)
    all_pos_scores = []
    all_neg_scores = []
    num_corrects = 0
    num_samples = 0
    for i, mask in enumerate(masks):
        mask_idx = torch.arange(len(mask))[mask].long()
        mask_size = len(mask_idx)
        if mask_size > 0:
            mask_pos_out = outs[i][mask_idx]
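The mask handling in compute_score relies on boolean indexing: torch.arange(len(mask))[mask] keeps only the positions where the mask is True. A small self-contained sketch of that pattern (tensor values are arbitrary):

import torch

# boolean-mask indexing as used in compute_score above
mask = torch.tensor([True, False, True, True, False])
mask_idx = torch.arange(len(mask))[mask].long()  # tensor([0, 2, 3])
outs_i = torch.randn(5, 8)                       # stands in for one row of `outs`
mask_pos_out = outs_i[mask_idx]                  # keeps only the masked positions, shape (3, 8)
print(mask_idx, mask_pos_out.shape)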
Example #5
def train(config, device):
    logger = getlogger(config.train_log)
    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char2idx_file, "r") as fh:
        char_dict = json.load(fh)

    with open(config.train_title, 'r') as fh:
        temp = fh.readlines()
    rouge_calculator = Rouge.Rouge()
    with open(config.dev_title, 'r') as fh:
        dev_title = fh.readlines()

    logger.info("Building model...")

    train_dataset = Dataset(config.train_token_file, config)
    train_it_num = len(train_dataset) // config.batch_size
    dev_dataset = Dataset(config.dev_token_file, config, train=False)

    dev_it_num = len(dev_dataset) // config.val_batch_size

    char_vocab_size = len(char_dict)
    del char_dict
    model = FastRerank(config.char_dim, char_vocab_size, config.word_len,
                       config.glove_dim, word_mat, config.emb_dim,
                       config.kernel_size, config.encoder_block_num,
                       config.model_block_num).to(device)

    if config.model:
        model.load_state_dict(
            torch.load(os.path.join(config.save_dir, config.model)))

    model.train()
    # materialize as a list so the same parameters can be re-used for gradient clipping below
    parameters = [param for param in model.parameters() if param.requires_grad]

    optimizer = optim.Adam(weight_decay=config.L2_norm,
                           params=parameters,
                           lr=config.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=10000,
                                                gamma=0.1)

    loss_func = torch.nn.BCEWithLogitsLoss()

    steps = 0
    patience = 0
    losses = 0
    min_loss = 10000
    start_time = time.time()

    for epoch in range(config.epochs):
        batches = train_dataset.gen_batches(config.batch_size, shuffle=True)
        for batch in tqdm(batches, total=train_it_num):
            optimizer.zero_grad()
            (contex_word, contex_char, template_word, template_char, scores,
             ids, contex_mask, template_mask, art_id) = batch
            contex_word, contex_char, template_word, template_char = contex_word.to(
                device), contex_char.to(device), template_word.to(
                    device), template_char.to(device)
            contex_mask, template_mask, scores = contex_mask.to(
                device), template_mask.to(device), scores.to(device)
            p = model(contex_word, contex_char, template_word, template_char,
                      contex_mask, template_mask)

            loss = loss_func(p, scores)
            losses += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(parameters, config.grad_clip)
            optimizer.step()
            scheduler.step()  # step the scheduler after the optimizer (required order in PyTorch >= 1.1)

            if (steps + 1) % config.checkpoint == 0:

                losses = losses / config.checkpoint
                log_ = 'iteration {} train loss {}\n'.format(steps, losses)
                logger.info(log_)
                losses = 0
                batches = dev_dataset.gen_batches(config.val_batch_size,
                                                  shuffle=False)
                template = []
                for batch in tqdm(batches, total=dev_it_num):
                    (contex_word, contex_char, template_word, template_char,
                     scores, ids, contex_mask, template_mask, art_id) = batch
                    contex_word, contex_char, template_word, template_char = contex_word.to(
                        device), contex_char.to(device), template_word.to(
                            device), template_char.to(device)
                    contex_mask, template_mask, scores = contex_mask.to(
                        device), template_mask.to(device), scores.to(device)

                    p = model(
                        contex_word,
                        contex_char,
                        template_word,
                        template_char,
                        contex_mask,
                        template_mask,
                    )
                    loss = loss_func(p, scores)
                    losses += loss.item()
                losses /= dev_it_num
                log_ = 'iteration {} dev loss {}\n'.format(steps, losses)
                logger.info(log_)

                if losses < min_loss:
                    patience = 0
                    min_loss = losses
                    fn = os.path.join(config.save_dir,
                                      "model_{}.pkl".format(min_loss))
                    torch.save(model.state_dict(), fn)
                else:
                    patience += 1
                    if patience > config.early_stop:
                        print(
                            'early stopping because val loss keeps increasing!'
                        )
                        end_time = time.time()
                        logger.info("total training time {}".format(end_time -
                                                                    start_time))
                        exit()
                losses = 0

            steps += 1
    fn = os.path.join(config.save_dir, "model_final.pkl")
    torch.save(model.state_dict(), fn)
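For reference, a minimal sketch of how the rouge_calculator created above is typically invoked, following the compute_rouge(references, predictions) call pattern from the first example (the toy strings are illustrative only):

# Minimal usage sketch following the compute_rouge(refs, preds) call from Example #1.
refs = ['the cat sat on the mat', 'a quick brown fox jumps over the lazy dog']
preds = ['the cat is on the mat', 'a fast brown fox jumped over the lazy dog']
print(rouge_calculator.compute_rouge(refs, preds))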