Example #1
0
File: wiki.py  Project: OPI-LIL/word2vec
    # Check and process command-line arguments into a dict of options.
    args = parse_args(sys.argv[1:])

    # Fail fast if either required argument is missing.
    for arg in ['input', 'output']:
        if not arg in args:
            logger.error('Argument ' + arg + ' is missing')
            sys.exit(1)

    # Unpack all options; NOTE(review): `input` and `output` shadow the
    # Python builtins of the same name within this scope.
    limit, workers, min_count, size, input, output = args['limit'], args['workers'], args['min_count'], \
                                                     args['size'], args['input'], args['output']

    # Log the effective configuration.
    # NOTE(review): `iteritems()` is Python 2 only — under Python 3 this
    # would raise AttributeError; use `items()` there.
    logger.info('Training with: ' + ' '.join([k + " : " + str(v) for k, v in args.iteritems()]))

    # Build the sentence stream for training.
    # NOTE(review): `slice` here appears to be a project helper that caps
    # the stream at `limit` sentences (it shadows the builtin) — confirm.
    sentences = slice(LineSentence(input), limit);

    try:
        # Train the word2vec model on the (possibly limited) sentence stream.
        model = Word2Vec(sentences, min_count=min_count, workers=workers, size=size)

        # Persist the trained model; the limit is appended to the filename
        # so runs with different limits do not overwrite each other.
        logger.info('Saving model to file')
        model.save(output + "_" + str(limit))

        logger.info('Model has been saved.')

    except Exception as exc:
        # Broad catch at the top-level boundary: log the full traceback
        # rather than crashing the script.
        logger.exception('Exception in model training: ' + str(exc))

    logger.info('Done!')
Example #2
0
def train_net(net,
              epochs=5,
              batch_size=1,
              lr=0.1,
              val_percent=0.05,
              save_cp=True,
              gpu=False,
              epoch_size=10,
              window=512,
              obs_size=10):
    """Train *net* on windowed solar observations with SGD + BCE loss.

    For each epoch, every observation from the data loader is windowed
    and filtered down to the best `obs_size` slices, then consumed in
    mini-batches of `batch_size`. Optionally saves a checkpoint per
    epoch under `checkpoints/`.
    """
    print('''Starting training:
                Epochs: {}
                Batch : {}
                Learning rate: {}
                Checkpoints: {}
                CUDA: {}
            '''.format(epochs, batch_size, lr, str(save_cp), str(gpu)))

    dir_checkpoint = 'checkpoints/'

    # One epoch draws `epoch_size` observations from the SIDC catalogue.
    dataset = HelioDataset('data/SIDC_dataset.csv', 'data/fenyi', epoch_size)
    loader = DataLoader(dataset)

    # Plain SGD with momentum; binary cross-entropy over flattened masks.
    optimizer = optim.SGD(net.parameters(),
                          lr=lr,
                          momentum=0.9,
                          weight_decay=0.0005)
    criterion = nn.BCELoss()

    for epoch in range(epochs):
        print('Starting epoch {}/{}.'.format(epoch + 1, epochs))
        net.train()

        for obs in loader:
            # Window the observation with 50% overlap and keep only the
            # `obs_size` best slices (slice/keep_best are project helpers
            # — exact semantics assumed from their call sites).
            obs = keep_best(slice(obs, window, window // 2), obs_size)

            total = len(obs['imgs'])
            for start in range(0, total, batch_size):
                stop = start + batch_size
                imgs = obs['imgs'][start:stop].float()
                true_masks = obs['masks'][start:stop].float()

                if gpu:
                    imgs = imgs.cuda()
                    true_masks = true_masks.cuda()

                # Forward pass; flatten both prediction and target for BCE.
                loss = criterion(net(imgs).view(-1), true_masks.view(-1))

                print(int(start), '-', int(stop),
                      '> loss: {0:.6f}'.format(loss.item()))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        print('Epoch finished!')

        if save_cp:
            # One checkpoint per epoch, numbered from 1.
            torch.save(net.state_dict(),
                       dir_checkpoint + 'CP512-{}.pth'.format(epoch + 1))
            print('Checkpoint {} saved !'.format(epoch + 1))
Example #3
0
if __name__ == '__main__':

    # Set up logging under a logger named after this script.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running: %s", ' '.join(sys.argv))

    # Check and process input arguments.
    args = parse_args(sys.argv[1:])

    # Fail fast on any missing required argument. 'output' and 'limit'
    # are read below, so validate them too — previously only 'input' was
    # checked and a missing 'output'/'limit' died with a bare KeyError.
    for arg in ('input', 'output', 'limit'):
        if arg not in args:
            logger.error("No %s given!" % arg)
            sys.exit(1)

    # Unpack the validated arguments.
    inp, outp, limit = args['input'], args['output'], args['limit']

    # Prepare the corpus stream.
    # NOTE(review): `slice` appears to be a project helper that caps the
    # text stream at `limit` articles (it shadows the builtin) — confirm.
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    texts = slice(wiki.get_texts(), limit)

    # Write one space-joined article per line. The context manager
    # guarantees the output file is closed even if iteration raises
    # (the original leaked the handle on error).
    space = " "
    with open(outp, 'w') as output:
        iterate_with_logging(logger, 10000, texts,
                             lambda text: output.write(space.join(text) + "\n"))