Example #1
def main():
    from pprint import pprint
    from tqdm import tqdm
    from torch.utils.data import DataLoader
    from src.config import load_parser
    # Project-local helpers used below; these module paths are assumed.
    from src.twitter_dataset import TwitterDatasetChunk
    from src.utils import get_overlapping_data_files

    parser = load_parser()
    args, unknown = parser.parse_known_args()
    args = vars(args)

    args_to_print = {name: args[name] for name in args if "files" not in name}
    pprint(args_to_print)
    pprint(unknown)

    # Default the headers to None so the later references don't raise a
    # NameError when the corresponding flags are omitted.
    data_header = None
    label_header = None
    if args['data_header']:
        with open(args['data_header']) as f:
            data_header = f.readline().strip().split(",")
    if args['label_header']:
        with open(args['label_header']) as f:
            label_header = f.readline().strip().split(",")

    train_data_files = args['train_data_files']
    train_image_files = args['train_image_files']
    train_text_files = args['train_text_files']
    train_label_files = args['train_label_files']
    key = args['key']

    user_size = args['user_size']
    text_size = args['text_size']
    image_size = args['image_size']
    dummy_user_vector = args['dummy_user_vector']
    shuffle = args['shuffle']
    batch_size = args['batch_size']
    num_workers = args['num_workers']

    # Note: the [6:] slices replace only the tail of each list; to align the
    # full lists (as Example #2 does), drop the slices.
    train_data_files[6:], train_image_files[6:], train_text_files[6:], \
        train_label_files[6:] = get_overlapping_data_files(
            train_data_files, train_image_files, train_text_files,
            train_label_files)

    for train_data_file, train_image_file, train_text_file, train_label_file in tqdm(
            zip(train_data_files, train_image_files, train_text_files,
                train_label_files),
            desc="files"):
        dataset = TwitterDatasetChunk(data_file=train_data_file,
                                      image_file=train_image_file,
                                      text_file=train_text_file,
                                      label_file=train_label_file,
                                      key=key,
                                      data_header=data_header,
                                      label_header=label_header,
                                      user_size=user_size,
                                      text_size=text_size,
                                      image_size=image_size,
                                      dummy_user_vector=dummy_user_vector)
        dataloader = DataLoader(dataset,
                                batch_size=batch_size,
                                shuffle=shuffle,
                                num_workers=num_workers)

        # Drain every batch once as a smoke test of the loading pipeline.
        for batch in tqdm(dataloader, desc='dataloader'):
            pass
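
For reference, get_overlapping_data_files is not shown in these examples. Below is a minimal sketch of what it plausibly does, assuming each filename carries a numeric chunk id and that only chunks present in all four lists should be kept; the helper name comes from the examples, everything else here is an assumption.

import os
import re

def get_overlapping_data_files(data_files, image_files, text_files, label_files):
    # Sketch only: align the four lists on a numeric chunk id embedded in
    # each filename and keep the ids present in all four lists. The real
    # implementation in src may align files differently.
    def index(files):
        out = {}
        for path in files:
            match = re.search(r"(\d+)", os.path.basename(path))
            if match:
                out[int(match.group(1))] = path
        return out

    indexed = [index(fs) for fs in (data_files, image_files,
                                    text_files, label_files)]
    common = sorted(set.intersection(*(set(d) for d in indexed)))
    return tuple([d[c] for c in common] for d in indexed)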
Example #2
def main():
    from pprint import pprint
    from tqdm import tqdm
    from src.config import load_parser
    # Project-local helpers used below; these module paths are assumed.
    from src.twitter_dataset import TwitterDataloader
    from src.utils import get_overlapping_data_files

    parser = load_parser()
    args, unknown = parser.parse_known_args()
    args = vars(args)

    args_to_print = {name: args[name] for name in args if "files" not in name}
    pprint(args_to_print)
    pprint(unknown)

    # Default the headers to None so the later references don't raise a
    # NameError when the corresponding flags are omitted.
    data_header = None
    label_header = None
    if args['data_header']:
        with open(args['data_header']) as f:
            data_header = f.readline().strip().split(",")
    if args['label_header']:
        with open(args['label_header']) as f:
            label_header = f.readline().strip().split(",")

    train_data_files = args['train_data_files']
    train_image_files = args['train_image_files']
    train_text_files = args['train_text_files']
    train_label_files = args['train_label_files']
    key = args['key']

    user_size = args['user_size']
    text_size = args['text_size']
    image_size = args['image_size']
    dummy_user_vector = args['dummy_user_vector']
    shuffle = args['shuffle']
    batch_size = args['batch_size']
    num_workers = args['num_workers']

    train_data_files, train_image_files, train_text_files, train_label_files = get_overlapping_data_files(
        train_data_files, train_image_files, train_text_files,
        train_label_files)

    data_loader = TwitterDataloader(
        data_files=train_data_files,
        image_files=train_image_files,
        text_files=train_text_files,
        label_files=train_label_files,
        key=key,
        data_header=data_header,
        label_header=label_header,
        user_size=user_size,
        text_size=text_size,
        image_size=image_size,
        dummy_user_vector=dummy_user_vector,
        shuffle=shuffle,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    # Drain every batch once as a smoke test of the chained loader.
    for batch in tqdm(data_loader):
        pass
    print("Finished going through all files...")
Example #3
def main():
    from pprint import pprint
    from src.config import load_parser
    from src.models.user_model import UserModel

    parser = load_parser()
    args, unknown = parser.parse_known_args()
    args = vars(args)

    model = UserModel(user_size=args['user_size'],
                      hidden_size=args['hidden_size'],
                      joint_embedding_size=args['joint_embedding_size'])
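
A cheap sanity check that could follow the construction above; it works for any nn.Module, so it assumes nothing about UserModel's forward signature.

    # Count trainable parameters as a quick smoke test of the model.
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("UserModel has {:,} trainable parameters".format(n_params))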
Example #4
def main():
    from pprint import pprint
    from src.config import load_parser
    from src.models.content_model import ContentModel

    parser = load_parser()
    args, unknown = parser.parse_known_args()
    args = vars(args)

    model = ContentModel(
        image_embed_size=args['image_size'],
        text_embed_size=args['text_size'],
        hidden_size=args['hidden_size'],
        joint_embedding_size=args['joint_embedding_size'])
Example #5
def main():
    from tqdm import tqdm
    from src.config import load_parser
    # Project-local import; this module path is assumed.
    from src.twitter_dataset import TwitterDatasetChunk

    parser = load_parser()
    args, unknown = parser.parse_known_args()
    args = vars(args)

    # Default the headers to None so the later references don't raise a
    # NameError when the corresponding flags are omitted.
    data_header = None
    label_header = None
    if args['data_header']:
        with open(args['data_header']) as f:
            data_header = f.readline().strip().split(",")
    if args['label_header']:
        with open(args['label_header']) as f:
            label_header = f.readline().strip().split(",")

    data_files = args['valid_data_files']
    image_files = args['valid_image_files']
    text_files = args['valid_text_files']
    label_files = args['valid_label_files']
    key = args['key']
    user_size = args['user_size']
    text_size = args['text_size']
    image_size = args['image_size']
    dummy_user_vector = args['dummy_user_vector']

    for data_file, image_file, text_file, label_file in tqdm(
            zip(data_files, image_files, text_files, label_files)):
        dataset = TwitterDatasetChunk(data_file=data_file,
                                      image_file=image_file,
                                      text_file=text_file,
                                      label_file=label_file,
                                      key=key,
                                      data_header=data_header,
                                      label_header=label_header,
                                      user_size=user_size,
                                      text_size=text_size,
                                      image_size=image_size,
                                      dummy_user_vector=dummy_user_vector)
        if not len(dataset):
            # Drop into an interactive debugger on any empty chunk
            # (requires the ipdb package).
            import ipdb
            ipdb.set_trace()
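
The ipdb drop-in above stalls unattended runs. A non-interactive sketch of the same check: a hypothetical helper that takes the constructed datasets and their source filenames and returns the files whose dataset is empty, so the run can finish and report at the end.

def find_empty_chunks(datasets, names):
    # Sketch: collect empty chunks for reporting instead of stopping in a
    # debugger.
    return [name for dataset, name in zip(datasets, names)
            if not len(dataset)]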
Example #6
def main():
    from pprint import pprint
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from src.config import load_parser
    # Project-local helpers used below; these module paths are assumed.
    from src.trainer import Trainer
    from src.utils import get_overlapping_data_files

    parser = load_parser()
    args, unknown = parser.parse_known_args()
    args = vars(args)

    args_to_print = {name: args[name] for name in args if "files" not in name}
    pprint(args_to_print)
    pprint(unknown)

    # Default the headers to None so the later references don't raise a
    # NameError when the corresponding flags are omitted.
    data_header = None
    label_header = None
    if args['data_header']:
        with open(args['data_header']) as f:
            data_header = f.readline().strip().split(",")
    if args['label_header']:
        with open(args['label_header']) as f:
            label_header = f.readline().strip().split(",")

    train_data_files = args['train_data_files']
    train_image_files = args['train_image_files']
    train_text_files = args['train_text_files']
    train_label_files = args['train_label_files']

    valid_data_files = args['valid_data_files']
    valid_image_files = args['valid_image_files']
    valid_text_files = args['valid_text_files']
    valid_label_files = args['valid_label_files']

    key = args['key']
    seed = args['seed']

    user_size = args['user_size']
    text_size = args['text_size']
    image_size = args['image_size']
    dummy_user_vector = args['dummy_user_vector']
    shuffle = args['shuffle']
    batch_size = args['batch_size']
    num_workers = args['num_workers']

    micro_lambda = args['micro_lambda']
    macro_lambda = args['macro_lambda']
    max_epoch = args['epochs']
    log_dir = args['log_dir']
    checkpoint_file = args['checkpoint']
    verbosity = args['verbosity']
    save_frequency = args['save_frequency']

    if args['align_files']:
        train_data_files, train_image_files, train_text_files, train_label_files = get_overlapping_data_files(
            train_data_files, train_image_files, train_text_files,
            train_label_files)
        valid_data_files, valid_image_files, valid_text_files, valid_label_files = get_overlapping_data_files(
            valid_data_files, valid_image_files, valid_text_files,
            valid_label_files)

    device = torch.device("cpu" if (
        args['no_cuda'] or not torch.cuda.is_available()) else "cuda")

    if args['user_only'] and args['content_only']:
        raise ValueError("user_only and content_only are mutually exclusive")
    elif args['user_only']:
        from src.models.user_model import UserModel
        model = UserModel(user_size=args['user_size'],
                          hidden_size=args['hidden_size'],
                          joint_embedding_size=args['joint_embedding_size'])
    elif args['content_only']:
        from src.models.content_model import ContentModel
        model = ContentModel(image_embed_size=args['image_size'],
                             text_embed_size=args['text_size'],
                             hidden_size=args['hidden_size'],
                             joint_embedding_size=args['joint_embedding_size'])

    else:
        from src.models.feature_model import FeatureModel
        model = FeatureModel(user_size=args['user_size'],
                             image_embed_size=args['image_size'],
                             text_embed_size=args['text_size'],
                             hidden_size=args['hidden_size'],
                             joint_embedding_size=args['joint_embedding_size'])

    if (torch.cuda.device_count() > 1) and args['all_gpu']:
        print("Using %d GPUS!" % torch.cuda.device_count())
        model = nn.DataParallel(model)

    model = model.to(device)

    print(model)

    optimizer = optim.Adam(model.parameters(), lr=args['learning_rate'])

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        device=device,
        key=key,
        data_header=data_header,
        label_header=label_header,
        user_size=user_size,
        text_size=text_size,
        image_size=image_size,
        dummy_user_vector=dummy_user_vector,
        user_only=args['user_only'],
        content_only=args['content_only'],
        seed=seed,
        micro_lambda=micro_lambda,
        macro_lambda=macro_lambda,
        max_epoch=max_epoch,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        log_dir=log_dir,
        checkpoint_file=checkpoint_file,
        verbosity=verbosity,
        save_frequency=save_frequency,
    )

    trainer.train(train_data_files, train_image_files, train_text_files,
                  train_label_files, valid_data_files, valid_image_files,
                  valid_text_files, valid_label_files)
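
None of the examples actually invoke main(); each script presumably ends with the standard entry-point guard:

if __name__ == "__main__":
    main()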