# Example #1
def _build_exp_name(args):
    """Build the experiment name from model, features, embedding and classifier.

    Returns (exp_name, run_pkg) where run_pkg is 'pytorch' for neural
    classifiers ('cnn', 'ffn') and 'sklearn' otherwise.
    """
    run_pkg = 'pytorch' if args.classifier_type in ('cnn', 'ffn') else 'sklearn'
    # text_emb_type takes precedence over post_emb_type when set
    emb_type_name = args.text_emb_type or args.post_emb_type
    name_parts = [
        args.model_name,
        args.features.replace(',', '+'), emb_type_name, args.classifier_type
    ]
    exp_name = '_'.join(n for n in name_parts if n is not None).strip('_')
    return exp_name, run_pkg


def _load_embeddings(args):
    """Load any trained embedding models the run settings require.

    Returns (word_embs, graph_embs, sent_embs); each entry is None when the
    corresponding embeddings are not needed (e.g. unigram/tag features).
    """
    # TODO: Move the loading of things elsewhere (load_embeddings)
    text_is_plain = args.text_emb_type in ('unigrams', None)
    post_is_plain = args.post_emb_type in ('unigrams', 'tags', None)
    if text_is_plain and post_is_plain:
        return None, None, None

    print("Loading embeddings...")
    emb_loader = EmbeddingLoader(args.post_emb_type, args.text_emb_type)
    load_word_embs = args.post_emb_type != 'unigrams'
    load_graph_embs = 'graph' in args.features
    load_sent_embs = ('text' in args.features
                      and args.text_emb_type in ('fasttext', 'bert'))
    emb_loader.load(word_embs=load_word_embs,
                    graph_embs=load_graph_embs,
                    sent_embs=load_sent_embs)
    graph_embs = emb_loader.graph_embs if load_graph_embs else None
    sent_embs = emb_loader.sent_embs if load_sent_embs else None
    # word_embs is always taken from the loader, mirroring the load() call
    return emb_loader.word_embs, graph_embs, sent_embs


def _load_dataset(args, word_embs):
    """Load the dataset and apply user/word filters for this run."""
    print("Loading and filtering data...")
    dataset = Dataset()
    dataset.load(args.data_location, args.task)
    if args.load_preprocessed:
        id2token = load_pickle(args.load_preprocessed)
        # NOTE(review): word_embs is None when both emb types are plain
        # (unigrams/tags), so word_embs.wv would crash here — confirm
        # --load_preprocessed is only used with trained embeddings.
        dataset.filter(user_ids=set(id2token),
                       word_filter=word_embs.wv,
                       word_filter_min=args.word_filter_min,
                       preprocessed_descs=id2token)
    elif args.text_emb_type and args.post_emb_type and \
            args.text_emb_type != 'unigrams' and args.post_emb_type != 'unigrams':
        dataset.filter(word_filter=word_embs.wv,
                       word_filter_min=args.word_filter_min)
    elif 'comms' in args.features:
        dataset.load_filter_communities()
    return dataset


def _extract_features(args, run_pkg, dataset, word_embs, graph_embs, sent_embs):
    """Build the backend-appropriate FeatureExtractor and extract features.

    Returns (extractor, transformed_dataset).
    """
    print("Extracting features...")
    if run_pkg == 'pytorch':
        extractor = FeatureExtractor(args.features,
                                     word_embs=word_embs,
                                     graph_embs=graph_embs,
                                     sent_embs=sent_embs,
                                     word_inds=True,
                                     padding_size=30)
    else:
        extractor = FeatureExtractor(args.features,
                                     word_embs=word_embs,
                                     graph_embs=graph_embs,
                                     sent_embs=sent_embs,
                                     post_ngrams=args.post_emb_type == 'unigrams',
                                     post_tags=args.post_emb_type == 'tags',
                                     text_ngrams=args.text_emb_type == 'unigrams',
                                     select_k=args.feature_selection_k,
                                     post_tag_pca=args.post_tag_pca,
                                     post_tag_lda=args.post_tag_lda)
    # Both backends share the same extract call; dev=True keeps a dev fold
    return extractor, extractor.extract(dataset, run_pkg, dev=True)


def main():
    """ Load data, train and evaluate a model """
    args = get_args()
    exp_name, run_pkg = _build_exp_name(args)
    exp_output_dirpath = os.path.join(args.output_dirpath, exp_name)
    model_outpath = f'/projects/tumblr_community_identity/models/{exp_name}.pkl'

    # Load trained embedding models
    word_embs, graph_embs, sent_embs = _load_embeddings(args)

    # Load and filter dataset
    dataset = _load_dataset(args, word_embs)

    # Extract features
    extractor, dataset = _extract_features(args, run_pkg, dataset,
                                           word_embs, graph_embs, sent_embs)

    # Run model
    print("Running model...")
    data_outpath = f'../tmp/{exp_name}_data.pkl'
    dataset.save(data_outpath)
    print(f"\tSaved dataset folds to {data_outpath}")
    experiment = Experiment(extractor,
                            dataset,
                            args.classifier_type,
                            args.use_cuda,
                            args.epochs,
                            sfs_k=args.forward_feature_selection_k)
    experiment.run()

    # Print output (explicit None check so a legitimate 0.0 score is printed)
    if experiment.dev_score is not None:
        print(f'\tDev set score: {experiment.dev_score: .4f}')
    print(f'\tTest set score: {experiment.test_score: .4f}')

    # Save settings, output (only the sklearn backend supports persistence here)
    if run_pkg == 'sklearn':
        dataset.save_settings(exp_output_dirpath)
        experiment.save_output(exp_output_dirpath)
        experiment.save_model(model_outpath)