Example #1
import logging
import sys

# :: Logging level (assumed INFO here) and root logger setup ::
loggingLevel = logging.INFO
logger = logging.getLogger()
logger.setLevel(loggingLevel)

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loggingLevel)
formatter = logging.Formatter('%(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

######################################################
#
# Data preprocessing
#
######################################################

datasets = read_dict(args.input_dataset_conf)
print("{} {}".format(type(datasets), datasets))

# :: Needed for simulating low-resource scenarios ::
if args.nb_sentence is not None:
    datasets[list(datasets.keys())[0]]['nb_sentence'] = args.nb_sentence

prepare_training_data(datasets)
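
# read_dict() and prepare_training_data() are project-local helpers that are
# not shown in this example. A hypothetical minimal sketch of read_dict,
# assuming the dataset config holds a Python-literal dict on disk (the real
# helper may parse JSON or another format instead):
'''
import ast

def read_dict(path):
    with open(path) as f:
        return ast.literal_eval(f.read())
'''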

# :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
embeddingsPath = 'komninos_english_embeddings.gz'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath,  # sic: the helper really is spelled 'perpareDataset' in this codebase
                            datasets,
                            reducePretrainedEmbeddings=True)
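
# The returned pickle path is typically consumed with loadDatasetPickle from
# the same preprocessing module. A sketch, assuming this script follows the
# usual flow of the UKPLab BiLSTM-CNN-CRF codebase it builds on:
'''
from util.preprocessing import loadDatasetPickle

embeddings, mappings, data = loadDatasetPickle(pickleFile)
'''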

Example #2

'''
datasets = {
    'unidep_pos':
        {'columns': {1:'tokens', 3:'POS'},
         'label': 'POS',
         'evaluate': True,
         'commentSymbol': None},
    'conll2000_chunking':
        {'columns': {0:'tokens', 2:'chunk_BIO'},
         'label': 'chunk_BIO',
         'evaluate': True,
         'commentSymbol': None},
}
'''

######################################################
#
# Data preprocessing
#
######################################################
datasets = read_dict(args.input_dataset_conf)
print("DATASET CONF {} {}".format(type(datasets), datasets))
target_task = get_target_task(datasets)
print("TARGET TASK {} {}".format(type(target_task), target_task))
aux_task = get_auxiliary_task(datasets)
print("AUX TASK {} {}".format(type(aux_task), aux_task))

prepare_training_data(datasets)

embeddingsPath = 'komninos_english_embeddings.gz'  # Embeddings by Komninos et al., built on the dependency-based embeddings of Levy et al.: https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# Auxiliary task selection
#
######################################################
aux_task = []
target_task = [args.target_task]
if args.strategy == "most_similar":
    word2idx = build_vocab_from_domains(TASKS)
    domain2term = build_all_domain_term_dist(TASKS, word2idx)
    most_similar_domain, score = get_most_similar_domain(
        args.target_task, TASKS, domain2term)
    print("Most similar domain is : {}".format(most_similar_domain))
    aux_task.append(most_similar_domain)
elif args.strategy == "all":
    aux_task = list(set(TASKS) - set([args.target_task]))
    print(aux_task)
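
# The "most_similar" strategy above ranks the candidate domains in TASKS
# (apparently a module-level list of available datasets) by how close their
# term distributions are to the target domain's. The project helpers
# build_vocab_from_domains / build_all_domain_term_dist / get_most_similar_domain
# are not shown here, so the following self-contained sketch (cosine similarity
# over normalized term frequencies) is an illustrative assumption, not their
# actual implementation:
'''
import math
from collections import Counter

def term_distribution(sentences):
    # Normalized term-frequency vector for one domain.
    counts = Counter(tok for sent in sentences for tok in sent)
    total = float(sum(counts.values())) or 1.0
    return {tok: c / total for tok, c in counts.items()}

def cosine(p, q):
    dot = sum(v * q.get(t, 0.0) for t, v in p.items())
    norm_p = math.sqrt(sum(v * v for v in p.values()))
    norm_q = math.sqrt(sum(v * v for v in q.values()))
    return dot / (norm_p * norm_q) if norm_p and norm_q else 0.0

def get_most_similar(target, domain2sents):
    # domain2sents: dict mapping domain name -> list of tokenized sentences
    dists = {d: term_distribution(s) for d, s in domain2sents.items()}
    score, best = max((cosine(dists[target], dists[d]), d)
                      for d in domain2sents if d != target)
    return best, score
'''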

# Some network hyperparameters
params = read_dict(args.param_conf)
print("{} {}".format(type(params), params))

if args.ner == 1:
    if args.ner_name is None:
        aux_task = aux_task + NERS
        if args.different_level == 1:
            custom_classifier = {}  # Assuming NER always on the bottom
            custom_classifier[target_task[0]] = [('LSTM', 100), 'CRF']
            for task in aux_task:
                if task in NERS:
                    custom_classifier[task] = ['CRF']
                else:
                    custom_classifier[task] = [('LSTM', 100), 'CRF']
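
            # For illustration (hypothetical task names): with target
            # 'conll2000_chunking' and one NER aux task, custom_classifier
            # ends up shaped like
            #   {'conll2000_chunking': [('LSTM', 100), 'CRF'],
            #    'conll2003_ner': ['CRF']}
            # i.e. NER tasks are supervised directly on the shared bottom
            # LSTM, while other tasks get a task-specific LSTM layer first.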

            params = {