# :: Logging: mirror every record to stdout, message text only ::
logger = logging.getLogger()
logger.setLevel(loggingLevel)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loggingLevel)
ch.setFormatter(logging.Formatter('%(message)s'))
logger.addHandler(ch)

######################################################
#
# Data preprocessing
#
######################################################
datasets = read_dict(args.input_dataset_conf)
print("{} {}".format(type(datasets), datasets))

# :: Needed for simulating the low resource scenarios ::
# Cap the sentence count of the first configured dataset when requested.
if args.nb_sentence is not None:
    first_dataset = next(iter(datasets))
    datasets[first_dataset]['nb_sentence'] = args.nb_sentence

prepare_training_data(datasets)

# :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
embeddingsPath = 'komninos_english_embeddings.gz'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets, reducePretrainedEmbeddings=True)
'evaluate': True, 'commentSymbol': None}, 'conll2000_chunking': {'columns': {0:'tokens', 2:'chunk_BIO'}, 'label': 'chunk_BIO', 'evaluate': True, 'commentSymbol': None}, } ''' ###################################################### # # Data preprocessing # ###################################################### datasets = read_dict(args.input_dataset_conf) print("DATASET CONF {} {}".format(type(datasets), datasets)) target_task = get_target_task(datasets) print("TARGET TASK {} {}".format(type(target_task), target_task)) aux_task = get_auxiliary_task(datasets) print("AUX TASK {} {}".format(type(aux_task), aux_task)) prepare_training_data(datasets) embeddingsPath = 'komninos_english_embeddings.gz' #Word embeddings by Levy et al: https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/ # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder :: pickleFile = perpareDataset(embeddingsPath, datasets) ###################################################### #
aux_task = [] target_task = [args.target_task] if args.strategy == "most_similar": word2idx = build_vocab_from_domains(TASKS) domain2term = build_all_domain_term_dist(TASKS, word2idx) most_similar_domain, score = get_most_similar_domain( args.target_task, TASKS, domain2term) print("Most similar domain is : {}".format(most_similar_domain)) aux_task.append(most_similar_domain) elif args.strategy == "all": aux_task = list(set(TASKS) - set([args.target_task])) print(aux_task) # Some network hyperparameters params = read_dict(args.param_conf) print("{} {}".format(type(params), params)) if args.ner == 1: if args.ner_name is None: aux_task = aux_task + NERS if args.different_level == 1: custom_classifier = {} # Assuming NER always on the bottom custom_classifier[target_task[0]] = [('LSTM', 100), 'CRF'] for task in aux_task: if task in NERS: custom_classifier[task] = ['CRF'] else: custom_classifier[task] = [('LSTM', 100), 'CRF'] params = {