def forward(self, Y, h, c, outEncoder, teacher_force):
    # Y is (number of characters) x 256
    if np.random.rand() > teacher_force:
        seq_len = Y.shape[0] - 1
        output_decoder = load_to_cuda(torch.autograd.Variable(torch.zeros(seq_len, h.shape[1], 48)))
        Y = self.embedding(Y)
        for i in range(len(Y) - 1):  # -1 because <sos> is not counted in the criterion
            h[0], c[0] = self.lstm1(Y[i], (h[0].clone(), c[0].clone()))
            h[1], c[1] = self.lstm2(h[0].clone(), (h[1].clone(), c[1].clone()))
            h[2], c[2] = self.lstm3(h[1].clone(), (h[2].clone(), c[2].clone()))
            h2 = h[2].clone()
            context = self.attention(h2, outEncoder, BATCH_SIZE)
            context = torch.bmm(context, outEncoder.view(outEncoder.shape[1], outEncoder.shape[0], -1))
            # print("context", context.shape)
            # torch squeeze
            output_decoder[i] = self.MLP(torch.cat((h2, torch.squeeze(context, 1)), 1))
    else:
        seq_len = Y.shape[0] - 1
        output_decoder = load_to_cuda(torch.autograd.Variable(torch.zeros(seq_len, h.shape[1], 48)))
        alphabet = Alphabet()
        Y_cur = self.embedding(
            load_to_cuda(Variable(torch.LongTensor([alphabet.ch2index('<sos>')])))
        ).view(1, self.hidden_size)
        for i in range(seq_len - 1):
            Y_cur = Y_cur.expand(BATCH_SIZE, self.hidden_size)
            h[0], c[0] = self.lstm1(Y_cur, (h[0].clone(), c[0].clone()))
            h[1], c[1] = self.lstm2(h[0].clone(), (h[1].clone(), c[1].clone()))
            h[2], c[2] = self.lstm3(h[1].clone(), (h[2].clone(), c[2].clone()))
            h2 = h[2].clone()
            context = self.attention(h2, outEncoder, BATCH_SIZE)
            context = torch.bmm(context, outEncoder.view(outEncoder.shape[1], outEncoder.shape[0], -1))
            output_decoder[i] = self.MLP(torch.cat((h2, torch.squeeze(context, 1)), 1))
            argmax = torch.max(output_decoder[i][0], dim=0)
            Y_cur = self.embedding(
                Variable(load_to_cuda(torch.LongTensor([argmax[1][0].data[0]])))
            ).view(1, self.hidden_size)
    return output_decoder
def __init__(self):
    self.categories = Categories()
    self.categories.load()
    self.alphabet = Alphabet()
    self.alphabet.load()
    self.responses = []
    self.nextRound()
class Round():
    def __init__(self):
        self.categories = Categories()
        self.categories.load()
        self.alphabet = Alphabet()
        self.alphabet.load()
        self.responses = []
        self.nextRound()

    def allResponses(self):
        return [d['response'] for d in self.responses]

    def getResponse(self, ptn):
        log('getResponse for ' + ptn)
        try:
            pr = [d for d in self.responses if d['tn'] == ptn]
            return pr[0]
        except Exception as e:
            return {'tn': ptn, 'valid': False, 'response': 'UNK'}

    def nextRound(self):
        self.cat_index = randint(0, len(self.categories.data) - 1)
        log(self.cat_index)
        self.alpha_index = randint(0, len(self.alphabet.data) - 1)
        log(self.alpha_index)
        self.responses = []

    def describe(self):
        alpha = self.alphabet.data[self.alpha_index]
        return self.categories.data[self.cat_index]['category'] + " that " + alpha['position'].lower() + " " + alpha['letter']
def __init__(self, data, encoding="utf-8", feature_alphabet=None, alphabet_pop=True,
             alphabet_lock=True, sep=":", bias=False, bias_prefix="@@BIAS@@"):
    Source.__init__(self, data, encoding=encoding)
    self._Instance = BinaryClassificationInstance
    if feature_alphabet != None:
        self._feature_alphabet = feature_alphabet
    else:
        self._feature_alphabet = Alphabet(locked=False)
    self._sep = sep
    self._bias = bias
    self._bias_prefix = bias_prefix
    if alphabet_pop:
        self._populate_alphabet()
    if alphabet_lock:
        self.lock_alphabet()
    else:
        self.unlock_alphabet()
    return
def __init__(self):
    self.states = State()
    self.sigma = Alphabet()
    self.delta = list()
    self.delta_nfa = list()
    self.initial_state = None
    self.final_state = list()
def __init__(self):
    super(VsmNormer, self).__init__()
    self.word_alphabet = Alphabet('word')
    self.embedding_dim = None
    self.word_embedding = None
    self.dict_alphabet = Alphabet('dict')
    self.dict_embedding = None
    self.gpu = opt.gpu
def test_cross_off_adds_guessed_letter_to_list_of_guessed_letters(self):
    # arrange
    alphabet = Alphabet()
    letter = "a"
    # act
    alphabet.cross_off(letter)
    # assert
    assert letter in alphabet.guessed_letters
def test_already_guessed_returns_true_if_letter_guessed(self):
    # arrange
    alphabet = Alphabet()
    letter = "h"
    alphabet.cross_off(letter)
    # act
    result = alphabet.already_guessed("h")
    # assert
    assert result is True
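# The two tests above (and the Word test further below) exercise a small
# guessing-game Alphabet with cross_off()/already_guessed(). A minimal sketch
# of such a class is given here for reference only: the attribute and method
# names mirror the tests, everything else is an assumption, and the class name
# is hypothetical so it is not confused with the other Alphabet variants in
# this file.
import string


class GuessingAlphabet:
    """Hypothetical minimal Alphabet for the hangman-style tests."""

    def __init__(self):
        self.letters = list(string.ascii_lowercase)
        self.guessed_letters = []

    def cross_off(self, letter):
        # Record the guess so already_guessed() can see it later.
        if letter not in self.guessed_letters:
            self.guessed_letters.append(letter)

    def already_guessed(self, letter):
        return letter in self.guessed_letters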
def __init__(self, args):
    self.config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
    if args.dataset not in self.config['data_list']:
        raise KeyError("No such dataset named {}.".format(args.dataset))
    self.config['dataset'] = args.dataset
    self.datatype = 'binary'
    if self.config['dataset'] in self.config['datatype']['train_test']:
        self.datatype = 'train_test'
    self.alphabet = Alphabet('word')
    self.set_seed()
def get_word(seq):  # seq is a sequence of character indices
    # print(seq)
    alphabet = Alphabet()
    s = ""
    if len(seq) == 0:
        return s
    for el in seq:
        # print("el:", el.data)
        s += alphabet.index2ch(el)
    return s
def train(self, bitextGen):
    self.frenchAlphabet = Alphabet.from_iterable(
        word for frSent, enSent in bitextGen(desc='French Alphabet') for word in frSent)
    self.englishAlphabet = Alphabet.from_iterable(
        word for frSent, enSent in bitextGen(desc='English Alphabet') for word in enSent)
    self.frenchAlphabet.freeze()
    self.englishAlphabet.freeze()
    vF = len(self.frenchAlphabet)
    vE = len(self.englishAlphabet)
    tOfEGivenF = np.ones((vE, vF)) / vF
    aOfIJGivenLenELenF = AlignmentDict()
    for ep in tqdm(range(self.epochs), desc='Epoch'):
        countOfEGivenF = np.zeros((vE, vF))
        totalOfF = np.zeros(vF)
        countOfIGivenJ = AlignmentDict()
        totalOfJ = CountDict()
        for frSent, enSent in bitextGen('Training'):
            # Compute normalization stuff
            lenF = len(frSent)
            frMask = self.frenchAlphabet.map(frSent)
            lenE = len(enSent)
            enMask = self.englishAlphabet.map(enSent)
            aOfIJ = aOfIJGivenLenELenF[lenE, lenF]
            # total probability of each English word being translated from the French ones
            # has size of {len(enSent) x 1}
            sTotalOfE = np.sum(tOfEGivenF[np.ix_(enMask, frMask)] * aOfIJ, axis=1, keepdims=True)
            # calculate counts
            delta = tOfEGivenF[np.ix_(enMask, frMask)] * aOfIJ / sTotalOfE
            deltaSummedOverE = np.sum(delta, axis=0)
            countOfEGivenF[np.ix_(enMask, frMask)] += delta
            totalOfF[frMask] += deltaSummedOverE
            countOfIGivenJ[lenE, lenF] += delta
            totalOfJ[lenE, lenF] += deltaSummedOverE
        # estimate probabilities
        tOfEGivenF = countOfEGivenF / totalOfF
        for lenE, lenF in aOfIJGivenLenELenF:
            aOfIJGivenLenELenF[lenE, lenF] = countOfIGivenJ[lenE, lenF] / totalOfJ[lenE, lenF]
    self.tOfEGivenF = tOfEGivenF
    self.aOfIJGivenLenELenF = aOfIJGivenLenELenF
def map_string_2_id_open(string_list, name):
    string_id_list = []
    alphabet_string = Alphabet(name)
    for strings in string_list:
        ids = []
        for string in strings:
            id = alphabet_string.get_index(string)
            ids.append(id)
        string_id_list.append(ids)
    alphabet_string.close()
    return string_id_list, alphabet_string
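# Hypothetical usage of map_string_2_id_open above, assuming Alphabet.get_index()
# assigns a fresh id the first time a string is seen; the token lists and the
# "word" name are illustrative only.
token_lists = [["the", "cat", "sat"], ["the", "dog"]]
id_lists, word_alphabet = map_string_2_id_open(token_lists, "word")
# id_lists now holds one list of integer ids per input sentence, and
# word_alphabet has been closed, so its vocabulary is fixed afterwards.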
def _alphabet_from_rdata(rdata, void_label, dummy_label):
    """Extract alphabet (of observations and labels) from given raw data."""
    alphabet = Alphabet(void_label=void_label, dummy_label=dummy_label)
    for (sent, labels) in rdata:
        for word in sent:
            for x in chain(*word):
                alphabet.add_observation(x)
        for y in labels:
            alphabet.add_label(y)
    return alphabet
def time_stamp_calc(self):
    time = floor(self.creation_time)
    # choosing a seed value to compare; in this case, the date I'm writing this code
    seed = 10012019
    alpha = Alphabet()
    alpha.shuffle()
    index = (time % seed)
    # digit_one = alpha[temp_time % seed]
    # alpha = alpha.shuffle()
    # digit_two = alpha[temp_time % seed]
    return index
def load_config_pos(config_path, char_embedd_dim):
    max_sent_length, max_char_length, num_labels, embedd_dim_concat = load_config(config_path)
    alphabet_char = Alphabet('char', keep_growing=False)
    alphabet_char.load(config_path, 'alphabet_char')
    alphabet_label = Alphabet('label', keep_growing=False)
    alphabet_label.load(config_path, 'alphabet_label')
    scale = np.sqrt(3.0 / char_embedd_dim)
    char_embedd_table = np.random.uniform(-scale, scale, [alphabet_char.size(), char_embedd_dim]).\
        astype(theano.config.floatX)
    return max_sent_length, max_char_length, num_labels, embedd_dim_concat, alphabet_char, alphabet_label, \
        char_embedd_table
def build_domain(data):
    """
    Do feature extraction to determine the set of *supported* features, i.e.
    those active in the ground truth configuration and active labels. This
    function maps each feature and label to an integer.
    """
    L = Alphabet()
    A = Alphabet()
    for x in data:
        L.add_many(x.truth)
        A.add_many(f for token in x.sequence for f in token.attributes)
    # domains are now ready
    L.freeze()
    A.stop_growth()
    return (L, A)
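# Hypothetical toy input for build_domain above. Example and Token are
# illustrative stand-ins for whatever instance/token classes the surrounding
# code actually uses; only the .truth, .sequence and .attributes fields that
# build_domain reads are taken from the snippet itself.
from collections import namedtuple

Token = namedtuple('Token', 'attributes')
Example = namedtuple('Example', 'truth sequence')

data = [Example(truth=['B-PER', 'O'],
                sequence=[Token(attributes=['w=John', 'shape=Xxxx']),
                          Token(attributes=['w=runs', 'shape=xxxx'])])]
L, A = build_domain(data)  # L holds label ids, A holds feature ids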
def main():
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'
    outdir = HOME_DIR + '_' + input_fname
    print outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    ddir = 'semeval/binary'
    train16 = "task-BD-train-2016.tsv"
    dev2016 = "task-BD-dev-2016.tsv"
    devtest2016 = "task-BD-devtest-2016.tsv"
    test2016 = "SemEval2016-task4-test.subtask-BD.txt"

    fname_vocab = os.path.join(outdir, 'vocab.pickle')
    alphabet = cPickle.load(open(fname_vocab))
    dummy_word_idx = alphabet.fid
    print "alphabet", len(alphabet)
    print 'dummy_word:', dummy_word_idx

    topic_alphabet = Alphabet(start_feature_id=0)
    topic_alphabet.add('UNKNOWN_TOPIC_IDX')
    dummy_topic_idx = topic_alphabet.fid

    print "Loading Semeval Data"
    # save semeval tweets to separate files
    files = [train16, dev2016, devtest2016, test2016]
    for fname in files:
        fname_ext = os.path.join(ddir, fname)
        tid, topics, tweets, sentiments = load_data(fname_ext, topic_alphabet)
        print "Number of tweets:", len(tweets)

        tweet_idx = pts.convert2indices(tweets, alphabet, dummy_word_idx)
        topic_idx = get_topic_indices(tweets, topics, topic_alphabet)

        basename, _ = os.path.splitext(os.path.basename(fname))
        np.save(os.path.join(outdir, '{}.tids.npy'.format(basename)), tid)
        np.save(os.path.join(outdir, '{}.tweets.npy'.format(basename)), tweet_idx)
        np.save(os.path.join(outdir, '{}.sentiments.npy'.format(basename)), sentiments)
        np.save(os.path.join(outdir, '{}.topics.npy'.format(basename)), topic_idx)

    cPickle.dump(
        topic_alphabet,
        open(os.path.join(outdir, 'vocab_{}.pickle'.format('topic')), 'w'))
def __init__(self, config, alphabet: Alphabet, emb_dim, device):
    super(TextCNN, self).__init__()
    self.config = config
    self.embeddings = nn.Embedding(alphabet.size(), emb_dim)
    # self.embeddings.weight.requires_grad = False
    if config['train_mode'] == 'static':
        self.embeddings = self.embeddings.from_pretrained(
            torch.from_numpy(alphabet.pretrained_emb))
    elif config['train_mode'] == 'fine-tuned':
        self.embeddings.weight.data.copy_(
            torch.from_numpy(alphabet.pretrained_emb))
    filters = config['filters']
    self.cnn = nn.ModuleList([
        nn.Sequential(
            nn.Conv1d(1, config['output_channels'], [w, emb_dim]),
            nn.ReLU(),
            nn.AdaptiveMaxPool2d(1))
        for w in filters
    ])
    self.linear = nn.Linear(config['output_channels'] * len(filters), 2, bias=True)
    self.dropout = nn.Dropout(config['dropout'])
    self.relu = nn.ReLU()
    self.scale = np.sqrt(3.0 / emb_dim)
    self.apply(self._init_esim_weights)
def initial_feature_alphabets(self):
    items = open(self.train_dir, 'r').readline().strip('\n').split()
    print items
    total_column = len(items)
    if total_column > 2:
        for idx in range(1, total_column - 1):
            feature_prefix = items[idx].split(']', 1)[0] + "]"
            print "feature_prefix:{}".format(feature_prefix)
            self.feature_alphabets.append(Alphabet(feature_prefix))
            self.feature_name.append(feature_prefix)
            print "Find feature: ", feature_prefix
    self.feature_num = len(self.feature_alphabets)
    self.pretrain_feature_embeddings = [None] * self.feature_num
    self.feature_emb_dims = [20] * self.feature_num
    self.feature_emb_dirs = [None] * self.feature_num
    self.norm_feature_embs = [False] * self.feature_num
    self.feature_alphabet_sizes = [0] * self.feature_num
    if self.feat_config:
        for idx in range(self.feature_num):
            if self.feature_name[idx] in self.feat_config:
                self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size']
                self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir']
                self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm']
def initial_feature_alphabets(self):
    for l in open(self.train_dir, 'r').readlines():
        if not l.startswith("#") and not l.startswith("-BOS-"):
            items = l.strip("\n").split()
            break
    total_column = len(items)
    if total_column > 2:
        for idx in range(1, total_column - 1):
            feature_prefix = items[idx].split(']', 1)[0] + "]"
            self.feature_alphabets.append(Alphabet(feature_prefix))
            self.feature_name.append(feature_prefix)
            print "Find feature: ", feature_prefix
    self.feature_num = len(self.feature_alphabets)
    self.pretrain_feature_embeddings = [None] * self.feature_num
    self.feature_emb_dims = [self.HP_feature_default_size] * self.feature_num
    # self.feature_emb_dims = [20] * self.feature_num
    self.feature_emb_dirs = [None] * self.feature_num
    self.norm_feature_embs = [False] * self.feature_num
    self.feature_alphabet_sizes = [0] * self.feature_num
    if self.feat_config:
        for idx in range(self.feature_num):
            if self.feature_name[idx] in self.feat_config:
                self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size']
                self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir']
                self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm']
def __init__(self):
    self.name2id = {}    # preferred name -> id
    self.id2name = {}    # id -> CTD_Term
    self.altid2id = {}   # alternative id -> id
    if opt.method == 'cla':
        self.id_alphabet = Alphabet('id')
def test_given_alphabet_as_int_returns_error(self):
    test_data = 123456
    try:
        Alphabet('Test', test_data)
        self.assertFalse(True, "Expected exception")
    except:
        return
def main():
    UPPER_STRING = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    testAlphabet = Alphabet(UPPER_STRING)
    permutation1 = Permutation("(AELTPHQXRU) (BKNW) (CMOY) (DFG) (IV) (JZ) (S)", testAlphabet)
    permutation2 = Permutation("(FIXVYOMW) (CDKLHUP) (ESZ) (BJ) (GR) (NT) (A) (Q)", testAlphabet)
    permutation3 = Permutation("(ABDHPEJT) (CFLVMZOYQIRWUKXSG) (N)", testAlphabet)
    permutation4 = Permutation("(AEPLIYWCOXMRFZBSTGJQNH) (DV) (KU)", testAlphabet)
    permutation5 = Permutation("(AE) (BN) (CK) (DQ) (FU) (GY) (HW) (IJ) (LO) (MP) (RX) (SZ) (TV)", testAlphabet)
    rotor1 = Rotor("I", permutation1, "TG")
    rotor2 = Rotor("II", permutation2, "A")
    rotor3 = Rotor("III", permutation3, "B")
    rotor4 = Rotor("IV", permutation4, "XO")
    reflector = Reflector("A", permutation5)
    rotors = [reflector, rotor4, rotor3, rotor2, rotor1]
    machine = Machine(testAlphabet, 5, 6, rotors)
    machine.insertRotors(["A", "IV", "III", "II", "I"])
    machine.setRotors("AAAA")
    message = input("What to convert:")
    print(machine.convertMsg(message))
def __init__(self, frame_dir):
    self.frame_dir = frame_dir
    self.alphabet = Alphabet()
    # self.words = [name for name in os.listdir(FRAME_DIR)]
    # walk recursively through the folders with video frames
    self.words = []
    for root, dirs, files in os.walk(self.frame_dir):
        if not dirs:
            self.words.append(root)
            # print('root: ', root)
            # print('dirs: ', dirs)
            # print('files: ', files)
    # print(self.words)
    self.count = 0
def __init__(self, opt):
    self.train_data = None
    self.dev_data = None
    self.test_data = None

    self.word_alphabet = Alphabet('word')
    self.char_alphabet = Alphabet('character')
    self.label_alphabet = Alphabet('label', True)

    self.train_texts = None
    self.train_Ids = None
    self.dev_texts = None
    self.dev_Ids = None
    self.test_texts = None
    self.test_Ids = None

    self.pretrain_word_embedding = None
    self.word_emb_dim = opt.word_emb_dim

    self.config = self.read_config(opt.config)
    self.feat_config = None
    the_item = 'ner_feature'
    if the_item in self.config:
        self.feat_config = self.config[the_item]  ## [POS]:{emb_size:20}
    self.feature_alphabets = []
    self.feature_emb_dims = []
    for k, v in self.feat_config.items():
        self.feature_alphabets.append(Alphabet(k))
        self.feature_emb_dims.append(int(v['emb_size']))
def decode_all(manifests):
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_path,
        mean_std_filepath=args.mean_std_path,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1,
        keep_transcription_text=True)
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        use_gru=args.use_gru,
        pretrained_model_path=args.model_path,
        share_rnn_weights=args.share_rnn_weights)

    # decoders only accept strings encoded in utf-8
    alphabet = Alphabet(args.vocab_path)

    ds2_model.logger.info("start decoding with extended output...")
    ds2_model.init_ext_scorer(args.alpha, args.beta, args.lang_model_path,
                              args.trie_path, alphabet)

    for audioname, manifest_path, duration, offset in manifests:
        try:
            duration_f = float(duration)
            if duration_f < 1.:
                yield (audioname, manifest_path, None, duration, offset)
                continue
        except (TypeError, ValueError):
            pass
        batch_reader = data_generator.batch_reader_creator(
            manifest_path=manifest_path,
            batch_size=args.num_samples,
            min_batch_size=1,
            sortagrad=False,
            shuffle_method=None)
        for decode_data in batch_reader():
            probs_split = ds2_model.infer_batch_probs(
                infer_data=decode_data,
                feeding_dict=data_generator.feeding)
            # note: we only perform single file decoding
            result_transcript = ds2_model.decode_beam_search(
                probs_split=probs_split,
                beam_size=args.beam_size,
                cutoff_prob=args.cutoff_prob,
                cutoff_top_n=args.cutoff_top_n,
                alphabet=alphabet)
            yield (audioname, manifest_path, result_transcript, duration, offset)
def __data_generation(self, list_IDs_temp, indexes):
    'Generates data containing batch_size samples'
    # X : (n_samples, *dim, n_channels)
    # Initialization
    X = []
    # Generate data
    for i, ID in enumerate(list_IDs_temp):
        X.append(self.load_audio(join(self.path_prefix, ID)))
    X = self.pad_and_transpose(X)
    X = self.align(X)
    alphabet = Alphabet("alphabet.txt")
    # lyrics = alphabet.get_batch_labels(self.fetch_lyrics(self.labels))
    lyrics = alphabet.get_batch_labels(self.labels[indexes])
    # lyrics = self.align(lyrics)
    return (X, lyrics)
def __init__(self, input_file):
    self.original_data = open(input_file, 'r').readlines()
    self.index_data = []
    self.word_alphabet = Alphabet('word')
    self.gloss_alphabet = Alphabet('gloss')
    self.entity_alphabet = Alphabet('entity')
    self.gaz_alphabet = Alphabet('gaz')
    self.label_alphabet = Alphabet('label')
    self.word_alphabet_size = 0
    self.gloss_alphabet_size = 0
    self.entity_alphabet_size = 0
    self.gaz_alphabet_size = 0
    self.label_alphabet_size = 0

    ### hyperparameters
    self.HP_iteration = 100
    self.HP_batch_size = 1
    self.HP_gaz_hidden_dim = 50
    self.HP_lstm_hidden_dim = 200
    self.HP_dropout = 0.5
    self.gaz_dropout = 0.5
    self.HP_lstm_layer = 1
    self.HP_bilstm = False
    self.HP_use_entity = False
    self.HP_use_gloss = True
    self.HP_use_gaz = False
    self.HP_gpu = True
    self.HP_lr = 0.015
    self.HP_lr_decay = 0.05
    self.HP_clip = 5.0
    self.HP_momentum = 0
    self.HP_iteration = 100

    # embedding hyperparameters
    self.word_emb_dim = 200
    self.entity_emb_dim = 50
    self.gloss_features = "CNN"  # ["CNN", "LSTM"]
    self.gloss_emb_dim = 200
    self.gloss_hidden_dim = 300
    self.pretrain_word_embedding = np.array([])
    self.pretrain_gaz_embedding = None
    self.word_embed_path = "../LOVECC/NYM.6B.200d.txt"  # "NYM_200.txt"
    self.gaz_embed_path = None
    self.gaz_emb_dim = 200
    self.HP_fix_gaz_emb = True
def test_str_representation_does_not_show_hidden_letters(self):
    # arrange
    alphabet = Alphabet()
    word = Word(alphabet)
    word.word_to_guess = "aardvark"
    word.guess_letter("a")
    # act
    hidden_word = str(word)
    # assert
    assert hidden_word == "aa___a__"
def make_alphabet():
    alphabet = Alphabet(0)
    load_dataset("%s/%s.train.txt" % (data, dataset), alphabet)
    load_dataset("%s/%s.valid.txt" % (data, dataset), alphabet)
    if dataset == 'ptb':
        load_dataset("%s/%s.test.txt" % (data, dataset), alphabet)  # add all the words in all three datasets
    print("%s: total %d words" % (dataset, len(alphabet)))
    pickle.dump(alphabet, open("%s/alphabet.pkl" % data, "wb"))
def main():
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'
    outdir = HOME_DIR + '_' + input_fname
    print outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    ddir = 'semeval/binary'
    train16 = "task-BD-train-2016.tsv"
    dev2016 = "task-BD-dev-2016.tsv"
    devtest2016 = "task-BD-devtest-2016.tsv"
    test2016 = "SemEval2016-task4-test.subtask-BD.txt"

    fname_vocab = os.path.join(outdir, 'vocab.pickle')
    alphabet = cPickle.load(open(fname_vocab))
    dummy_word_idx = alphabet.fid
    print "alphabet", len(alphabet)
    print 'dummy_word:', dummy_word_idx

    topic_alphabet = Alphabet(start_feature_id=0)
    topic_alphabet.add('UNKNOWN_TOPIC_IDX')
    dummy_topic_idx = topic_alphabet.fid

    print "Loading Semeval Data"
    # save semeval tweets to separate files
    files = [train16, dev2016, devtest2016, test2016]
    for fname in files:
        fname_ext = os.path.join(ddir, fname)
        tid, topics, tweets, sentiments = load_data(fname_ext, topic_alphabet)
        print "Number of tweets:", len(tweets)

        tweet_idx = pts.convert2indices(tweets, alphabet, dummy_word_idx)
        topic_idx = get_topic_indices(tweets, topics, topic_alphabet)

        basename, _ = os.path.splitext(os.path.basename(fname))
        np.save(os.path.join(outdir, '{}.tids.npy'.format(basename)), tid)
        np.save(os.path.join(outdir, '{}.tweets.npy'.format(basename)), tweet_idx)
        np.save(os.path.join(outdir, '{}.sentiments.npy'.format(basename)), sentiments)
        np.save(os.path.join(outdir, '{}.topics.npy'.format(basename)), topic_idx)

    cPickle.dump(
        topic_alphabet,
        open(os.path.join(outdir, 'vocab_{}.pickle'.format('topic')), 'w'))
def __init__(self, item_path, sub_item_path, pair_path, split_c=','):
    self.__dict__.update(locals())

    print('Loading title and category information...', time.ctime())
    sub_item_set = set()
    for line in open(sub_item_path).readlines():
        sub_item_set.add(line.split()[0])

    self.item_title = {}
    self.item_cat = {}
    self.cat2idx = {}
    self.max_len = 0
    sentence_list = []
    for line in open(item_path).readlines():
        tmp = line.split()
        item = tmp[0]
        cat = tmp[1]
        if cat not in self.cat2idx:
            self.cat2idx[cat] = len(self.cat2idx)
        title = tmp[2].split(split_c)
        self.item_title[item] = title
        self.item_cat[item] = self.cat2idx[cat]
        if item in sub_item_set:
            sentence_list.append(title)
        self.max_len = min(config.max_len, max(self.max_len, len(title)))
    print(('%s items' % len(sentence_list)), time.ctime())

    print('Generating alphabet...', time.ctime())
    self.alphabet = Alphabet()
    add_to_vocab(sentence_list, self.alphabet)
    print(('%s words' % len(self.alphabet)), time.ctime())

    print('Generating weight from word2vec model...', time.ctime())
    self.sentence_list = sentence_list
    w2v_model = word2vec(sentence_list)
    self.w2v_weight = np.zeros((len(self.alphabet), config.w2vSize))
    for word, idx in self.alphabet.iteritems():
        if word in w2v_model.vocab:
            self.w2v_weight[idx] = w2v_model[word]

    print('Loading pairs ...', time.ctime())
    self.pair_list = open(pair_path).readlines()
def initial_feature_alphabets(self):
    feature_prefix = '[Cap]'
    self.feature_alphabets.append(Alphabet(feature_prefix))
    self.feature_name.append(feature_prefix)
    self.feature_name2id[feature_prefix] = 0

    feature_prefix = '[POS]'
    self.feature_alphabets.append(Alphabet(feature_prefix))
    self.feature_name.append(feature_prefix)
    self.feature_name2id[feature_prefix] = 1

    self.feature_num = len(self.feature_alphabets)
    self.feature_emb_dims = [20] * self.feature_num
    self.feature_alphabet_sizes = [0] * self.feature_num
    if self.feat_config:
        for idx in range(self.feature_num):
            if self.feature_name[idx] in self.feat_config:
                self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size']
def pad(s, destination_size=None):
    """Pad a string using different whitespace characters to stop Twitter
    from thinking two tweets are the same.

    Will try to add 10% whitespace to the string.
    """
    if not destination_size:
        destination_size = min(len(s) + max(int(len(s) * 0.1), 5), 140)
    padding = ''
    for i in range(len(s), destination_size):
        padding += Alphabet.random_whitespace()
    return s + padding
def __init__(self, data, encoding="utf-8", feature_alphabet=None, alphabet_pop=True,
             alphabet_lock=True, sep=":", bias=False, bias_prefix="@@BIAS@@"):
    Source.__init__(self, data, encoding=encoding)
    self._Instance = BinaryClassificationInstance
    if feature_alphabet != None:
        self._feature_alphabet = feature_alphabet
    else:
        self._feature_alphabet = Alphabet(locked=False)
    self._sep = sep
    self._bias = bias
    self._bias_prefix = bias_prefix
    if alphabet_pop:
        self._populate_alphabet()
    if alphabet_lock:
        self.lock_alphabet()
    else:
        self.unlock_alphabet()
    return
def main():
    data_dir = 'tweets/hashtag_top100_smileys_tweets_{}.gz'
    output_dir_tweets = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.tweets.npy'
    output_dir_hashtags = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.hashtags.npy'
    outdir = 'parsed_tweets'

    alphabet_words = Alphabet(start_feature_id=0)
    alphabet_words.add('UNKNOWN_WORD_IDX')
    alphabet_words.add('DUMMY_WORD_IDX')
    dummy_word_idx = DUMMY_WORD_IDX

    alphabet_hashtags = Alphabet(start_feature_id=0)
    alphabet_hashtags.add('UNKNOWN_HASHTAG_IDX')

    inp = 'train'
    store_file(data_dir.format(inp), output_dir_tweets.format(inp), alphabet_words,
               alphabet_hashtags, dummy_word_idx, output_dir_hashtags.format(inp))
    inp = 'test'
    store_file(data_dir.format(inp), output_dir_tweets.format(inp), alphabet_words,
               alphabet_hashtags, dummy_word_idx, output_dir_hashtags.format(inp))

    cPickle.dump(alphabet_words, open(os.path.join(outdir, 'vocab_words.pickle'), 'w'))
    cPickle.dump(alphabet_hashtags, open(os.path.join(outdir, 'vocab_hashtags.pickle'), 'w'))
def generate_character_data(sentences_train, sentences_dev, sentences_test, max_sent_length, char_embedd_dim=30):
    """
    generate data for characters
    :param sentences_train:
    :param sentences_dev:
    :param sentences_test:
    :param max_sent_length:
    :return: C_train, C_dev, C_test, char_embedd_table
    """

    def get_character_indexes(sentences):
        index_sentences = []
        max_length = 0
        for words in sentences:
            index_words = []
            for word in words:
                index_chars = []
                if len(word) > max_length:
                    max_length = len(word)
                for char in word[:MAX_CHAR_LENGTH]:
                    char_id = char_alphabet.get_index(char)
                    index_chars.append(char_id)
                index_words.append(index_chars)
            index_sentences.append(index_words)
        return index_sentences, max_length

    def construct_tensor_char(index_sentences):
        C = np.empty([len(index_sentences), max_sent_length, max_char_length], dtype=np.int32)
        word_end_id = char_alphabet.get_index(word_end)
        for i in range(len(index_sentences)):
            words = index_sentences[i]
            sent_length = len(words)
            for j in range(sent_length):
                chars = words[j]
                char_length = len(chars)
                for k in range(char_length):
                    cid = chars[k]
                    C[i, j, k] = cid
                # fill index of word end after the end of word
                C[i, j, char_length:] = word_end_id
            # Zero out C after the end of the sentence
            C[i, sent_length:, :] = 0
        return C

    def build_char_embedd_table():
        scale = np.sqrt(3.0 / char_embedd_dim)
        char_embedd_table = np.random.uniform(-scale, scale, [char_alphabet.size(), char_embedd_dim]).astype(
            theano.config.floatX)
        return char_embedd_table

    char_alphabet = Alphabet('character')
    char_alphabet.get_index(word_end)

    index_sentences_train, max_char_length_train = get_character_indexes(sentences_train)
    index_sentences_dev, max_char_length_dev = get_character_indexes(sentences_dev)
    index_sentences_test, max_char_length_test = get_character_indexes(sentences_test)

    # close character alphabet
    char_alphabet.close()
    logger.info("character alphabet size: %d" % (char_alphabet.size() - 1))

    max_char_length = min(MAX_CHAR_LENGTH, max(max_char_length_train, max_char_length_dev, max_char_length_test))
    logger.info("Maximum character length of training set is %d" % max_char_length_train)
    logger.info("Maximum character length of dev set is %d" % max_char_length_dev)
    logger.info("Maximum character length of test set is %d" % max_char_length_test)
    logger.info("Maximum character length used for training is %d" % max_char_length)

    # fill character tensor
    C_train = construct_tensor_char(index_sentences_train)
    C_dev = construct_tensor_char(index_sentences_dev)
    C_test = construct_tensor_char(index_sentences_test)

    return C_train, C_dev, C_test, build_char_embedd_table()
def load_dataset_sequence_labeling(train_path, dev_path, test_path, word_column=1, label_column=4,
                                   label_name='pos', oov='embedding', fine_tune=False, embedding="word2Vec",
                                   embedding_path=None, use_character=False):
    """
    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of word (start from 0)
    :param label_column: the column of label (start from 0)
    :param label_name: name of label, such as pos or ner
    :param oov: embedding for oov word, choose from ['random', 'embedding']. If "embedding", then add words
        in dev and test data to alphabet; if "random", not.
    :param fine_tune: if fine tune word embeddings.
    :param embedding: embeddings for words, choose from ['word2vec', 'senna'].
    :param embedding_path: path of file storing word embeddings.
    :param use_character: if use character embeddings.
    :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test,
        embedd_table (if fine tune), label_alphabet, C_train, C_dev, C_test, char_embedd_table
    """

    def get_max_length(word_sentences):
        max_len = 0
        for sentence in word_sentences:
            length = len(sentence)
            if length > max_len:
                max_len = length
        return max_len

    def construct_tensor_fine_tune(word_index_sentences, label_index_sentences):
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Y = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX)

        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(word_ids)
            for j in range(length):
                wid = word_ids[j]
                label = label_ids[j]
                X[i, j] = wid
                Y[i, j] = label - 1
            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, Y, mask

    def build_embedd_table(embedd_dict, embedd_dim, caseless):
        scale = np.sqrt(3.0 / embedd_dim)
        embedd_table = np.empty([word_alphabet.size(), embedd_dim], dtype=theano.config.floatX)
        embedd_table[word_alphabet.default_index, :] = np.random.uniform(-scale, scale, [1, embedd_dim])
        for word, index in word_alphabet.iteritems():
            ww = word.lower() if caseless else word
            embedd = embedd_dict[ww] if ww in embedd_dict else np.random.uniform(-scale, scale, [1, embedd_dim])
            embedd_table[index, :] = embedd
        return embedd_table

    def generate_dataset_fine_tune():
        """
        generate data tensor when fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test,
            embedd_table, label_size
        """
        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path,
                                                                           word_alphabet, logger)
        logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless))
        # fill data tensor (X.shape = [#data, max_length], Y.shape = [#data, max_length])
        X_train, Y_train, mask_train = construct_tensor_fine_tune(word_index_sentences_train,
                                                                   label_index_sentences_train)
        X_dev, Y_dev, mask_dev = construct_tensor_fine_tune(word_index_sentences_dev,
                                                            label_index_sentences_dev)
        X_test, Y_test, mask_test = construct_tensor_fine_tune(word_index_sentences_test,
                                                               label_index_sentences_test)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(
            word_sentences_train, word_sentences_dev, word_sentences_test,
            max_length) if use_character else (None, None, None, None)

        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
            build_embedd_table(embedd_dict, embedd_dim, caseless), label_alphabet, \
            C_train, C_dev, C_test, char_embedd_table

    def construct_tensor_not_fine_tune(word_sentences, label_index_sentences, unknown_embedd, embedd_dict,
                                       embedd_dim, caseless):
        X = np.empty([len(word_sentences), max_length, embedd_dim], dtype=theano.config.floatX)
        Y = np.empty([len(word_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_sentences), max_length], dtype=theano.config.floatX)

        # bad_dict = dict()
        # bad_num = 0
        for i in range(len(word_sentences)):
            words = word_sentences[i]
            label_ids = label_index_sentences[i]
            length = len(words)
            for j in range(length):
                word = words[j].lower() if caseless else words[j]
                label = label_ids[j]
                embedd = embedd_dict[word] if word in embedd_dict else unknown_embedd
                X[i, j, :] = embedd
                Y[i, j] = label - 1
                # if word not in embedd_dict:
                #     bad_num += 1
                #     if word in bad_dict:
                #         bad_dict[word] += 1
                #     else:
                #         bad_dict[word] = 1
            # Zero out X after the end of the sequence
            X[i, length:] = np.zeros([1, embedd_dim], dtype=theano.config.floatX)
            # Copy the last label after the end of the sequence
            Y[i, length:] = Y[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        # for w, c in bad_dict.items():
        #     if c >= 100:
        #         print "%s: %d" % (w, c)
        # print bad_num
        return X, Y, mask

    def generate_dataset_not_fine_tune():
        """
        generate data tensor when not fine tuning
        :return: X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, None, label_size
        """
        embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path,
                                                                           word_alphabet, logger)
        logger.info("Dimension of embedding is %d, Caseless: %s" % (embedd_dim, caseless))

        # fill data tensor (X.shape = [#data, max_length, embedding_dim], Y.shape = [#data, max_length])
        unknown_embedd = np.random.uniform(-0.01, 0.01, [1, embedd_dim])
        X_train, Y_train, mask_train = construct_tensor_not_fine_tune(word_sentences_train,
                                                                       label_index_sentences_train,
                                                                       unknown_embedd, embedd_dict,
                                                                       embedd_dim, caseless)
        X_dev, Y_dev, mask_dev = construct_tensor_not_fine_tune(word_sentences_dev, label_index_sentences_dev,
                                                                unknown_embedd, embedd_dict, embedd_dim, caseless)
        X_test, Y_test, mask_test = construct_tensor_not_fine_tune(word_sentences_test, label_index_sentences_test,
                                                                   unknown_embedd, embedd_dict, embedd_dim,
                                                                   caseless)
        C_train, C_dev, C_test, char_embedd_table = generate_character_data(
            word_sentences_train, word_sentences_dev, word_sentences_test,
            max_length) if use_character else (None, None, None, None)

        return X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
            None, label_alphabet, C_train, C_dev, C_test, char_embedd_table

    word_alphabet = Alphabet('word')
    label_alphabet = Alphabet(label_name)

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, _, word_index_sentences_train, label_index_sentences_train = read_conll_sequence_labeling(
        train_path, word_alphabet, label_alphabet, word_column, label_column)

    # if oov is "random" and do not fine tune, close word_alphabet
    if oov == "random" and not fine_tune:
        logger.info("Close word alphabet.")
        word_alphabet.close()

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, _, word_index_sentences_dev, label_index_sentences_dev = read_conll_sequence_labeling(
        dev_path, word_alphabet, label_alphabet, word_column, label_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, _, word_index_sentences_test, label_index_sentences_test = read_conll_sequence_labeling(
        test_path, word_alphabet, label_alphabet, word_column, label_column)

    # close alphabets
    word_alphabet.close()
    label_alphabet.close()

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("label alphabet size: %d" % (label_alphabet.size() - 1))

    # get maximum length
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    if fine_tune:
        logger.info("Generating data with fine tuning...")
        return generate_dataset_fine_tune()
    else:
        logger.info("Generating data without fine tuning...")
        return generate_dataset_not_fine_tune()
class BinarySource(Source):
    """ Source for binary classification data in following format: one example
    per line with feature-value pair separated by separator symbol
    (' ' by default). E.g.:

        1 f1:1.0 f2:1.0 f3:1.0
        -1 f2:1.0 f3:1.0 f8:1.0
        -1 f1:1.0 f2:1.0
        1 f8:1.0 f9:1.0 f10:1.0
    """

    def __init__(self, data, encoding="utf-8", feature_alphabet=None, alphabet_pop=True,
                 alphabet_lock=True, sep=":", bias=False, bias_prefix="@@BIAS@@"):
        Source.__init__(self, data, encoding=encoding)
        self._Instance = BinaryClassificationInstance
        if feature_alphabet != None:
            self._feature_alphabet = feature_alphabet
        else:
            self._feature_alphabet = Alphabet(locked=False)
        self._sep = sep
        self._bias = bias
        self._bias_prefix = bias_prefix
        if alphabet_pop:
            self._populate_alphabet()
        if alphabet_lock:
            self.lock_alphabet()
        else:
            self.unlock_alphabet()
        return

    def _parse(self):
        """ return parsed line """
        sep = self._sep
        for line in self._stream:
            line = line.rstrip()
            items = line.split()
            cl = items[0]
            assert cl in [POS_LAB, NEG_LAB]
            feats = []
            if self._bias:
                feats.append((self._bias_prefix, 1.0))  # implicit bias
            for s in items[1:]:
                try:
                    f, v = s.rsplit(sep, 1)
                    v = float(v)
                    feats.append((f, v))
                except ValueError:
                    sys.exit("Datasource error: make sure you use the right datasource format.")
            yield (cl, feats)

    def _populate_alphabet(self):
        print >> sys.stderr, "Populating feature alphabet... ",
        self.unlock_alphabet()
        if self._stream_type == "generator":
            for i, gen_inst in enumerate(self._stream):  # read stream directly
                sys.stderr.write("%s" % "\b" * len(str(i)) + str(i))
                featvals = gen_inst.get_featvals()
                for (f, _) in featvals:
                    self._feature_alphabet.add(f)
        else:
            try:
                for tag, feats in self._parse():
                    for f, _ in feats:
                        self._feature_alphabet.add(f)
            except ValueError:
                sys.exit("Datasource error: make sure you use the right data format.")
            # rewind stream
            try:
                self.rewind()
            except TypeError:
                sys.exit("TypeError: make sure rewind() is used only on files.")
        print >> sys.stderr, " done."
        print >> sys.stderr, "Number of features: %s" % self._feature_alphabet.size()
        return

    def unlock_alphabet(self):
        self._feature_alphabet.unlock()
        return

    def lock_alphabet(self):
        self._feature_alphabet.lock()
        return

    def set_alphabet(self, feature_alphabet):
        self._feature_alphabet = feature_alphabet
        return

    def get_alphabet(self):
        return self._feature_alphabet

    def get_input(self):
        for label, feats in self._parse():
            yield label, feats

    def __iter__(self):
        """ instance generator """
        feature_alphabet = self._feature_alphabet
        assert not (feature_alphabet.empty() and feature_alphabet.locked()), "Feature alphabet is empty!"
        if self._stream_type in ["file", "list"]:
            for idx, (label, feats) in enumerate(self._parse()):
                if not feature_alphabet.locked():  # dynamic feature alphabet
                    for (f, _) in feats:
                        feature_alphabet.add(f)
                instance = self._Instance(idx, label, feats, feature_alphabet)
                yield instance
        elif self._stream_type == "generator":
            for idx, gen_inst in enumerate(self._stream):  # read stream directly
                featvals = gen_inst.get_featvals()
                label = gen_inst.get_label()
                if not feature_alphabet.locked():  # dynamic feature alphabet
                    for (f, _) in featvals:
                        feature_alphabet.add(f)
                instance = self._Instance(idx, label, featvals, feature_alphabet)
                yield instance

    def size(self):
        s = len(list(self._stream))
        self.rewind()
        return s
def build_domain(data):
    """
    Do feature extraction to determine the set of *supported* features, i.e.
    those active in the ground truth configuration and active labels. This
    function maps each feature and label to an integer.
    """
    L = Alphabet()
    A = Alphabet()
    for x in data:
        L.add_many(x.truth)  # add labels to label domain
        # extract features of the target path
        F = x.F
        path = x.truth
        A.add_many(F(0, None, path[0]))
        A.add_many(k for t in xrange(1, x.N) for k in F(t, path[t - 1], path[t]))
    # domains are now ready
    L.freeze()
    A.stop_growth()
    return (L, A)
punct = set(string.punctuation)
# stoplist.update(punct)

# merge inputs to compute word frequencies
_, ext = os.path.splitext(os.path.basename(train))
all_fname = "/tmp/trec-merged" + ext
files = ' '.join([train, dev, test])
subprocess.call("/bin/cat {} > {}".format(files, all_fname), shell=True)

unique_questions, qids, questions, answers, labels = load_data(all_fname, resample=False)

docs = answers + unique_questions
word2dfs = compute_dfs(docs)
print word2dfs.items()[:10]

# map words to ids
alphabet = Alphabet(start_feature_id=0)
alphabet.add('UNKNOWN_WORD_IDX')
add_to_vocab(answers, alphabet)
add_to_vocab(questions, alphabet)
basename = os.path.basename(train)
cPickle.dump(alphabet, open(os.path.join(outdir, 'vocab.pickle'), 'w'))
print "alphabet size=", len(alphabet)

# dump embedding file
dummy_word_idx = alphabet.fid
dump_embedding(outdir, 'embeddings/aquaint+wiki.txt.gz.ndim=50.bin', alphabet)

# summarize max sentence length
q_max_sent_length = max(map(lambda x: len(x), questions))
a_max_sent_length = max(map(lambda x: len(x), answers))
print 'q_max_sent_length', q_max_sent_length
def load_dataset_parsing(train_path, dev_path, test_path, word_column=1, pos_column=4, head_column=6,
                         type_column=7, embedding="word2Vec", embedding_path=None):
    """
    load data from file
    :param train_path: path of training file
    :param dev_path: path of dev file
    :param test_path: path of test file
    :param word_column: the column index of word (start from 0)
    :param pos_column: the column index of pos (start from 0)
    :param head_column: the column index of head (start from 0)
    :param type_column: the column index of types (start from 0)
    :param embedding: embeddings for words, choose from ['word2vec', 'senna'].
    :param embedding_path: path of file storing word embeddings.
    :return: X_train, POS_train, Head_train, Type_train, mask_train,
        X_dev, POS_dev, Head_dev, Type_dev, mask_dev,
        X_test, POS_test, Head_test, Type_test, mask_test,
        embedd_table, word_alphabet, pos_alphabet, type_alphabet,
        C_train, C_dev, C_test, char_embedd_table
    """

    def construct_tensor(word_index_sentences, pos_index_sentences, head_sentences, type_index_sentences):
        X = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        POS = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Head = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        Type = np.empty([len(word_index_sentences), max_length], dtype=np.int32)
        mask = np.zeros([len(word_index_sentences), max_length], dtype=theano.config.floatX)

        for i in range(len(word_index_sentences)):
            word_ids = word_index_sentences[i]
            pos_ids = pos_index_sentences[i]
            heads = head_sentences[i]
            type_ids = type_index_sentences[i]
            length = len(word_ids)
            for j in range(length):
                wid = word_ids[j]
                pid = pos_ids[j]
                head = heads[j]
                tid = type_ids[j]
                X[i, j] = wid
                POS[i, j] = pid - 1
                Head[i, j] = head
                Type[i, j] = tid - 1
            # Zero out X after the end of the sequence
            X[i, length:] = 0
            # Copy the last label after the end of the sequence
            POS[i, length:] = POS[i, length - 1]
            Head[i, length:] = Head[i, length - 1]
            Type[i, length:] = Type[i, length - 1]
            # Make the mask for this sample 1 within the range of length
            mask[i, :length] = 1
        return X, POS, Head, Type, mask

    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    # read training data
    logger.info("Reading data from training set...")
    word_sentences_train, pos_sentences_train, head_sentences_train, type_sentence_train, \
        word_index_sentences_train, pos_index_sentences_train, \
        type_index_sentences_train = read_conll_parsing(train_path, word_alphabet, pos_alphabet, type_alphabet,
                                                        word_column, pos_column, head_column, type_column)

    # read dev data
    logger.info("Reading data from dev set...")
    word_sentences_dev, pos_sentences_dev, head_sentences_dev, type_sentence_dev, \
        word_index_sentences_dev, pos_index_sentences_dev, \
        type_index_sentences_dev = read_conll_parsing(dev_path, word_alphabet, pos_alphabet, type_alphabet,
                                                      word_column, pos_column, head_column, type_column)

    # read test data
    logger.info("Reading data from test set...")
    word_sentences_test, pos_sentences_test, head_sentences_test, type_sentence_test, \
        word_index_sentences_test, pos_index_sentences_test, \
        type_index_sentences_test = read_conll_parsing(test_path, word_alphabet, pos_alphabet, type_alphabet,
                                                       word_column, pos_column, head_column, type_column)

    # close alphabets
    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()

    logger.info("word alphabet size: %d" % (word_alphabet.size() - 1))
    logger.info("pos alphabet size: %d" % (pos_alphabet.size() - 1))
    logger.info("type alphabet size: %d" % (type_alphabet.size() - 1))

    # get maximum length
    max_length_train = get_max_length(word_sentences_train)
    max_length_dev = get_max_length(word_sentences_dev)
    max_length_test = get_max_length(word_sentences_test)
    max_length = min(MAX_LENGTH, max(max_length_train, max_length_dev, max_length_test))
    logger.info("Maximum length of training set is %d" % max_length_train)
    logger.info("Maximum length of dev set is %d" % max_length_dev)
    logger.info("Maximum length of test set is %d" % max_length_test)
    logger.info("Maximum length used for training is %d" % max_length)

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path, word_alphabet,
                                                                       logger)
    logger.info("Dimension of embedding is %d, Caseless: %d" % (embedd_dim, caseless))

    # fill data tensor (X.shape = [#data, max_length], {POS, Head, Type}.shape = [#data, max_length])
    X_train, POS_train, Head_train, Type_train, mask_train = construct_tensor(word_index_sentences_train,
                                                                              pos_index_sentences_train,
                                                                              head_sentences_train,
                                                                              type_index_sentences_train)
    X_dev, POS_dev, Head_dev, Type_dev, mask_dev = construct_tensor(word_index_sentences_dev,
                                                                    pos_index_sentences_dev,
                                                                    head_sentences_dev,
                                                                    type_index_sentences_dev)
    X_test, POS_test, Head_test, Type_test, mask_test = construct_tensor(word_index_sentences_test,
                                                                         pos_index_sentences_test,
                                                                         head_sentences_test,
                                                                         type_index_sentences_test)
    embedd_table = build_embedd_table(word_alphabet, embedd_dict, embedd_dim, caseless)

    C_train, C_dev, C_test, char_embedd_table = generate_character_data(word_sentences_train, word_sentences_dev,
                                                                        word_sentences_test, max_length)

    return X_train, POS_train, Head_train, Type_train, mask_train, \
        X_dev, POS_dev, Head_dev, Type_dev, mask_dev, \
        X_test, POS_test, Head_test, Type_test, mask_test, \
        embedd_table, word_alphabet, pos_alphabet, type_alphabet, \
        C_train, C_dev, C_test, char_embedd_table