def test_char_tokenizer(input_path, output_path, longest_token=None):
    tokenizer = Tokenizer(nb_words=None,
                          bos_str='-NULL-',
                          eos_str=None,
                          mode='chars',
                          longest_token=longest_token)
    tokenizer.fit_one(smart_ropen(input_path))
    seqs = tokenizer.to_sequences(smart_ropen(input_path), dtype='int32')
    with open(output_path, 'w') as fo:
        tokenizer.write_as_text(seqs, output_stream=fo)
    return tokenizer
def test():
    path1 = '/home/wferrei1/github/dgm4nlp/data/en-fr/trial.en-fr.en'
    path2 = '/home/wferrei1/github/dgm4nlp/data/en-fr/trial.en-fr.fr'
    print('Fitting tokenizer')
    tok1 = nlputils.Tokenizer(nb_words=None, bos_str='-NULL-', eos_str=None, mode='chars', longest_token=10)
    tok1.fit_one(smart_ropen(path1))
    print(tok1.vocab_size())
    tok2 = nlputils.Tokenizer(nb_words=None, bos_str=None, eos_str=None, mode='chars', longest_token=12)
    tok2.fit_one(smart_ropen(path2))
    print(tok2.vocab_size())
    print('Memory mapping data')
    text = Multitext3D([path1, path2],
                       [tok1, tok2],
                       shortest=[2, 2],
                       longest=[20, 20],
                       trim=[True, True],
                       batch_dtype='int32',
                       mask_dtype='bool')
    print(text.nb_samples(), text.longest_sequence(0), text.deepest_sequence(0))
    print(text.nb_samples(), text.longest_sequence(1), text.deepest_sequence(1))
    import sys
    for i, (xm, ym) in enumerate(
            text.batch_iterator(1, dynamic_sequence_length=True, dynamic_sequence_depth=True)):
        x, m1 = xm
        x, m1 = x[0], m1[0]
        y, m2 = ym
        y, m2 = y[0], m2[0]
        print(x)
        print(m1)
        print(y)
        print(m2)
        print()
        tok1.write_as_text([x], sys.stdout)
        tok2.write_as_text([y], sys.stdout)
        if i == 1:
            break
def construct_mmap(input_path, output_path, tokenizer, selection, nb_tokens, dtype):
    """
    Construct memory map for selected sentences in a corpus.

    :param input_path: path to text
    :param output_path: path to memory map file
    :param tokenizer: tokenizer for text
    :param selection: array of binary selectors
    :param nb_tokens: total number of tokens in the selected corpus
    :param dtype: data type for memmap
    :return: np.array with shape (nb_samples,) where array[i] is the length of the ith sequence
    """
    # construct memory mapped array
    mmap = np.memmap(output_path, dtype=dtype, mode='w+', shape=nb_tokens)

    # prepare for populating memmap
    offset = 0
    sample_length = []

    # populate memory map
    for sid, seq in enumerate(tokenizer.to_sequences_iterator(smart_ropen(input_path))):
        if not selection[sid]:  # skip sentences that do not comply with length constraints
            continue
        # here we have a valid sequence, thus we memory map it
        mmap[offset:offset + seq.shape[0]] = seq
        offset += seq.shape[0]
        sample_length.append(seq.shape[0])

    del mmap

    return np.array(sample_length, dtype='int64')
def bound_length(input_paths, tokenizers, shortest, longest):
    """
    Return an np.array which flags whether all parallel segments comply with length constraints
    and count the number of tokens in each stream (considering valid sequences only).

    :param input_paths: paths (list/tuple) to each part of the parallel collection
    :param tokenizers: list/tuple of tokenizers
    :param shortest: shortest valid sequence for each part of the parallel collection
    :param longest: longest valid sequence for each part of the parallel collection
    :return: selection (nb_samples,) and counts (nb_streams,)
    """
    # get an iterator for each stream
    nb_streams = len(input_paths)
    iterators = [tokenizers[i].to_sequences_iterator(smart_ropen(input_paths[i])) for i in range(nb_streams)]

    # length checks
    selection = []
    nb_tokens = [0] * nb_streams
    for seqs in zip(*iterators):  # get a sequence from each iterator
        # check if every sequence complies with its respective length bounds
        if not all(lower <= seq.shape[0] <= upper for lower, upper, seq in zip(shortest, longest, seqs)):
            selection.append(False)  # excluded
        else:
            selection.append(True)  # included
            # increase token count
            for i, seq in enumerate(seqs):
                nb_tokens[i] += seq.shape[0]

    return np.array(selection, dtype=bool), np.array(nb_tokens, dtype='int64')
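# A minimal usage sketch (not part of the original module) of how `bound_length` and
# `construct_mmap` fit together: flag the sentence pairs that satisfy the length bounds,
# then memory-map only the surviving sequences of each stream. The file paths and bounds
# below are hypothetical placeholders.
def example_mmap_parallel_corpus(x_path='corpus.en', y_path='corpus.fr'):
    # fit one tokenizer per stream
    tks = [Tokenizer(), Tokenizer()]
    tks[0].fit_one(smart_ropen(x_path))
    tks[1].fit_one(smart_ropen(y_path))
    # flag sentence pairs whose lengths fall within [1, 50] on both sides
    selection, nb_tokens = bound_length([x_path, y_path], tks, shortest=[1, 1], longest=[50, 50])
    # memory-map each stream, keeping only the selected pairs
    x_lengths = construct_mmap(x_path, 'corpus.en.mmap', tks[0], selection, nb_tokens[0], dtype='int64')
    y_lengths = construct_mmap(y_path, 'corpus.fr.mmap', tks[1], selection, nb_tokens[1], dtype='int64')
    return x_lengths, y_lengths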
def _load_cand(cand_file):
    """Load candidates from a file with one 'key::cand1;cand2;...' entry per line."""
    candidates = dict()
    with smart_ropen(cand_file) as cf:
        for line in cf:
            (key, cand) = line.strip().split('::')
            #(word, _) = key.split('.')
            cols = cand.split(';')
            candidates[key] = cols
    return candidates
def _load_test(test_file):
    """Load test instances from a tab-separated file: key, sentence id, word id, sentence."""
    keys = []
    sids = []
    wids = []
    test_sent = dict()
    with smart_ropen(test_file) as testf:
        for line in testf:
            (key, sid, wid, sent) = line.strip().split('\t')
            keys.append(key)
            sids.append(int(sid))
            wids.append(int(wid))
            test_sent[int(sid)] = sent.split(' ')
    return keys, sids, wids, test_sent
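# A hedged illustration (not from the original module) of the candidate-file format that
# `_load_cand` expects, inferred from the parsing code above; the contents are made up.
def example_load_cand(tmp_path='cand.example.txt'):
    with open(tmp_path, 'w') as fo:
        fo.write('bright.a::intelligent;clever;luminous\n')
        fo.write('bank.n::shore;slope\n')
    candidates = _load_cand(tmp_path)
    # candidates['bright.a'] == ['intelligent', 'clever', 'luminous']
    return candidates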
def read_naacl_distributions(naacl_path, x_lengths, y_lengths):
    """
    Read NAACL-formatted alignment files and convert them to alignment distributions.

    :param naacl_path: path to file
    :param x_lengths: length of each input (x) sequence, including the NULL token at position 0
    :param y_lengths: length of each output (y) sequence
    :return: a list of np.array objects, one per sentence pair, each with shape (y_len, x_len),
        where entry [j, i] is the normalised probability that output position j aligns to input position i
        sentences start from 1 and a NULL token is indicated with position 0
    """
    with smart_ropen(naacl_path) as fi:
        data = []
        current = None
        for i, line in enumerate(fi.readlines()):
            fields = line.split()
            if not fields:
                continue
            prob = 1.0  # by default we assume prob 1.0
            if len(fields) < 3:
                raise ValueError('Missing required fields in line %d: %s' % (i, line.strip()))
            snt_id, x, y = int(fields[0]), int(fields[1]), int(fields[2])
            if len(fields) == 5:
                prob = float(fields[4])
            if len(fields) == 4:
                if fields[3] not in {'S', 'P'}:
                    prob = float(fields[3])
            if current is None or snt_id != current:
                data.append(defaultdict(list))
                current = snt_id
            # make y 0-based
            # x is already 0-based (where 0 points to NULL)
            data[-1][y - 1].append((x, prob))

    distributions = []
    for ainfo, x_len, y_len in zip(data, x_lengths, y_lengths):
        dist = np.zeros((y_len, x_len), dtype=float)
        for y in range(y_len):
            if y not in ainfo:
                dist[y, 0] = 1.  # align it to NULL
            else:
                for x, prob in ainfo[y]:
                    dist[y, x] += prob
        normalisers = dist.sum(-1)
        #normalisers = np.where(np.not_equal(normalisers, 0), normalisers, np.ones(y_len))
        dist /= np.expand_dims(normalisers, 1)
        distributions.append(dist)
        #print(y_len, x_len, dist)
    return distributions
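# A small sketch (the file content and lengths below are made up) of how
# `read_naacl_distributions` turns 'sentence-id x y [S|P] [prob]' lines into per-sentence
# alignment distributions; output positions with no link get all their mass on NULL (x=0).
def example_naacl_distributions(tmp_path='alignments.example.naacl'):
    with open(tmp_path, 'w') as fo:
        fo.write('1 1 1\n')  # sentence 1: input position 1 aligns to output position 1
        fo.write('1 2 3\n')  # sentence 1: input position 2 aligns to output position 3
    # the x sentence has 3 positions (NULL plus 2 words), the y sentence has 3 words
    dists = read_naacl_distributions(tmp_path, x_lengths=[3], y_lengths=[3])
    # dists[0] has shape (3, 3); its second row (y=2) is aligned to NULL since no link was given
    return dists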
def prepare_training(x_path,
                     # data pre-processing
                     nb_words=None,
                     shortest_sequence=None,
                     longest_sequence=None,
                     # padding
                     bos_str=None,
                     eos_str=None,
                     # normalisation
                     lowercase=False,
                     name='training') -> [Tokenizer, Text]:
    """
    Construct vocabularies/tokenizers and memory-map the training data.

    :param x_path: path to the training text
    :param nb_words: maximum vocabulary size (None keeps every word)
    :param shortest_sequence: shortest valid sequence
    :param longest_sequence: longest valid sequence
    :param bos_str: beginning-of-sequence token (or None)
    :param eos_str: end-of-sequence token (or None)
    :param lowercase: whether to lowercase the text
    :param name: name of the corpus
    :return: fitted Tokenizer and memory-mapped Text
    """
    # Prepare vocabularies
    logging.info('Fitting vocabulary')
    tk = Tokenizer(nb_words=nb_words, bos_str=bos_str, eos_str=eos_str, lowercase=lowercase)
    tk.fit_one(smart_ropen(x_path))
    logging.info(' vocab-size=%d', tk.vocab_size())

    # Prepare training corpus
    logging.info('Memory mapping training data')
    training = Text(x_path,
                    tokenizer=tk,
                    shortest=shortest_sequence,
                    longest=longest_sequence,
                    trim=True,
                    mask_dtype='float32',
                    name=name)
    # in case the longest sequence was shorter than we thought
    longest_sequence = training.longest_sequence()
    logging.info(' training-samples=%d longest=%s tokens=%s',
                 training.nb_samples(), longest_sequence, training.nb_tokens())
    return tk, training
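# A minimal usage sketch for `prepare_training` (not part of the original module); the corpus
# path and hyperparameters below are hypothetical placeholders.
def example_prepare_training():
    tk, training = prepare_training('data/train.en',
                                    nb_words=30000,
                                    shortest_sequence=1,
                                    longest_sequence=50,
                                    bos_str='-NULL-',
                                    eos_str=None,
                                    lowercase=True,
                                    name='train-en')
    # tk is the fitted Tokenizer; training is a memory-mapped Text whose batches
    # can be drawn with training.batch_iterator(...)
    return tk, training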
def test_text(input_path, output_path):
    """
    Test the reconstruction of a corpus by passing it through the Tokenizer/Text pipeline.

    Example:
        text.test_text('data/en-fr/test.en-fr.en', 'data/en-fr/test.en-fr.en-mono')

    :param input_path: a text file
    :param output_path: where to save its reconstruction
    """
    tk = Tokenizer()
    tk.fit_one(smart_ropen(input_path))
    text = Text(input_path, tk)
    with open(output_path, 'w') as fo:
        for b, m in text.batch_iterator(100, shorter_batch='trim'):
            tk.write_as_text(b, fo)
    return text
def read_naacl_alignments(path, reverse=False):
    """
    Read NAACL-formatted alignment files.

    :param path: path to file
    :param reverse: reverse links (that is, if input is x-y, output becomes y-x)
    :return: a list of pairs [sure set, possible set]
        each entry in the set maps an input position to an output position
        sentences start from 1 and a NULL token is indicated with position 0
    """
    with smart_ropen(path) as fi:
        ainfo = {}
        for i, line in enumerate(fi.readlines()):
            fields = line.split()
            if not fields:
                continue
            sure = True  # by default we assume Sure links
            prob = 1.0  # by default we assume prob 1.0
            if len(fields) < 3:
                raise ValueError('Missing required fields in line %d: %s' % (i, line.strip()))
            snt_id, x, y = int(fields[0]), int(fields[1]), int(fields[2])
            if x == 0 or y == 0:  # we ignore NULL links
                continue
            if reverse:
                x, y = y, x
            if len(fields) == 5:
                sure = fields[3] == 'S'
                prob = float(fields[4])
            if len(fields) == 4:
                if fields[3] in {'S', 'P'}:
                    sure = fields[3] == 'S'
                else:
                    prob = float(fields[3])
            snt_info = ainfo.get(snt_id, None)
            if snt_info is None:
                snt_info = [set(), set()]  # S and P sets
                ainfo[snt_id] = snt_info
            if sure:  # Note that S links are also P links: http://dl.acm.org/citation.cfm?id=992810
                snt_info[0].add((x, y))
                snt_info[1].add((x, y))
            else:
                snt_info[1].add((x, y))
    return tuple(v for k, v in sorted(ainfo.items(), key=lambda pair: pair[0]))
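# A hedged sketch of what `read_naacl_alignments` returns: one [sure set, possible set] pair
# of links per sentence. The file content below is made up for illustration only.
def example_naacl_alignments(tmp_path='alignments.example.naacl'):
    with open(tmp_path, 'w') as fo:
        fo.write('1 1 1 S\n')  # sure link x=1 -> y=1
        fo.write('1 2 2 P\n')  # possible link x=2 -> y=2
        fo.write('2 1 2\n')    # no flag: treated as a sure link with prob 1.0
    alignments = read_naacl_alignments(tmp_path)
    # alignments[0] == [{(1, 1)}, {(1, 1), (2, 2)}]  (S links are also P links)
    # alignments[1] == [{(1, 2)}, {(1, 2)}]
    return alignments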
def prepare_training3d(x_path, y_path,
                       # data pre-processing
                       nb_chars=[None, None],
                       longest_word=[None, None],
                       shortest_sequence=[None, None],
                       longest_sequence=[None, None],
                       # padding
                       bos_str=[None, None],
                       eos_str=[None, None],
                       # normalisation
                       lowercase=False,
                       batch_dtype='int32',
                       mask_dtype='bool',
                       name='training') -> [list, Multitext3D]:
    """
    Construct (character-level) vocabularies/tokenizers and memory-map the training data.

    :param x_path: path to the x-side of the parallel corpus
    :param y_path: path to the y-side of the parallel corpus
    :param nb_chars: maximum (char) vocabulary size per stream
    :param longest_word: longest word (in characters) per stream
    :param shortest_sequence: shortest valid sequence per stream
    :param longest_sequence: longest valid sequence per stream
    :param bos_str: beginning-of-sequence token per stream (or None)
    :param eos_str: end-of-sequence token per stream (or None)
    :param lowercase: whether to lowercase the text
    :param batch_dtype: dtype of batches
    :param mask_dtype: dtype of masks
    :param name: name of the corpus
    :return: list of fitted tokenizers and memory-mapped Multitext3D
    """
    training_paths = [x_path, y_path]

    # Prepare vocabularies
    logging.info('Fitting (char) vocabularies')
    tks = []
    for i, (path, vs, bos, eos, longword) in enumerate(
            zip(training_paths, nb_chars, bos_str, eos_str, longest_word)):
        logging.info(' stream=%d', i)
        # tokenizer with a bounded vocabulary
        tks.append(Tokenizer(nb_words=vs,
                             bos_str=bos,
                             eos_str=eos,
                             lowercase=lowercase,
                             mode='chars',
                             longest_token=longword))
        tks[-1].fit_one(smart_ropen(path))
        logging.info('  (char) vocab-size=%d', tks[-1].vocab_size())

    # Prepare training corpus
    logging.info('Memory mapping (char) training data')
    training = Multitext3D(training_paths,
                           tokenizers=tks,
                           shortest=shortest_sequence,
                           longest=longest_sequence,
                           trim=[True, True],
                           batch_dtype=batch_dtype,
                           mask_dtype=mask_dtype,
                           name=name)
    # in case the longest sequence was shorter than we thought
    longest_sequence = [training.longest_sequence(0), training.longest_sequence(1)]
    deepest_sequence = [training.deepest_sequence(0), training.deepest_sequence(1)]
    logging.info(' training-samples=%d longest=%s deepest=%s tokens=%s',
                 training.nb_samples(), longest_sequence, deepest_sequence,
                 [training.nb_tokens(0), training.nb_tokens(1)])
    return tks, training
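# A minimal usage sketch for `prepare_training3d` (paths and settings are hypothetical);
# it mirrors `prepare_training` but fits one character-level tokenizer per stream.
def example_prepare_training3d():
    tks, training = prepare_training3d('data/train.en', 'data/train.fr',
                                       nb_chars=[None, None],
                                       longest_word=[10, 12],
                                       shortest_sequence=[1, 1],
                                       longest_sequence=[50, 50],
                                       bos_str=['-NULL-', None],
                                       eos_str=[None, None],
                                       lowercase=False,
                                       name='train-en-fr')
    # tks is a list with one char-level Tokenizer per stream; training is a Multitext3D
    # whose batches carry both a sequence-length mask and a depth (character) dimension
    return tks, training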