def convert(source_target_file, field_fun, lowercase,
            logging_level=None, logging_queue=None):
    # TODO: all logging-related code to utils (with annotations)
    source_file, target_file = source_target_file
    logger = setup_queue_logger(logging_level, logging_queue)
    logger.info('Started processing {}'.format(source_file))
    with openall(source_file) as inf, openall(target_file, 'wt') as outf:
        sentence = []
        for line_no, line in enumerate(inf):
            try:
                line = line.strip()
                if len(line) > 0:
                    fields = line.split("\t")
                    if lowercase:
                        fields[WORD] = fields[WORD].lower()
                        fields[LEMMA] = fields[LEMMA].lower()
                    sentence.extend(field_fun(fields))
                else:
                    if len(sentence) > 0:
                        print(u' '.join(sentence), file=outf)
                    sentence = []
            except Exception as e:
                logger.exception('Exception in {}:{}, line `{}`: {}'.format(
                    source_file, line_no, line, e))
                # raise
                return
        if len(sentence) > 0:
            print(u' '.join(sentence), file=outf)
    logger.info('Done processing {}'.format(source_file))
def main():
    args = parse_arguments()
    os.nice(20)  # Play nice
    if not os.path.isdir(args.target_dir):
        os.makedirs(args.target_dir)
    logger = setup_stream_logger(args.log_level)
    source_target_files = source_target_file_list(args.source_dir,
                                                  args.target_dir)
    seen = set()
    for sf, tf in source_target_files:
        logger.info('Processing {}...'.format(sf))
        s_read, s_written = 0, 0
        with openall(sf) as inf, openall(tf, 'wt') as outf:
            for sentence in read_conll(inf):
                s_read += 1
                key = u' '.join(map(itemgetter(args.field), sentence))
                if key not in seen:
                    seen.add(key)
                    write_conll(sentence, outf)
                    s_written += 1
        # Delete target file if it consists entirely of duplicate sentences
        if s_written == 0:
            os.remove(tf)
        logger.info('Processed {}: read {} sentences; written {}.'.format(
            sf, s_read, s_written))
    logger.info('Done; found {} unique sentences.'.format(len(seen)))
def unk_first(source_files, pos_files, target_files, n):
    """
    Replaces the first n occurrences of each word. Reads the source files in
    a round-robin fashion, so that the distribution is balanced.
    """
    infs = [openall(sf) for sf in source_files]
    outfs = [openall(tf, 'wt') for tf in target_files]
    posfs = [openall(pf) for pf in pos_files]
    seen_it = Counter()
    while len(infs):
        for f in range(len(infs) - 1, -1, -1):
            il = infs[f].readline()
            pl = posfs[f].readline() if posfs[f] else None
            if not il:
                infs[f].close()
                if posfs[f]:
                    posfs[f].close()
                outfs[f].close()
                del infs[f]
                del posfs[f]
                del outfs[f]
            else:
                tokens = il.strip().split()
                ptokens = pl.strip().split() if pl else None
                for i, token in enumerate(tokens):
                    seen_it[token] += 1
                    if seen_it[token] <= n:
                        if ptokens:
                            tokens[i] = '<unk-{}>'.format(ptokens[i].lower())
                        else:
                            tokens[i] = '<unk>'
                print(' '.join(tokens), file=outfs[f])
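# Usage sketch for unk_first (the file names below are made-up examples, not
# part of the original code): replace the first two occurrences of every word
# across two shards, marking each replacement with the lower-cased POS tag
# read from the parallel .pos files, e.g. 'dog' -> '<unk-noun>'.
#
#     unk_first(['shard1.txt', 'shard2.txt'],
#               ['shard1.pos', 'shard2.pos'],
#               ['shard1.unked.txt', 'shard2.unked.txt'], 2)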
def files_to_streams(self, input_file, output_file):
    if os.path.basename(input_file) == 'off_hu_jrc.xml':
        extractor = self.__extract_word_per_line
    else:
        extractor = self.__extract_text
    with openall(input_file, 'rb') as inf:
        with openall(output_file, 'wt', encoding='utf-8') as outf:
            yield extractor(inf), outf
def files_to_streams(self, input_file, output_file):
    """
    Reads input_file according to the corpus format (compressed / not). In
    the former case, modifies the output_file name so that the '.tar' part
    is not included in it.
    """
    with openall(input_file, 'rb') as inf:
        with openall(output_file, 'wt', encoding='utf-8') as outf:
            yield self.parse_xml(inf), outf
def __init__(self, file_name, max_lines, wait_for_empty=True):
    self.file_name = file_name
    self.max_lines = max_lines
    self.wait_for_empty = wait_for_empty
    self.index = 1
    self.lines = 0
    self.f = openall(self.__get_file_name(), 'wt')
def __iter__(self):
    for q_step in range(len(self.queues[0])):
        infs = [
            openall(self.queues[i][q_step]) for i in range(self.batch_size)
        ]
        arr = np.zeros((self.batch_size, self.num_steps + 1),
                       dtype=self.data_type)
        arr[:, -1:] = np.array(self.__read_from_infs(infs, 1))
        for i in range(self.epoch_size // len(self.queues[0])):
            arr[:, 0] = arr[:, -1]
            arr[:, 1:] = np.array(self.__read_from_infs(infs, self.num_steps))
            if self.one_hot:
                # The one-hot buffer needs num_steps + 1 time columns so that
                # it matches arr and the input / target slices below line up.
                ret = np.zeros(
                    (self.batch_size, self.num_steps + 1, len(self.vocab)),
                    dtype=self.data_type)
                ret[tuple(np.indices(ret.shape[:-1])) + (arr,)] = 1
                # for i in range(ret.shape[0]):
                #     for j in range(ret.shape[1]):
                #         ret[i, j, arr[i, j]] = 1
            else:
                ret = arr
            yield ret[:, :self.num_steps], ret[:, 1:]
        for inf in infs:
            inf.close()
def write_output(self, *args):
    self.write_header()
    out_ext = digits_format_str(self.batch_size)
    for i in range(self.batch_size):
        with openall(self.output_prefix + out_ext.format(i), 'wt') as outf:
            for sentence in self.batches[i]:
                print(' '.join(sentence), file=outf)
def read_input(self):
    """Reads all the input into memory."""
    # TODO back to the iterator
    ret = []
    for input_file in self.input_files:
        with openall(input_file) as inf:
            ret.extend([line.split() + ['</s>'] for line in inf])
    return ret
def count_input(input_files):
    """Counts the token types in the input files."""
    vocab = Counter()
    eos = 0
    for input_file in input_files:
        with openall(input_file) as inf:
            for line in inf:
                eos += 1
                tokens = line.strip().split()
                vocab.update(tokens)
    vocab['</s>'] = eos
    return vocab
def data_loader(header, batch_size, num_steps, one_hot=False,
                data_type=np.int32, vocab_file=None):
    with openall(header) as inf:
        # Single tab-separated header line (cf. write_header); only the data
        # format (1st field), number of batches (3rd) and data length (5th)
        # are used here.
        format, _, data_batches, _, data_len = inf.readline().strip().split(
            '\t')
    if format == 'txt':
        cls = TxtDiskLoader
    elif format == 'int':
        cls = IntMemLoader
    else:
        raise ValueError('Unknown data format {}'.format(format))
    return cls(header, batch_size, num_steps, int(data_len),
               int(data_batches), one_hot, data_type, vocab_file)
def __new_file(self):
    """
    Opens the next file, resets the line counter and renames all previous
    files if we need a new digit.
    """
    self.f.close()
    digits = int(math.log10(self.index)) + 1
    self.index += 1
    new_digits = int(math.log10(self.index)) + 1
    if new_digits > digits:
        for i in range(1, self.index):
            os.rename(self.__get_file_name(i, digits),
                      self.__get_file_name(i, new_digits))
    self.f = openall(self.__get_file_name(), 'wt')
    self.lines = 0
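# Illustration of the renaming step in __new_file (the exact name pattern
# produced by __get_file_name() is an assumption here): when self.index grows
# from 9 to 10, the numeric suffix gains a digit, so the files written so far
# are renamed to keep their zero-padded names lexicographically sorted, e.g.
#     prefix.1 ... prefix.9  ->  prefix.01 ... prefix.09
# and the next file is opened as prefix.10.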
def _replace_file(in_out_files):
    in_file, pos_file, out_file = in_out_files
    if pos_file:
        with openall(in_file) as inf, openall(pos_file) as pinf, \
                openall(out_file, 'wt') as outf:  # noqa
            for line_no, line in enumerate(inf):
                lemmas = line.strip().split()
                poss = pinf.readline().strip().lower().split()
                try:
                    print(' '.join(
                        w if w in keep_words else '<unk-{}>'.format(poss[i])
                        for i, w in enumerate(lemmas)), file=outf)
                except:
                    raise ValueError(
                        'Error in {}/{}:{} --- {}:{} vs {}:{}'.format(
                            in_file, pos_file, line_no,
                            len(lemmas), lemmas, len(poss), poss))
    else:
        with openall(in_file) as inf, openall(out_file, 'wt') as outf:
            for line in inf:
                print(' '.join(w if w in keep_words else '<unk>'
                               for w in line.strip().split()), file=outf)
def main():
    source_dir, target_dir, outputs, seed = parse_arguments()
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    data = read_data(source_dir)
    random.seed(seed)
    random.shuffle(data)
    borders = [p if p is not None else 100 for _, p in outputs]
    for i in range(len(borders) - 1, -1, -1):
        for j in range(len(borders) - 1, i, -1):
            borders[j] += borders[i]
    borders = [0] + [int(len(data) * p / 100) for p in borders]
    for i, dataset in enumerate(ds for ds, _ in outputs):
        with openall(os.path.join(target_dir, dataset + '.gz'), 'wt') as outf:
            for l in data[borders[i]:borders[i + 1]]:
                print(l, file=outf)
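# Worked example of the split logic above, with illustrative outputs (the
# names and percentages are not from the original code):
# outputs = [('train', 80), ('valid', 10), ('test', None)]. The per-split
# percentages [80, 10, 100] are turned into cumulative borders [80, 90, 190]
# by the nested loops, then scaled to indices [0, 0.8*N, 0.9*N, 1.9*N] for N
# shuffled sentences. Python slicing clamps the last border to N, so the
# 'test' split (percentage None) simply receives everything after the first
# 90% of the data.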
def read_mappings(mapping_file):
    """
    Reads the replacement mappings. I know JSON is ugly, but for (usually)
    int and float parameters it can be written nicely.
    """
    string_mappings, list_mappings = [], []
    with openall(mapping_file) as inf:
        d = json.load(inf)
    for k, v in sorted(d.items()):
        if isinstance(v, list):
            list_mappings.append((k, v))
        elif isinstance(v, text):
            string_mappings.append((k, v))
        else:
            raise ValueError(
                'Unsupported value type ({}: {}) for key {}'.format(
                    v, type(v), k))
    if len(list_mappings) == 0:
        raise ValueError(
            'No list replacements found in file {}.'.format(mapping_file))
    return string_mappings, list_mappings
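# A minimal sketch of the mapping file read_mappings() accepts (the keys and
# values are invented for illustration): string values become "string
# mappings", list values become "list mappings", anything else is rejected.
#
#     {"optimizer": "sgd", "learning_rate": ["0.1", "0.01", "0.001"]}
#
# would return ([('optimizer', 'sgd')],
#               [('learning_rate', ['0.1', '0.01', '0.001'])]).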
def _read():
    for f in os.listdir(source_dir):
        with openall(os.path.join(source_dir, f)) as inf:
            for line in inf:
                yield line.strip()
def freqs_dict_to_file(file_name, freqs):
    with openall(file_name, 'wt') as outf:
        outf.write("\n".join("{}\t{}".format(word, freq)
                             for word, freq in sorted(freqs.items())))
def freqs_file_to_dict(file_name):
    with openall(file_name) as inf:
        return {
            l[0]: int(l[1])
            for l in map(lambda l: l.split(), map(str.strip, inf))
        }
def _count_file(file_name):
    c = Counter()
    with openall(file_name) as inf:
        for line in inf:
            c.update(line.strip().split())
    return c
def read_vocab(vocab_file):
    with openall(vocab_file) as inf:
        return [line.split('\t')[0] for line in inf]
def write_header(self, num_tokens=None):
    with openall(self.output_prefix, 'wt') as header:
        print('{}\t{}\t{}\t{}\t{}'.format(
            self.otype, self.ctype, self.batch_size, self.num_sentences(),
            num_tokens or self.aggregate_tokens()), file=header)
def enumerate_file(corp_file):
    with openall(corp_file, encoding='iso-8859-2') as inf:
        yield inf
def read_vocab(vocab_file):
    with openall(vocab_file) as inf:
        return {
            token_freq.split('\t')[0]: i
            for i, token_freq in enumerate(inf.read().strip().split('\n'))
        }
def outstream(self, output_file):
    if self.max_lines:
        return MultiFileWriter(output_file, self.max_lines)
    else:
        return openall(output_file, 'wt')
def __read_vocab(self, vocab_file):
    with openall(vocab_file) as inf:
        return {
            w: i for i, w in enumerate(l.strip().split('\t')[0] for l in inf)
        }
def write_vocab(vocab, vocab_file):
    """Writes the vocabulary to file."""
    with openall(vocab_file, 'wt') as outf:
        for token, freq in sorted(vocab.items(),
                                  key=lambda tf: (-tf[1], tf[0])):
            print('{}\t{}'.format(token, freq), file=outf)
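# The vocabulary file written here is one 'token<TAB>frequency' line per
# token, ordered by decreasing frequency with ties broken alphabetically.
# Assuming the read_vocab() / __read_vocab() helpers above consume files in
# this format, enumerating the lines gives each token an index in frequency
# order.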
def instream(self, input_file):
    return openall(input_file)