Example #1
def convert(source_target_file,
            field_fun,
            lowercase,
            logging_level=None,
            logging_queue=None):
    # TODO: all logging-related code to utils (with annotations)
    source_file, target_file = source_target_file
    logger = setup_queue_logger(logging_level, logging_queue)

    logger.info('Started processing {}'.format(source_file))
    with openall(source_file) as inf, openall(target_file, 'wt') as outf:
        sentence = []
        for line_no, line in enumerate(inf):
            try:
                line = line.strip()
                if len(line) > 0:
                    fields = line.split("\t")
                    if lowercase:
                        fields[WORD] = fields[WORD].lower()
                        fields[LEMMA] = fields[LEMMA].lower()
                    sentence.extend(field_fun(fields))
                else:
                    if len(sentence) > 0:
                        print(u' '.join(sentence), file=outf)
                        sentence = []
            except Exception as e:
                logger.exception('Exception in {}:{}, line `{}: {}`'.format(
                    source_file, line_no, line, e))
                # raise
                return
        if len(sentence) > 0:
            print(u' '.join(sentence), file=outf)
    logger.info('Done processing {}'.format(source_file))
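Every snippet in this collection opens its files through emLam's openall helper, whose implementation is not shown here. A minimal sketch of what such a helper might look like, assuming it simply dispatches on the file extension (the real function may well differ):

import bz2
import gzip
import lzma


def openall_sketch(filename, mode='rt', encoding='utf-8'):
    """Hypothetical stand-in for openall: open a plain or compressed file
    transparently, based on the file extension."""
    if 'b' in mode:
        encoding = None  # binary streams take no encoding argument
    if filename.endswith('.gz'):
        return gzip.open(filename, mode, encoding=encoding)
    elif filename.endswith('.bz2'):
        return bz2.open(filename, mode, encoding=encoding)
    elif filename.endswith('.xz'):
        return lzma.open(filename, mode, encoding=encoding)
    else:
        return open(filename, mode, encoding=encoding)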
Example #2
File: uniq_corpus.py Project: nytud/emLam
def main():
    args = parse_arguments()
    os.nice(20)  # Play nice
    if not os.path.isdir(args.target_dir):
        os.makedirs(args.target_dir)

    logger = setup_stream_logger(args.log_level)
    source_target_files = source_target_file_list(args.source_dir,
                                                  args.target_dir)

    seen = set()
    for sf, tf in source_target_files:
        logger.info('Processing {}...'.format(sf))
        s_read, s_written = 0, 0
        with openall(sf) as inf, openall(tf, 'wt') as outf:
            for sentence in read_conll(inf):
                s_read += 1
                key = u' '.join(map(itemgetter(args.field), sentence))
                if key not in seen:
                    seen.add(key)
                    write_conll(sentence, outf)
                    s_written += 1
        # Delete target file if it consists entirely of duplicate sentences
        if s_written == 0:
            os.remove(tf)
        logger.info('Processed {}: read {} sentences; written {}.'.format(
            sf, s_read, s_written))

    logger.info('Done; found {} unique sentences.'.format(len(seen)))
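The deduplication key is just the selected CoNLL column joined by spaces. A minimal illustration with invented token rows and field 0 (the exact row format produced by read_conll is not shown in these snippets):

from operator import itemgetter

# Assumed rows of (word, lemma, POS); field 0 selects the surface form.
sentence = [('A', 'a', 'DET'), ('kutya', 'kutya', 'NOUN'), ('ugat', 'ugat', 'VERB')]
key = ' '.join(map(itemgetter(0), sentence))
assert key == 'A kutya ugat'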
Example #3
def unk_first(source_files, pos_files, target_files, n):
    """
    Replaces the first n occurrences of each word. Reads the source files in
    a round-robin fashion, so that the distribution is balanced.
    """
    infs = [openall(sf) for sf in source_files]
    outfs = [openall(tf, 'wt') for tf in target_files]
    posfs = [openall(pf) for pf in pos_files]
    seen_it = Counter()
    while len(infs):
        for f in range(len(infs) - 1, -1, -1):
            il = infs[f].readline()
            pl = posfs[f].readline() if posfs[f] else None
            if not il:
                infs[f].close()
                if posfs[f]:
                    posfs[f].close()
                outfs[f].close()
                del infs[f]
                del posfs[f]
                del outfs[f]
            else:
                tokens = il.strip().split()
                ptokens = pl.strip().split() if pl else None
                for i, token in enumerate(tokens):
                    seen_it[token] += 1
                    if seen_it[token] <= n:
                        if ptokens:
                            tokens[i] = '<unk-{}>'.format(ptokens[i].lower())
                        else:
                            tokens[i] = '<unk>'
                print(' '.join(tokens), file=outfs[f])
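As a quick sanity check of the replacement rule: with n = 1, only the first occurrence of each word becomes <unk> (or an <unk-POS> variant when POS files are supplied). A toy trace, not part of the original module:

from collections import Counter

seen_it, n = Counter(), 1
out = []
for token in 'a a b'.split():
    seen_it[token] += 1
    out.append('<unk>' if seen_it[token] <= n else token)
assert out == ['<unk>', 'a', '<unk>']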
Example #4
File: hu_mnsz2.py Project: nytud/emLam
def files_to_streams(self, input_file, output_file):
    if os.path.basename(input_file) == 'off_hu_jrc.xml':
        extractor = self.__extract_word_per_line
    else:
        extractor = self.__extract_text
    with openall(input_file, 'rb') as inf:
        with openall(output_file, 'wt', encoding='utf-8') as outf:
            yield extractor(inf), outf
Example #5
def files_to_streams(self, input_file, output_file):
    """
    Reads input_file according to the corpus format (compressed / not). In
    the former case, modifies the output_file name so that the '.tar' part
    is not included in it.
    """
    with openall(input_file, 'rb') as inf:
        with openall(output_file, 'wt', encoding='utf-8') as outf:
            yield self.parse_xml(inf), outf
Example #6
def __init__(self, file_name, max_lines, wait_for_empty=True):
    self.file_name = file_name
    self.max_lines = max_lines
    self.wait_for_empty = wait_for_empty
    self.index = 1
    self.lines = 0
    self.f = openall(self.__get_file_name(), 'wt')
Example #7
File: data_input.py Project: nytud/emLam
def __iter__(self):
    for q_step in range(len(self.queues[0])):
        infs = [
            openall(self.queues[i][q_step]) for i in range(self.batch_size)
        ]
        arr = np.zeros((self.batch_size, self.num_steps + 1),
                       dtype=self.data_type)
        arr[:, -1:] = np.array(self.__read_from_infs(infs, 1))
        for i in range(self.epoch_size // len(self.queues[0])):
            arr[:, 0] = arr[:, -1]
            arr[:, 1:] = np.array(self.__read_from_infs(infs, self.num_steps))
            if self.one_hot:
                ret = np.zeros(
                    (self.batch_size, self.num_steps, len(self.vocab)),
                    dtype=self.data_type)
                ret[list(np.indices(ret.shape[:-1])) + [arr]] = 1
                # for i in range(ret.shape[0]):
                #     for j in range(ret.shape[1]):
                #         ret[i, j, arr[i, j]] = 1
            else:
                ret = arr
            yield ret[:, :self.num_steps], ret[:, 1:]
        for inf in infs:
            inf.close()
Example #8
def write_output(self, *args):
    self.write_header()
    out_ext = digits_format_str(self.batch_size)
    for i in range(self.batch_size):
        with openall(self.output_prefix + out_ext.format(i), 'wt') as outf:
            for sentence in self.batches[i]:
                print(' '.join(sentence), file=outf)
Example #9
def read_input(self):
    """Reads all the input into memory."""
    # TODO back to the iterator
    ret = []
    for input_file in self.input_files:
        with openall(input_file) as inf:
            ret.extend([line.split() + ['</s>'] for line in inf])
    return ret
Example #10
def count_input(input_files):
    """Counts the token types in the input files."""
    vocab = Counter()
    eos = 0
    for input_file in input_files:
        with openall(input_file) as inf:
            for line in inf:
                eos += 1
                tokens = line.strip().split()
                vocab.update(tokens)
    vocab['</s>'] = eos
    return vocab
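A quick equivalent run over two invented in-memory lines shows how the sentence count ends up under the '</s>' key:

from collections import Counter

lines = ['a b\n', 'a\n']  # stand-ins for the lines openall would yield
vocab, eos = Counter(), 0
for line in lines:
    eos += 1
    vocab.update(line.strip().split())
vocab['</s>'] = eos
assert vocab == Counter({'a': 2, '</s>': 2, 'b': 1})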
Example #11
File: data_input.py Project: nytud/emLam
def data_loader(header,
                batch_size,
                num_steps,
                one_hot=False,
                data_type=np.int32,
                vocab_file=None):
    with openall(header) as inf:
        format, _, data_batches, _, data_len = \
            inf.readline().strip().split('\t')
        if format == 'txt':
            cls = TxtDiskLoader
        elif format == 'int':
            cls = IntMemLoader
        else:
            raise ValueError('unknown data format: {}'.format(format))
    return cls(header, batch_size, num_steps, int(data_len), int(data_batches),
               one_hot, data_type, vocab_file)
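The header consumed here is a single tab-separated line, apparently the same shape that write_header in Example #21 emits (otype, ctype, batch_size, num_sentences, num_tokens). A hypothetical header line and how this function unpacks it (all values invented):

header_line = 'txt\tlower\t10\t52340\t1048576'
format, _, data_batches, _, data_len = header_line.strip().split('\t')
assert (format, int(data_batches), int(data_len)) == ('txt', 10, 1048576)
# 'txt' selects TxtDiskLoader; 'int' would select IntMemLoader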
Example #12
def __new_file(self):
    """
    Opens the next file, resets the line counter and renames all previous
    files if we need a new digit.
    """
    self.f.close()
    digits = int(math.log10(self.index)) + 1
    self.index += 1
    new_digits = int(math.log10(self.index)) + 1
    if new_digits > digits:
        for i in range(1, self.index):
            os.rename(self.__get_file_name(i, digits),
                      self.__get_file_name(i, new_digits))
    self.f = openall(self.__get_file_name(), 'wt')
    self.lines = 0
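__get_file_name itself is not shown in these snippets. A hypothetical zero-padding helper consistent with the digit logic above might look like this; the name and naming scheme are assumptions:

def get_file_name_sketch(prefix, index, digits):
    # e.g. ('corpus', 3, 2) -> 'corpus-03'
    return '{}-{:0{}d}'.format(prefix, index, digits)

# When self.index grows from 9 to 10, digits changes from 1 to 2, so the files
# written so far would be renamed: corpus-1 -> corpus-01, ..., corpus-9 -> corpus-09.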
Example #13
def _replace_file(in_out_files):
    in_file, pos_file, out_file = in_out_files
    if pos_file:
        with openall(in_file) as inf, openall(pos_file) as pinf, \
             openall(out_file, 'wt') as outf:  # noqa
            for line_no, line in enumerate(inf):
                lemmas = line.strip().split()
                poss = pinf.readline().strip().lower().split()
                try:
                    print(' '.join(
                        w if w in keep_words else '<unk-{}>'.format(poss[i])
                        for i, w in enumerate(lemmas)),
                          file=outf)
                except Exception:
                    raise ValueError(
                        'Error in {}/{}:{} --- {}:{} vs {}:{}'.format(
                            in_file, pos_file, line_no, len(lemmas), lemmas,
                            len(poss), poss))
    else:
        with openall(in_file) as inf, openall(out_file, 'wt') as outf:
            for line in inf:
                print(' '.join(w if w in keep_words else '<unk>'
                               for w in line.strip().split()),
                      file=outf)
Example #14
def main():
    source_dir, target_dir, outputs, seed = parse_arguments()
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    data = read_data(source_dir)
    random.seed(seed)
    random.shuffle(data)

    borders = [p if p is not None else 100 for _, p in outputs]
    for i in range(len(borders) - 1, -1, -1):
        for j in range(len(borders) - 1, i, -1):
            borders[j] += borders[i]
    borders = [0] + [int(len(data) * p / 100) for p in borders]

    for i, dataset in enumerate(ds for ds, _ in outputs):
        with openall(os.path.join(target_dir, dataset + '.gz'), 'wt') as outf:
            for l in data[borders[i]:borders[i + 1]]:
                print(l, file=outf)
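The nested loops convert the per-split percentages into cumulative slice boundaries, with None meaning 'everything that is left'. A short trace with invented splits and 1000 sentences (the structure of outputs is inferred from the code, as parse_arguments is not shown):

outputs = [('train', 80), ('valid', 10), ('test', None)]       # invented splits
borders = [p if p is not None else 100 for _, p in outputs]    # [80, 10, 100]
for i in range(len(borders) - 1, -1, -1):
    for j in range(len(borders) - 1, i, -1):
        borders[j] += borders[i]
assert borders == [80, 90, 190]
borders = [0] + [int(1000 * p / 100) for p in borders]         # len(data) == 1000
assert borders == [0, 800, 900, 1900]
# train = data[0:800], valid = data[800:900], test = data[900:1900], i.e. the rest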
Example #15
def read_mappings(mapping_file):
    """
    Reads the replacement mappings. I know JSON is ugly, but for (usually)
    int and float parameters it can be written nicely.
    """
    string_mappings, list_mappings = [], []
    with openall(mapping_file) as inf:
        d = json.load(inf)
        for k, v in sorted(d.items()):
            if isinstance(v, list):
                list_mappings.append((k, v))
            elif isinstance(v, text):
                string_mappings.append((k, v))
            else:
                raise ValueError(
                    'Unsupported value type ({}: {}) for key {}'.format(
                        v, type(v), k))
    if len(list_mappings) == 0:
        raise ValueError(
            'No list replacements found in file {}.'.format(mapping_file))
    return string_mappings, list_mappings
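The mapping file is plain JSON whose values must be strings or lists; anything else is rejected. A made-up example of input that read_mappings would accept:

import json

d = json.loads('{"learning_rate": [0.1, 0.01, 0.001], "optimizer": "sgd"}')
# read_mappings would sort the keys and classify the entries by value type:
#   list_mappings   == [('learning_rate', [0.1, 0.01, 0.001])]
#   string_mappings == [('optimizer', 'sgd')]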
Example #16
def _read():
    for f in os.listdir(source_dir):
        with openall(os.path.join(source_dir, f)) as inf:
            for line in inf:
                yield line.strip()
Example #17
def freqs_dict_to_file(file_name, freqs):
    with openall(file_name, 'wt') as outf:
        outf.write("\n".join("{}\t{}".format(word, freq)
                             for word, freq in sorted(freqs.items())))
Example #18
def freqs_file_to_dict(file_name):
    with openall(file_name) as inf:
        return {
            l[0]: int(l[1])
            for l in map(lambda l: l.split(), map(str.strip, inf))
        }
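freqs_dict_to_file (Example #17) and freqs_file_to_dict are inverses over the tab-separated word/frequency layout; a quick round trip with an invented file name:

freqs = {'alma': 3, 'fa': 1}
freqs_dict_to_file('freqs.tsv', freqs)
assert freqs_file_to_dict('freqs.tsv') == freqs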
Example #19
def _count_file(file_name):
    c = Counter()
    with openall(file_name) as inf:
        for line in inf:
            c.update(line.strip().split())
    return c
Example #20
def read_vocab(vocab_file):
    with openall(vocab_file) as inf:
        return [line.split('\t')[0] for line in inf]
Example #21
def write_header(self, num_tokens=None):
    with openall(self.output_prefix, 'wt') as header:
        print('{}\t{}\t{}\t{}\t{}'.format(
            self.otype, self.ctype, self.batch_size, self.num_sentences(),
            num_tokens or self.aggregate_tokens()),
              file=header)
Example #22
def enumerate_file(corp_file):
    with openall(corp_file, encoding='iso-8859-2') as inf:
        yield inf
Example #23
File: data_input.py Project: nytud/emLam
def read_vocab(vocab_file):
    with openall(vocab_file) as inf:
        return {
            token_freq.split('\t')[0]: i
            for i, token_freq in enumerate(inf.read().strip().split('\n'))
        }
Example #24
File: corpus_base.py Project: nytud/emLam
def outstream(self, output_file):
    if self.max_lines:
        return MultiFileWriter(output_file, self.max_lines)
    else:
        return openall(output_file, 'wt')
Example #25
def __read_vocab(self, vocab_file):
    with openall(vocab_file) as inf:
        return {
            w: i
            for i, w in enumerate(l.strip().split('\t')[0] for l in inf)
        }
Example #26
def write_vocab(vocab, vocab_file):
    """Writes the vocabulary to file."""
    with openall(vocab_file, 'wt') as outf:
        for token, freq in sorted(vocab.items(),
                                  key=lambda tf: (-tf[1], tf[0])):
            print('{}\t{}'.format(token, freq), file=outf)
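write_vocab emits the same token/frequency lines that the read_vocab variants above consume; since the lines are sorted by decreasing frequency, Example #23's reader effectively maps each token to its frequency rank. A small trace with invented counts:

from collections import Counter

vocab = Counter({'b': 7, 'a': 5})
lines = ['{}\t{}'.format(t, f)
         for t, f in sorted(vocab.items(), key=lambda tf: (-tf[1], tf[0]))]
assert lines == ['b\t7', 'a\t5']
# Example #20's read_vocab on such a file -> ['b', 'a']
# Example #23's read_vocab on such a file -> {'b': 0, 'a': 1}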
Example #27
File: corpus_base.py Project: nytud/emLam
def instream(self, input_file):
    return openall(input_file)