Exemplo n.º 1
0
def main(fin_names, fout_name, split_num):
  """
  make training format for CRFsuite

  Every output file is opened for writing up front and is always closed before
  this function returns, even when processing fails part-way.
  :param  fin_names:  list of input files
  :param  fout_name:  output file path; when split_num > 1 the outputs are named
                      '<fout_name>.<idx>' for idx in range(split_num)
  :param  split_num:  split number of output file
  """
  if split_num > 1:
    out_paths = ['%s.%d' % (fout_name, idx) for idx in range(split_num)]
  else:
    out_paths = [fout_name]

  fouts = []
  try:
    # open inside the try block so a failure on a later file still closes the earlier ones
    for out_path in out_paths:
      fouts.append(open(out_path, 'w'))
    # one size counter per output file, handed to select_file together with the files
    sizes = [0] * len(fouts)

    for sent in sejong_corpus.load(IS_SPOKEN, fin_names):
      if not sent.is_good_tags():
        logging.error('Invalid tag in sentence: %s', sent)
        continue
      sent_pairs = []
      for word in sent.words:
        try:
          word_pairs = sejong_align.align(word)
        except sejong_align.AlignError:
          # a single word that cannot be aligned discards the whole sentence
          sent_pairs = []
          break
        else:
          sent_pairs.append(word_pairs)
      if not sent_pairs:
        continue
      # the sentence size is the total UTF-8 byte length of its surface forms
      surfaces = [surface.encode('UTF-8') for pairs in sent_pairs for surface, _ in pairs]
      new_size = sum(len(surface) for surface in surfaces)
      print_aligned(select_file(fouts, sizes, new_size), sent_pairs)
  finally:
    for fout in fouts:
      fout.close()
Exemplo n.º 2
0
def main(fin_names, fout):
    """
    make word(EoJeol) per line formatted corpus from Sejong tagged corpus

    Each sentence loaded from the input files is printed to the output file,
    followed by one empty line.
    NOTE(review): the word-per-line layout comes from the sentence object's string
    form, which is defined outside this function -- confirm there.
    :param  fin_names:  list of input files
    :param  fout:       output file
    """
    for sent in sejong_corpus.load(IS_SPOKEN, fin_names):
        # Python 2 print statement redirected to fout
        print >> fout, sent
        # empty line after every sentence
        print >> fout
Exemplo n.º 3
0
def main(fin_names, fout):
  """
  Align the words (EoJeol) of every sentence and print the aligned sentences.

  Each word of a sentence is passed to sejong_align.align and the returned pairs
  are gathered into one flat list per sentence. If any word raises
  sejong_align.AlignError, the whole sentence is skipped. Sentences that yield
  no pairs are skipped as well.
  :param  fin_names:   list of input files
  :param  fout:        output file
  """
  for sentence in sejong_corpus.load(IS_SPOKEN, fin_names):
    aligned = []
    for word in sentence.words:
      try:
        pairs = sejong_align.align(word)
      except sejong_align.AlignError:
        # leaving the loop through break bypasses the else clause, so nothing is printed
        break
      aligned.extend(pairs)
    else:
      # reached only when every word was aligned without error
      if aligned:
        print_aligned(fout, aligned)