Пример #1
0
  def _fill_lang(self, tuv, seg):
    """Build a flat source/target record from a pair of tuv and seg items.

    Index 0 of ``tuv`` and ``seg`` is used as the source side and index 1 as
    the target side.

    Returns a dict with the keys ``source_language``, ``target_language``,
    ``source_text``, ``target_text``, ``source_metadata`` and
    ``target_metadata``.
    """
    def _utf8_checked(text):
      # Byte strings are round-tripped through UTF-8: the value stays bytes,
      # but undecodable input raises UnicodeDecodeError here.
      if isinstance(text, bytes):
        return text.decode('utf8').encode('utf8')
      return text

    # The literal is evaluated top to bottom, so the helper calls happen in
    # the order: languages, texts, metadata (source before target each time).
    record = {
      'source_language': TMUtils.lang2short(self._get_lang(tuv[0])),
      'target_language': TMUtils.lang2short(self._get_lang(tuv[1])),
      'source_text': _utf8_checked(self._get_text(seg[0])),
      'target_text': _utf8_checked(self._get_text(seg[1])),
      'source_metadata': self._parse_metadata(tuv[0]),
      'target_metadata': self._parse_metadata(tuv[1]),
    }
    return record
Пример #2
0
  def _segment2doc(self, segment, ftype):
    """Convert one side of a segment into a document dict.

    ``ftype`` selects the side to convert and is used as an attribute prefix
    on ``segment`` (``<ftype>_text``, ``<ftype>_pos``, ``<ftype>_language``).
    The code treats any value other than ``'target'`` as the source side when
    picking the opposite side.

    Returns a dict with ``text``, optionally ``pos``, ``target_language`` and
    ``token_cnt``.
    """
    text = getattr(segment, ftype + '_text')
    doc = {'text': text}

    # Optional fields (POS, tokenized): only copied when the segment has them.
    # The attribute must not be read unconditionally, otherwise a segment
    # without POS data raises AttributeError before this check is reached.
    pos_attr = ftype + '_pos'
    if hasattr(segment, pos_attr):
      doc['pos'] = getattr(segment, pos_attr)

    op_ftype = 'source' if ftype == 'target' else 'target'
    # Auxiliary field to facilitate language matrix generation: a one-element
    # list holding the short code of the first language of the opposite side.
    op_languages = TMUtils.str2list(getattr(segment, op_ftype + '_language'))
    doc['target_language'] = [TMUtils.lang2short(op_languages[0])]
    doc['token_cnt'] = self.token_count(text, getattr(segment, ftype + '_language'))
    return doc
Пример #3
0
  def _gen_lang_pairs(self, tuv, seg):
    """Yield one ``_fill_lang`` record per requested language pair.

    ``tuv`` and ``seg`` are iterated in lockstep. Each (tuv, seg) couple is
    grouped under the short language code obtained from
    ``TMUtils.lang2short(self._get_lang(...))``. For every (source, target)
    entry of ``self.lang_pairs``, every combination of a source-language
    couple with a target-language couple is passed to ``self._fill_lang`` and
    the result is yielded.
    """
    # Group every (tuv, seg) couple of this unit by its short language code.
    by_lang = {}
    for tuv_item, seg_item in zip(tuv, seg):
      short_code = TMUtils.lang2short(self._get_lang(tuv_item))
      by_lang.setdefault(short_code, []).append((tuv_item, seg_item))

    # Generate all requested pairs. One unit can contain several translations
    # for the same language, hence the cross product of both groups.
    for src_lang, tgt_lang in self.lang_pairs:
      for src_tuv, src_seg in by_lang.get(src_lang, []):
        for tgt_tuv, tgt_seg in by_lang.get(tgt_lang, []):
          yield self._fill_lang((src_tuv, tgt_tuv), (src_seg, tgt_seg))