Пример #1
0
  def output_segment(self, segment):
    """Build and return a ``tu`` XML element describing ``segment``.

    The element carries the source language, creation/change dates (the
    current time is used when the segment has none), optional ``tuid`` and
    ``creationid`` attributes, ``prop`` children for industry/type/
    organization/metadata, and one ``tuv`` child per side (source, target).
    """
    # NOTE(review): attributes are formatted with TMUtils.list2str while props
    # use self.list2str -- kept as in the original; confirm both are intended.
    xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'

    tu = etree.Element('tu')
    tu.set('srclang', TMUtils.list2str(segment.source_language))
    # Missing dates fall back to "now"
    creation_date = segment.tm_creation_date or TMUtils.date2str(datetime.datetime.now())
    tu.set('creationdate', creation_date)
    change_date = segment.tm_change_date or TMUtils.date2str(datetime.datetime.now())
    tu.set('changedate', change_date)
    if segment.tuid:
      tu.set('tuid', str(segment.tuid))
    if segment.username:
      tu.set('creationid', segment.username)

    # Segment-level "tda-*" props, emitted only for non-empty fields
    for field, prop_name in (('industry', "tda-industry"), ('type', "tda-type")):
      value = getattr(segment, field)
      if value:
        text = self.list2str(value)
        etree.SubElement(tu, 'prop', {'type': prop_name}).text = text
    if segment.organization:
      org_text = self.list2str(segment.organization)
      etree.SubElement(tu, 'prop', {'type': "tda-org"}).text = org_text
      # The product prop is only written together with an organization
      etree.SubElement(tu, 'prop', {'type': "tda-prod"}).text = "Default"

    # Free-form metadata; "tda-" keys are skipped as already handled above
    for name, text in (segment.metadata or {}).items():
      if name.startswith('tda-'):
        continue
      etree.SubElement(tu, 'prop', {'type': name}).text = text

    for side in ('source', 'target'):
      language = TMUtils.list2str(getattr(segment, side + '_language'))
      tuv = etree.SubElement(tu, 'tuv', {xml_lang: language})

      pos = getattr(segment, side + '_pos')
      if pos:
        etree.SubElement(tuv, 'prop', {'type': "pos"}).text = pos

      side_metadata = getattr(segment, side + '_metadata')
      if side_metadata:
        for name, text in side_metadata.items():
          etree.SubElement(tuv, 'prop', {'type': name}).text = text

      etree.SubElement(tuv, 'seg').text = getattr(segment, side + '_text')

    return tu
Пример #2
0
    def _segment2doc(self, segment):
        """Stamp the DB date fields on ``segment`` and return it as a dict.

        Side effects on ``segment``: ``insert_date`` is set to the current
        time if empty, ``check_date`` is set to 1970-01-01 if empty, and
        ``update_date`` is always set to the current time.
        """
        # Initialize/update DB date fields
        stamp = TMUtils.date2str(datetime.datetime.now())
        if not segment.insert_date:
            segment.insert_date = stamp
        if not segment.check_date:
            epoch = datetime.datetime(1970, 1, 1)
            segment.check_date = TMUtils.date2str(epoch)
        segment.update_date = stamp

        # Fields copied verbatim, listed first in the resulting document
        leading_fields = (
            'source_id', 'target_id', 'source_text', 'target_text',
            'source_language', 'target_language', 'source_metadata',
            'target_metadata', 'metadata', 'tuid',
        )
        # Fields passed through TMUtils.str2list before being stored
        list_fields = ('industry', 'type', 'organization', 'file_name',
                       'domain')
        # Remaining fields copied verbatim
        trailing_fields = (
            'tm_creation_date', 'tm_change_date', 'insert_date',
            'update_date', 'check_date', 'check_version', 'dirty_score',
            'username',
        )

        doc = {}
        for name in leading_fields:
            doc[name] = getattr(segment, name)
        for name in list_fields:
            doc[name] = TMUtils.str2list(getattr(segment, name))
        for name in trailing_fields:
            doc[name] = getattr(segment, name)
        return doc
Пример #3
0
    def generate_pivot(self, sdoc, tdoc):
        """Combine two docs sharing the same ``source_id`` into a pivot doc.

        The targets of ``sdoc`` and ``tdoc`` become the source and target of
        the result. Attributes listed in ``TMDbQuery.str_attrs`` that are
        present in ``sdoc`` are concatenated when both docs have a non-empty
        value and set to ``None`` otherwise.

        Raises:
            AssertionError: if the two docs do not share the same
                ``source_id`` (i.e. there is no common pivot).
        """
        if sdoc['source_id'] != tdoc['source_id']:
            logging.error(
                "Invalid pair for pivot generation: sdoc {}, tdoc {}".format(
                    sdoc, tdoc))
            # Raised explicitly instead of via ``assert`` so the check is not
            # stripped under ``python -O``; the exception type is unchanged.
            raise AssertionError(
                "Invalid pair for pivot generation: source_id mismatch")

        # Result doc
        doc = dict()
        for attr in ('id', 'language', 'text'):
            doc['source_' + attr] = sdoc['target_' + attr]
            doc['target_' + attr] = tdoc['target_' + attr]
        for attr in TMDbQuery.str_attrs:
            if attr not in sdoc:
                continue
            # TODO: should it be union or intersection?
            if sdoc.get(attr) and tdoc.get(attr):
                doc[attr] = sdoc[attr] + tdoc[attr]
            else:
                doc[attr] = None

        # One timestamp for all date fields so they cannot disagree
        now_str = TMUtils.date2str(datetime.datetime.now())
        for attr in ('tm_creation_date', 'tm_change_date', 'insert_date',
                     'update_date'):
            doc[attr] = now_str
        # Epoch check date: the new doc has not been checked yet
        doc['check_date'] = TMUtils.date2str(datetime.datetime(1970, 1, 1))
        return doc
Пример #4
0
    def __call__(self, index, segments_iter):
        """POS-tag all segments of one partition and return them as a list.

        Sets ``source_pos`` / ``target_pos`` on every segment. As a side
        effect, writes POS-tagged and IOB-tagged text for matching segments to
        two files in the system temp directory; ``index`` is used in the file
        names.
        """
        # Import should be inside the function to avoid serializing all pos
        # tagger dependencies for parallel execution
        sys.path.append(
            os.path.join(os.path.abspath(os.path.dirname(__file__)), '..',
                         '..'))
        sys.path = [p for p in sys.path if p]
        from TMPosTagger.TMPosTagger import TMPosTagger

        # Cache all segments. Though it might be expensive in terms of memory,
        # we need to gather all texts for the POS tagger batch and then store
        # back the batch of POS-tagged results. The batch should be small
        # enough by splitting into a sufficiently large number of Spark jobs
        segments = list(segments_iter)
        # Initialize PosTaggers for source and target languages
        pos_taggers = [
            TMPosTagger(lang.split('-')[0], universal=self.is_universal)
            for lang in self.langs
        ]
        # Invoke POS taggers for source and target segments
        src_texts = pos_taggers[0].tag_segments(
            [XmlUtils.replace_tags(s.source_text) for s in segments])
        tgt_texts = pos_taggers[1].tag_segments(
            [XmlUtils.replace_tags(s.target_text) for s in segments])

        # Store POS tags with XML tags as a training data.
        # TODO: make it optional
        # A single timestamp keeps the two output file names paired
        stamp = TMUtils.date2str(datetime.datetime.now())
        tmp_dir = tempfile.gettempdir()
        pos_path = tmp_dir + "/pos_tags-{}-{}.txt".format(stamp, index)
        iob_path = tmp_dir + "/iob_tags-{}-{}.txt".format(stamp, index)
        # ``with`` guarantees both files are closed even if tagging output
        # handling raises part-way through
        with open(pos_path, 'w') as pos_file, open(iob_path, 'w') as iob_file:
            for segment, stext, ttext in zip(segments, src_texts, tgt_texts):
                segment.source_pos = self.tags2string(stext)
                segment.target_pos = self.tags2string(ttext)
                # Write POS tags (+XML tags) to text file to be used as a
                # training data.
                # NOTE(review): re.match only matches at the start of the
                # text; confirm this is intended rather than re.search.
                if re.match(XmlUtils.TAG_PATTERN, segment.source_text):
                    pos_file.write("{}\n{}\n\n".format(
                        self.tags2string_xml_tags(segment.source_text, stext),
                        self.tags2string_xml_tags(segment.target_text, ttext)))
                    # Distinct names: the loop must not rebind ``segment``
                    for src_iob, tgt_iob in zip(
                            self.tags2string_iob_tags(segment.source_text,
                                                      stext),
                            self.tags2string_iob_tags(segment.target_text,
                                                      ttext)):
                        iob_file.write("{}\n{}\n\n".format(src_iob, tgt_iob))

        return segments
Пример #5
0
 def init_job(self, job_id=None, username=None, type='default', **kwargs):
   """Register a new job in 'pending' status and return its id.

   If ``job_id`` is empty, a new id is obtained from ``self._allocate_id()``.
   Any extra keyword arguments are stored in the job doc under 'params'.
   The doc is persisted via ``self.update_job``.
   """
   # Allocate first so the stored doc carries the real id, never None
   if not job_id: job_id = self._allocate_id()
   doc = {
          'id': job_id,
          'type': type,
          'username': username,
          'status': 'pending',
          'submit_time': TMUtils.date2str(datetime.datetime.now()),
          # Put params into the doc
          'params': kwargs
          }
   self.update_job(job_id, doc)
   # Return the job id (the original returned the builtin ``id`` function)
   return job_id
Пример #6
0
  def output_segment(self, segment):
    """Build and return a ``tu`` XML element describing ``segment``.

    Sets the source language and creation/change dates (current time when
    the segment has none), an optional ``tuid``, ``prop`` children holding
    the first industry/type/organization value, a fixed "tda-prod" prop, and
    one ``tuv``/``seg`` pair per side (source, target).
    """
    xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'

    tu = ElementTree.Element('tu')
    tu.set('srclang', TMUtils.list2str(segment.source_language))
    # Missing dates fall back to "now"
    creation_date = segment.tm_creation_date or TMUtils.date2str(datetime.datetime.now())
    tu.set('creationdate', creation_date)
    change_date = segment.tm_change_date or TMUtils.date2str(datetime.datetime.now())
    tu.set('changedate', change_date)
    if segment.tuid:
      tu.set('tuid', str(segment.tuid))

    # Only the first value of each non-empty field is exported
    exported = (('industry', "tda-industry"),
                ('type', "tda-type"),
                ('organization', "tda-org"))
    for field, prop_name in exported:
      values = getattr(segment, field)
      if values:
        first = values[0]
        ElementTree.SubElement(tu, 'prop', {'type': prop_name}).text = first
    # The product prop is always written
    ElementTree.SubElement(tu, 'prop', {'type': "tda-prod"}).text = "Default"

    for side in ('source', 'target'):
      language = TMUtils.list2str(getattr(segment, side + '_language'))
      tuv = ElementTree.SubElement(tu, 'tuv', {xml_lang: language})
      seg = ElementTree.SubElement(tuv, 'seg')
      seg.text = getattr(segment, side + '_text')

    return tu
Пример #7
0
 def __call__(self, index, segments_iter):
   """Lazily yield each segment after stamping its check date and version.

   ``check_date`` is set to the current time (taken per segment) and
   ``check_version`` to ``self.version``. ``index`` is not used here.
   """
   for seg in segments_iter:
     checked_at = datetime.datetime.now()
     seg.check_date = TMUtils.date2str(checked_at)
     seg.check_version = self.version
     yield seg
Пример #8
0
 def finalize(self, job_id, status='finished'):
   """Record the end time and final ``status`` on the job and persist it.

   The job doc is loaded with ``self.get_job`` and written back with
   ``self.update_job``.
   """
   job = self.get_job(job_id)
   finished_at = TMUtils.date2str(datetime.datetime.now())
   job['end_time'] = finished_at
   job['status'] = status
   self.update_job(job_id, job)