Example #1
 def check_query_parameters(self):
   if 'pos' not in self.query_dic:  # Apply pos and universal tagging on the query --> only the first time
     if 'tokenizer' not in self.query_dic:  # POS tagging is the first transformation --> no others have been applied yet
       query_out_tags = XmlUtils.replace_tags(self.query_dic['query'])
       self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
     self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
   return self.query_dic['query'], self.query_dic['tokenizer'], self.query_dic['pos']
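Example #1 follows a lazy caching pattern: tokenize first, then POS-tag, storing each intermediate result in the query dictionary so repeated calls do no extra work. Below is a minimal standalone sketch of the same pattern; the `pre_process` helper is a hypothetical stand-in for TMUtilsMatching.pre_process, not its real implementation.

 # Minimal sketch of the lazy pre-processing cache shown above.
 # `pre_process` is a hypothetical stand-in for TMUtilsMatching.pre_process.
 def pre_process(text, lang, step):
   if step == 'tokenizer':
     return ' '.join(text.split())  # pretend tokenization
   if step == 'pos_tagger':
     return [(tok, 'X') for tok in text.split()]  # pretend POS tags
   raise ValueError(step)

 def ensure_query_parameters(query_dic, src_lang):
   if 'pos' not in query_dic:  # POS tags are computed only once
     if 'tokenizer' not in query_dic:  # tokenization has to happen first
       query_dic['tokenizer'] = pre_process(query_dic['query'], src_lang, 'tokenizer')
     query_dic['pos'] = pre_process(query_dic['tokenizer'], src_lang, 'pos_tagger')
   return query_dic['query'], query_dic['tokenizer'], query_dic['pos']

 print(ensure_query_parameters({'query': 'I have a dog'}, 'en'))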
Example #2
    def __call__(self, index, segments_iter):
        # The import must stay inside the function to avoid serializing all
        # POS tagger dependencies for parallel execution
        sys.path.append(
            os.path.join(os.path.abspath(os.path.dirname(__file__)), '..',
                         '..'))
        sys.path = [p for p in sys.path if p]
        from TMPosTagger.TMPosTagger import TMPosTagger

        # Cache all segments. This may be expensive in terms of memory, but we need
        # to gather all texts for the POS tagger batch and then store back the
        # batch of POS-tagged results. Each batch stays small enough as long as the
        # data is split into a sufficiently large number of Spark jobs
        segments = [s for s in segments_iter]
        # Initialize PosTaggers for source and target languages
        pos_taggers = [
            TMPosTagger(lang.split('-')[0], universal=self.is_universal)
            for lang in self.langs
        ]
        # Invoke POS taggers for source and target segments
        src_texts = pos_taggers[0].tag_segments(
            [XmlUtils.replace_tags(s.source_text) for s in segments])
        tgt_texts = pos_taggers[1].tag_segments(
            [XmlUtils.replace_tags(s.target_text) for s in segments])
        # Store POS tags with XML tags as training data. TODO: make it optional
        f = open(
            tempfile.gettempdir() + "/pos_tags-{}-{}.txt".format(
                TMUtils.date2str(datetime.datetime.now()), index), 'w')
        iobs = open(
            tempfile.gettempdir() + "/iob_tags-{}-{}.txt".format(
                TMUtils.date2str(datetime.datetime.now()), index), 'w')
        for s, stext, ttext in zip(segments, src_texts, tgt_texts):
            s.source_pos = self.tags2string(stext)
            s.target_pos = self.tags2string(ttext)
            # Write POS tags (+XML tags) to a text file to be used as training data
            if re.match(XmlUtils.TAG_PATTERN, s.source_text):
                f.write("{}\n{}\n\n".format(
                    self.tags2string_xml_tags(s.source_text, stext),
                    self.tags2string_xml_tags(s.target_text, ttext)))
                # Use distinct loop variables here: reusing `s` would shadow the
                # segment variable of the outer loop
                for src_iob, tgt_iob in zip(
                        self.tags2string_iob_tags(s.source_text, stext),
                        self.tags2string_iob_tags(s.target_text, ttext)):
                    iobs.write("{}\n{}\n\n".format(src_iob, tgt_iob))

        f.close()
        iobs.close()
        return segments
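The (index, segments_iter) signature of this __call__ matches what Spark's mapPartitionsWithIndex passes to its function argument, which is presumably how the class is applied per partition. The sketch below shows that wiring with a dummy callable; the class name and the RDD contents are assumptions, only mapPartitionsWithIndex itself is standard PySpark API.

    # Hedged sketch: wiring a partition-level callable into PySpark.
    # Only mapPartitionsWithIndex is real Spark API; the rest is illustrative.
    from pyspark import SparkContext

    class DummyTagPartition:
        def __call__(self, index, segments_iter):
            # stand-in for the POS-tagging logic of the example above
            return [(index, seg.upper()) for seg in segments_iter]

    sc = SparkContext.getOrCreate()
    rdd = sc.parallelize(['a dog', 'a cat'], numSlices=2)
    print(rdd.mapPartitionsWithIndex(DummyTagPartition()).collect())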
Example #3
    def __call__(self, s_txt, t_txt):
        s_tags = XmlUtils.extract_tags(s_txt)
        if not s_tags: return t_txt

        t_tags = XmlUtils.extract_tags(t_txt)
        # Number of tags is equal - just replace one by one
        if len(s_tags) == len(t_tags):
            for s_tag, t_tag in zip(s_tags, t_tags):
                t_txt = t_txt.replace(t_tag, s_tag, 1)
            return t_txt
        else:
            s_toks = TMTextProcessors.tokenizer(self.langs[0]).tokenizer.process(
                XmlUtils.replace_tags(
                    XmlUtils.fix_tags(s_txt)[0],
                    adjacent_space_placeholder=XmlUtils.SPACE_PLACEHOLDER)).split()
            # TODO: s_universal = self._preprocess(s_toks, self.langs[0])
            # Strip all tags from target text before tokenizing it
            t_toks = TMTextProcessors.tokenizer(
                self.langs[1]).tokenizer.process(
                    XmlUtils.strip_tags(t_txt)).split()
            #TODO: t_universal = self._preprocess(t_toks, self.langs[1])
            t_toks_new = []
            # Iterate over tokenized source and target text and apply a simple alignment algorithm (by token).
            # Insert source tags at the aligned places in the target text
            ti = 0
            for si in range(0, len(s_toks)):
                if s_toks[si] == XmlUtils.TAG_PLACEHOLDER:
                    t_toks_new.append(s_tags.pop(0))
                elif s_toks[si] == XmlUtils.SPACE_PLACEHOLDER:
                    t_toks_new.append(XmlUtils.SPACE_PLACEHOLDER)
                elif ti < len(t_toks):
                    t_toks_new.append(t_toks[ti])
                    ti += 1
                else:
                    break  # source is longer than target, stop here
            # Append remaining target tokens
            if ti < len(t_toks): t_toks_new += t_toks[ti:]
            # If not all tags have been aligned, just concatenate the remaining ones to the end
            if s_tags: t_toks_new += s_tags
        # Join tokenized text into string. TODO: implement as a part of TMTokenizer class (language-dependent)
        # return self.tok[1].join(t_toks_new)
        ttext_with_tags = XmlUtils.join_tags(
            ' '.join(t_toks_new), '(</?[^<>]+/?>)([^<>]+)(</?[^<>]+/?>)'
        )  # --> join words with tags <b> this </b> --> <b>this</b>
        # Handle whitespaces which are adjacent to tags
        ttext_with_tags = re.sub(r'\s+<', '<', ttext_with_tags)
        ttext_with_tags = re.sub(r'>\s+', '>', ttext_with_tags)
        ttext_with_tags = re.sub(XmlUtils.SPACE_PLACEHOLDER, '',
                                 ttext_with_tags)
        return ttext_with_tags
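The branch for unequal tag counts relies on a simple positional alignment: walk the source tokens, emit a stored tag whenever a tag placeholder is seen, and otherwise consume the next target token. A toy, self-contained illustration of that idea follows; the placeholder constant and the example sentences are assumptions, not the real XmlUtils values.

    # Toy illustration of the positional tag-transfer idea (assumed placeholder name).
    TAG_PLACEHOLDER = 'ELASTICTMTAG'

    def transfer_tags(s_toks, t_toks, s_tags):
        out, ti = [], 0
        for tok in s_toks:
            if tok == TAG_PLACEHOLDER:  # a tag slot in the source
                out.append(s_tags.pop(0))  # re-insert the original tag here
            elif ti < len(t_toks):
                out.append(t_toks[ti])  # align one target token per source token
                ti += 1
        out += t_toks[ti:]  # leftover target tokens
        out += s_tags       # any unplaced tags go to the end
        return ' '.join(out)

    print(transfer_tags(['I', 'have', TAG_PLACEHOLDER, 'a', 'dog', TAG_PLACEHOLDER],
                        ['Tengo', 'un', 'perro'],
                        ['<b>', '</b>']))
    # -> Tengo un <b> perro </b>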
Example #4
    def __call__(self, s_txt, t_txt):
        # Extract source tags to be transferred: ['<X[1]>', '</X[1]>']
        print("Source text: {}".format(s_txt))
        s_tags = XmlUtils.extract_tags(s_txt)
        print("Source tags: {}".format(s_tags))
        if not s_tags: return t_txt
        # Remove any tags from the target
        t_txt = XmlUtils.strip_tags(t_txt)

        # Rename tags to avoid problems in XML parser
        # I have <X[1]>a dog</X[1]> ---> I have <T1>a dog</T1>
        s_txt_fixed = XmlUtils.simplify_tags(s_txt)
        s_tags_fixed = XmlUtils.extract_tags(s_txt_fixed)
        print("Fixed source tags: {}".format(s_tags_fixed))
        # Keep mapping of fixed tags to original tags for the final recovery:
        # tags_map = {'<T1>: '<X[1]>', '</T1>': '</X[1]>'}
        assert len(s_tags_fixed) == len(s_tags)
        tags_map = dict(zip(s_tags_fixed, s_tags))
        print("Tags map: {}".format(tags_map))

        # Run POS tagging (before, replace XML tags with a placeholder in the source text):
        # I chase <T1>a dog</T1> --> I chase ELASTICTMTAG a dog ELASTICTMTAG
        # --> I/NOUN have/VERB ELASTICTMTAG/NOUN a/DET dog/NOUN ELASTICTMTAG/NOUN
        s_pos = self.pos_taggers[0].tag_segments(
            [XmlUtils.replace_tags(s_txt_fixed)])[0]
        t_pos = self.pos_taggers[1].tag_segments([t_txt])[0]
        # Recover fixed tags:
        # I,NOUN have,VERB ELASTICTMTAG,NOUN a,DET dog,NOUN ELASTICTMTAG,NOUN
        # ---> NOUN VERB <T1> DET NOUN </T1>
        s_pos_with_tags, s_pos = XmlUtils.recover_tags_pos(s_pos, s_tags_fixed)
        print("S_POS_WITH_TAGS: {}, S_POS: {}, T_POS: {}".format(
            s_pos_with_tags, s_pos, t_pos))
        # For each tag (T1, T2, etc.), remove the other tags and run the prediction algorithm based on IOB tags.
        # The return value is a map from tags to their corresponding indexes in the (tokenized) target text
        tag2t_index = self.tags2indexes(s_tags_fixed, s_pos_with_tags, s_pos,
                                        [t[1] for t in t_pos])

        # Place tags at predicted indexes in the target text
        t_txt_with_tags = self.place_tags(s_tags_fixed, tag2t_index, tags_map,
                                          t_pos)
        if not t_txt_with_tags: return None
        # TODO: join using language-specific "joiner" (opposite of tokenizer)
        return " ".join(t_txt_with_tags)
Example #5
  def _validate_pipe(self, pipe):
    match_process = {
      'regex': None,
      'posTag': None,
      'tags': TMTags()
    }

    try:
      match_process['regex'] = TMRegexMatch(self.src_lang, self.tgt_lang)
      logging.info("Loading regex for matching")
    except ValueError:
      if 'regex' in pipe:
        pipe.pop(pipe.index('regex'))
        logging.info("Unsupported regex for matching")

    query_out_tags = XmlUtils.replace_tags(self.query)

    try:
      if 'tokenizer' not in self.query_dic:
        self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
      logging.info("Loading Tokenizer for {}".format(self.src_lang))

      try:
        if 'pos' not in self.query_dic:
          self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
        match_process['posTag'] = TMPosMatch(self.src_lang, self.tgt_lang)
        logging.info("Loading regex for matching")
      except Exception as e:
        if 'posTag' in pipe:
          pipe.pop(pipe.index('posTag'))
          logging.info("Unsupported posTag for matching")
    except Exception as e:
      # Without a tokenizer the POS tagger cannot run either, so drop posTag from the pipe
      if 'posTag' in pipe:
        pipe.pop(pipe.index('posTag'))
        logging.info("Unsupported Tokenizer for {}".format(self.src_lang))

    return match_process, pipe
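One plausible way the returned pair is consumed is to iterate over the (possibly reduced) pipe and dispatch each step to its entry in match_process. The loop below is a hedged sketch under that assumption; the `process` method is a hypothetical matcher interface, not confirmed by the source.

  # Hedged sketch of consuming (match_process, pipe): only steps that survived
  # validation remain in `pipe`, and each is looked up in `match_process`.
  def run_pipe(match_process, pipe, query):
    result = query
    for step in pipe:
      matcher = match_process.get(step)
      if matcher is None:  # matcher failed to load; skip defensively
        continue
      result = matcher.process(result)  # hypothetical matcher interface
    return result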