예제 #1
0
def init_dataset(src_path, tgt_path, src_out, tgt_out):
    """Derive the two output files from a pair of serialized sentence files.

    Both inputs are read in lockstep, one line at a time, and each line is
    parsed with ``Sentence.deserialize``. For every pair of lines:

    * ``src_out`` receives ``<src.source>|<tgt.key>``;
    * ``tgt_out`` receives ``<tgt.source>``.

    Iteration is driven by ``zip``, so it stops at the end of the shorter
    input. Progress is reported through ``tqdm``.

    Args:
        src_path: Path of the serialized source-side input file.
        tgt_path: Path of the serialized target-side input file.
        src_out: Path of the source-side output file (overwritten).
        tgt_out: Path of the target-side output file (overwritten).
    """
    with open(src_path) as src_in, open(tgt_path) as tgt_in, \
            open(src_out, 'w') as src_sink, open(tgt_out, 'w') as tgt_sink:
        for src_line, tgt_line in tqdm(zip(src_in, tgt_in)):
            src_sent = Sentence.deserialize(src_line)
            tgt_sent = Sentence.deserialize(tgt_line)

            src_sink.write("{}|{}\n".format(src_sent.source, tgt_sent.key))
            tgt_sink.write("{}\n".format(tgt_sent.source))
예제 #2
0
    def iter_operators(self, apply_direction=True):
        """Yield sentence pairs, reading them from the cache when it exists.

        When ``self.cache_exists`` is true, pairs are rebuilt with
        ``Sentence.deserialize`` from matching lines of ``self.cache_fr`` and
        ``self.cache_en``. Otherwise each ``(fr, en)`` pair obtained by
        iterating ``self`` is passed, sentence by sentence, to
        ``self._apply_operators`` (whose return value is ignored) before
        being yielded.

        Args:
            apply_direction: When true, yield whatever
                ``self._apply_direction(fr, en)`` returns; otherwise yield
                the plain ``(fr, en)`` tuple.

        Yields:
            One item per sentence pair, as described above.
        """
        if self.cache_exists:
            with open(self.cache_fr) as fr_cache, open(self.cache_en) as en_cache:
                for fr_line, en_line in zip(fr_cache, en_cache):
                    fr_sent = Sentence.deserialize(fr_line)
                    en_sent = Sentence.deserialize(en_line)

                    pair = (self._apply_direction(fr_sent, en_sent)
                            if apply_direction else (fr_sent, en_sent))
                    yield pair
            return

        # No cache available: run the operators on the fly.
        for fr_sent, en_sent in self:
            self._apply_operators(fr_sent)
            self._apply_operators(en_sent)

            pair = (self._apply_direction(fr_sent, en_sent)
                    if apply_direction else (fr_sent, en_sent))
            yield pair
예제 #3
0
    def find(self, sentence):
        """Look up the stored counterparts of ``sentence``.

        The query selects rows of ``translations_units`` whose ``text_hash``
        equals ``self.hash(sentence.key)`` and whose ``language`` equals
        ``sentence.language``, follows the ``translations`` table to the
        linked unit in the other column (``id_en`` when the sentence is
        ``'fr'``, ``id_fr`` otherwise), and returns those units' ``mapping``
        values ordered by descending id.

        The hash and language are passed as bound parameters, using
        ``self.C`` as the placeholder text. Only the column suffixes are
        formatted into the SQL text.

        Args:
            sentence: Object exposing ``key`` and ``language``.

        Returns:
            A list with ``Sentence.deserialize(row['mapping'])`` for every
            row fetched.
        """
        with self.connection.cursor() as cursor:
            source_lang = sentence.language
            target_lang = 'en' if source_lang == 'fr' else 'fr'

            query = """SELECT TU_tgt.mapping 
                               FROM (
                                        SELECT translations.id_{} as id
                                        FROM translations_units 
                                        INNER JOIN translations 
                                        ON translations.id_{} = translations_units.id AND
                                           translations_units.text_hash = {} AND translations_units.language = {}
                                    ) as trans                         
                               INNER JOIN translations_units as TU_tgt 
                               ON trans.id = TU_tgt.id
                               ORDER BY TU_tgt.id DESC""".format(target_lang, source_lang, self.C, self.C)
            params = (self.hash(sentence.key), source_lang)

            cursor.execute(query, params)
            rows = cursor.fetchall()

            return [Sentence.deserialize(row['mapping']) for row in rows]