Example #1
def generate_answers(config, model, processor, qn_uuid_data,
                     context_token_data, qn_token_data):
    uuid2ans = {}  # maps uuid to string containing predicted answer
    data_size = len(qn_uuid_data)
    num_batches = ((data_size - 1) // config.batch_size) + 1
    batch_num = 0
    detokenizer = MosesDetokenizer()

    print("Generating answers...")

    for batch in get_batch_generator(processor.word2id, qn_uuid_data,
                                     context_token_data, qn_token_data,
                                     config.batch_size, config.context_len,
                                     config.question_len):

        # Get the predicted spans
        pred_start_batch, pred_end_batch = processor.test_one_batch(
            batch, model)

        # Convert pred_start_batch and pred_end_batch to lists of length batch_size
        pred_start_batch = pred_start_batch.tolist()
        pred_end_batch = pred_end_batch.tolist()

        # For each example in the batch:
        for ex_idx, (pred_start, pred_end) in enumerate(
                zip(pred_start_batch, pred_end_batch)):

            # Original context tokens (no UNKs or padding) for this example
            context_tokens = batch.context_tokens[ex_idx]  # list of strings

            # Check the predicted span is in range
            assert pred_start in range(len(context_tokens))
            assert pred_end in range(len(context_tokens))

            # Predicted answer tokens
            pred_ans_tokens = context_tokens[pred_start:pred_end +
                                             1]  # list of strings

            # Detokenize and add to dict
            uuid = batch.uuids[ex_idx]
            uuid2ans[uuid] = detokenizer.detokenize(pred_ans_tokens,
                                                    return_str=True)

        batch_num += 1

        if batch_num % 10 == 0:
            print("Generated answers for %i/%i batches = %.2f%%" %
                  (batch_num, num_batches, batch_num * 100.0 / num_batches))

    print("Finished generating answers for dataset.")

    return uuid2ans
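The span-to-answer step above boils down to slicing the original context tokens and detokenizing the slice. A minimal sketch of just that step, assuming the sacremoses package (whose MosesDetokenizer accepts the same detokenize(tokens, return_str=True) call used here):

from sacremoses import MosesDetokenizer  # assumed backend for the example above

detokenizer = MosesDetokenizer()
context_tokens = ["The", "Eiffel", "Tower", "is", "in", "Paris", "."]
pred_start, pred_end = 5, 5  # inclusive token indices predicted by the model
pred_ans_tokens = context_tokens[pred_start:pred_end + 1]
print(detokenizer.detokenize(pred_ans_tokens, return_str=True))  # -> Paris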
Example #2
def main(args):
    detok = MosesDetokenizer("fi")
    with open(args.infile, "r") as fi, open(args.outfile, "w") as fo:
        for line in fi:
            data = json.loads(line.strip())
            data["text"] = html.unescape(detok(data["text"].split()))
            fo.write(json.dumps(data, ensure_ascii=False) + "\n")
Example #3
def clean_txt(filename):
  with open(filename, 'rt') as file:
    text = file.read()
  # split into word tokens
  tokens = word_tokenize(text)

  # join the tokens back into a single string using Moses detokenization rules
  with MosesDetokenizer('en') as detokenize:
    text_string = detokenize(tokens)

  # optional: stemming of words
  # snowball = SnowballStemmer("english", ignore_stopwords=True)
  # stemmed = [snowball.stem(word) for word in tokens]

  print(type(text_string))
  print(text_string)
  # find "<Month> <day>" style dates in the detokenized text
  # stricter alternative: r'[A-Z][a-z]{1,8}\s\d{1,3}([a-z]{1,3})?,\s\d{2,4}'
  dates = re.findall(r"[a-zA-Z]{4,9} \d+", text_string)

  print(dates)
Example #4
    def __init__(self,
                 server,
                 servable_name,
                 t2t_usr_dir,
                 problem,
                 data_dir,
                 timeout_secs):
        super(EnZhNmtClient, self).__init__()
        tf.logging.set_verbosity(tf.logging.INFO)
        validate_flags(server, servable_name)
        usr_dir.import_usr_dir(t2t_usr_dir)
        self.problem = registry.problem(problem)
        self.hparams = tf.contrib.training.HParams(
            data_dir=os.path.expanduser(data_dir))
        self.problem.get_hparams(self.hparams)
        self.request_fn = make_request_fn(server, servable_name, timeout_secs)
        self.moses_tokenizer = MosesTokenizer('en')
        self.moses_detokenizer = MosesDetokenizer('zh')
        if problem.endswith("_rev"):
            fname = "targets"
        else:
            fname = "inputs" if self.problem.has_inputs else "targets"
        self.input_encoder = self.problem.feature_info[fname].encoder

        if problem.endswith("_rev"):
            self.output_decoder = self.problem.feature_info["inputs"].encoder
        else:
            self.output_decoder = self.problem.feature_info["targets"].encoder
Example #5
def add_trade_mark_sign(string):
    data = [
        word + '\N{TRADE MARK SIGN}' if len(word) > 5 else word
        for sent in sent_tokenize(string) for word in word_tokenize(sent)
    ]
    with MosesDetokenizer() as detokenize:
        res = detokenize(data)
    return res
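A hypothetical call to add_trade_mark_sign() above, purely to illustrate the effect; it assumes NLTK's sentence/word tokenizers and a Moses detokenizer are installed. Words longer than five characters come back with a trailing ™:

# Hypothetical usage; exact spacing depends on the detokenizer version.
print(add_trade_mark_sign("Moses handles punctuation detokenization nicely."))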
Example #6
    def detok_copy(self) -> Any:
        logging.warning("Creating a Moses-detokenized copy of this corpus!")
        detokenize_inner = MosesDetokenizer('en')
        detokenize = lambda x: detokenize_inner(x.split())
        corpus = Corpus()
        for utt_id, sent in self.items():
            corpus[utt_id] = detokenize(sent)
        return corpus
Example #7
    def get_detokenized_target(self, trg, batch_size):
        targets = []
        with MosesDetokenizer(self.trg_lang) as detok:
            for i in range(batch_size):
                t = self.tokenizer.detokenize(trg[:, i].tolist())
                t = detok(t.split())
                targets.append(t)

        return targets
Example #8
def main(system_outputs_folder, clustered_outputs_folder, output_file):
    random.seed(37)
    detokenize = MosesDetokenizer('en')
    inputs, preds, scores, systems = load_directory(system_outputs_folder,
                                                    clustered_outputs_folder,
                                                    detokenize)
    rows = make_rows(inputs, preds, scores, systems)

    output_csv(rows, output_file)
Example #9
    def __init__(self):
        tf.logging.set_verbosity(tf.logging.INFO)
        validate_flags()
        usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
        self.problem = registry.problem(FLAGS.problem)
        self.hparams = tf.contrib.training.HParams(
            data_dir=os.path.expanduser(FLAGS.data_dir))
        self.problem.get_hparams(self.hparams)
        self.request_fn = make_request_fn()
        self.moses_detokenizer = MosesDetokenizer('en')
Example #10
def main(opt):
    if not os.path.exists(opt.output_dir):
        os.makedirs(opt.output_dir)

    bc = BertClient()
    detokenize = MosesDetokenizer('en')

    all_results = {}
    for json_file in glob.glob(os.path.join(opt.input_dir, '*.json')):
        out_json_file = os.path.join(opt.output_dir,
                                     os.path.basename(json_file))

        ## Check to make sure file doesn't already exist
        if not os.path.isfile(out_json_file):
            with open(json_file, 'r') as f:
                try:
                    experiment = json.load(f)
                    print('Processing ' + json_file)
                except:
                    print('Error processing ' + json_file)
                    print('Skipping it.')
                    continue

                for ex_num, example in enumerate(experiment):
                    if ex_num % 10 == 0:
                        print("Clustering output: " + str(ex_num))

                    candidates = example['pred']
                    scores = example['scores']
                    candidates, scores = remove_duplicates(candidates, scores)

                    if opt.method == 'kmeans':
                        candidates, scores = kmeans_filtering(
                            candidates, scores, opt.num_cands, True, bc,
                            detokenize)
                    elif opt.method == 'distance':
                        candidates, scores = distance_filtering(
                            candidates, scores, opt.num_cands, False, bc,
                            detokenize)
                    elif opt.method == 'kmeans_mod':
                        candidates, scores = kmeans_mod_filtering(
                            candidates, scores, opt.num_cands, True, bc,
                            detokenize)
                    else:
                        raise ValueError('Not a valid filtering method')

                    example['pred'] = candidates
                    example['scores'] = scores

            out_json_file = os.path.join(opt.output_dir,
                                         os.path.basename(json_file))
            with open(out_json_file, 'w') as f:
                json.dump(experiment, f)
        else:
            print("SKIPPING: " + json_file)
Example #11
    def __init__(self, pos, config, config_global, logger,
                 entity_linker_name: str):
        super(BaseEntityLinkingStage, self).__init__(pos, config,
                                                     config_global, logger)

        self._entity_linker_name = entity_linker_name
        self._entity_linker_cache = self._provide_cache(
            self._entity_linker_name, human_readable=False)

        # note: we are not using NLTK TreebankWordDetokenizer here, because that one replaces double quotes with two
        # single quotes which makes mappings between the tokenized and detokenized strings needlessly complicated
        self._detokenizer = MosesDetokenizer("en")
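A quick check of the quote-handling difference mentioned in the comment above; this is only a sketch, assuming nltk and sacremoses are installed, and the exact outputs depend on the library versions:

from nltk.tokenize.treebank import TreebankWordDetokenizer
from sacremoses import MosesDetokenizer

tokens = ['He', 'said', '"', 'hello', '"', 'to', 'everyone', '.']
print(TreebankWordDetokenizer().detokenize(tokens))  # may render the quotes as paired single quotes
print(MosesDetokenizer("en").detokenize(tokens))     # keeps the double quotes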
Example #12
    def __init__(self, pos, config, config_global, logger):
        super(SentenceBertEmbeddingFeaturePreparationStage,
              self).__init__(pos, config, config_global, logger)

        self._pretrained_model_name = config["pretrained_model_name"]

        self._cache = self._provide_cache("sentence_bert",
                                          bind_parameters=config)

        # note: we are not using NLTK TreebankWordDetokenizer here, because that one replaces double quotes with two
        # single quotes which makes mappings between the tokenized and detokenized strings needlessly complicated
        self._detokenizer = MosesDetokenizer("en")
Example #13
    def __init__(self):
        tf.logging.set_verbosity(tf.logging.INFO)
        validate_flags()
        usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
        self.problem = registry.problem(FLAGS.problem)
        self.hparams = tf.contrib.training.HParams(
            data_dir=os.path.expanduser(FLAGS.data_dir))
        self.problem.get_hparams(self.hparams)
        self.request_fn = make_request_fn()
        self.tokenizer = MosesTokenizer('en')
        self.moses_detokenizer = MosesDetokenizer('zh')
        # raw string avoids invalid escape sequences for \w, \. and \s
        self.delimiter = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s")
Example #14
def make_human_readable(files: list, detokenize: bool):
    if detokenize:
        detokenizer = MosesDetokenizer("en")
    print("Postprocessing on {} files...".format(len(files)))
    for file in files:
        print("Working on: {}".format(file))
        with open(file, "r") as fin, open(file+".human_readable", "w") as fout:
            for line in fin:
                cleanline = strip_chars(line)
                if detokenize:
                    cleanline = detokenizer(cleanline.strip().split())
                fout.write("{}\n".format(cleanline))
Example #15
def main(test_file, output_file, lang):
    lang = lang.split('_')[0]
    # detokenizer = MosesDetokenizer(lang)
    detokenize = MosesDetokenizer(lang)
    sent_id = 1
    with open(output_file, 'w+') as out:
        for line in open(test_file):
            if line.startswith('# text ='):
                line = line.split('=', 1)[1].strip()
                words = line.split()
                out_text = detokenize(words)
                out.write(f'# sent_id = {sent_id}\n# text = {out_text}\n\n')
                sent_id += 1
Example #16
    def moses_detokenize(self,
                         inp: Path,
                         out: Path,
                         col=0,
                         lang='en',
                         post_op=None):
        log.info(f"detok : {inp} --> {out}")
        tok_lines = IO.get_lines(inp, col=col, line_mapper=lambda x: x.split())
        with MosesDetokenizer(lang=lang) as detok:
            detok_lines = (detok(tok_line) for tok_line in tok_lines)
            if post_op:
                detok_lines = (post_op(line) for line in detok_lines)
            IO.write_lines(out, detok_lines)
Example #17
def main(args):
    splits = MosesSentenceSplitter('fi')
    detok = MosesDetokenizer("fi")
    with open(args.infile, "r") as fi, open(args.outfile, "w") as fo:
        for line in fi:
            data = json.loads(line.strip())
            text = html.unescape(detok(data["text"].split())) if args.moses_tokenized else data["text"]
            sents = splits([text])
            for i, s in enumerate(sents):
                d = data.copy()
                d["text"] = s
                if "id" in d.keys():
                    d["id"] = d["id"] + f"-s{i}"

                fo.write(json.dumps(d, ensure_ascii=False) + "\n")
Example #18
class MosesTokenizer(Tokenizer):
    def __init__(self):
        super().__init__()
        self._tokenizer = NLTKMosesTokenizer()
        self._detokenizer = MosesDetokenizer()

    def tokenize(self, sentence):
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces escape sequences like &#91; with the original characters
        (such as '['), so they better align to the original text.
        """
        return [self._detokenizer.unescape_xml(t) for t in tokens]
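A minimal sketch of the unescaping that detokenize() above performs, assuming the sacremoses MosesDetokenizer (which exposes unescape_xml); Moses-escaped punctuation tokens map back to their original characters:

from sacremoses import MosesDetokenizer

_detok = MosesDetokenizer()
tokens = ["&#91;", "citation", "needed", "&#93;"]
print([_detok.unescape_xml(t) for t in tokens])  # -> ['[', 'citation', 'needed', ']']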
Example #19
def main(system_outputs_folder, clustered_outputs_folder, outputs_folder_100,
         input_file, gold_output_file, output_file):
    random.seed(37)
    detokenize = MosesDetokenizer('en')
    ## Gets predicted responses from all systems
    inputs, preds, scores, systems = load_directory(system_outputs_folder,
                                                    clustered_outputs_folder,
                                                    outputs_folder_100,
                                                    detokenize)

    ## Gets gold responses
    gold_dict = get_gold_responses(input_file, gold_output_file, detokenize)

    ## Formats responses into rows for MTurk experiment
    rows = make_rows(inputs, preds, scores, systems, gold_dict)

    output_csv(rows, output_file)
Example #20
    def __init__(self,
                 srclang,
                 targetlang,
                 sourcebpe=None,
                 targetbpe=None,
                 sourcespm=None,
                 targetspm=None):
        self.bpe_source = None
        self.bpe_target = None
        self.sp_processor_source = None
        self.sp_processor_target = None
        self.sentences = []
        # load BPE model for pre-processing
        if sourcebpe:
            # print("load BPE codes from " + sourcebpe, flush=True)
            BPEcodes = open(sourcebpe, 'r', encoding="utf-8")
            self.bpe_source = BPE(BPEcodes)
        if targetbpe:
            # print("load BPE codes from " + targetbpe, flush=True)
            BPEcodes = open(targetbpe, 'r', encoding="utf-8")
            self.bpe_target = BPE(BPEcodes)

        # load SentencePiece model for pre-processing
        if sourcespm:
            # print("load sentence piece model from " + sourcespm, flush=True)
            self.sp_processor_source = sentencepiece.SentencePieceProcessor()
            self.sp_processor_source.Load(sourcespm)
        if targetspm:
            # print("load sentence piece model from " + targetspm, flush=True)
            self.sp_processor_target = sentencepiece.SentencePieceProcessor()
            self.sp_processor_target.Load(targetspm)

        # pre- and post-processing tools
        self.tokenizer = None
        self.detokenizer = None

        # TODO: should we have support for other sentence splitters?
        # print("start pre- and post-processing tools")
        self.sentence_splitter = MosesSentenceSplitter(srclang)
        self.normalizer = MosesPunctuationNormalizer(srclang)
        if self.bpe_source:
            self.tokenizer = MosesTokenizer(srclang)
            self.detokenizer = MosesDetokenizer(targetlang)
Example #21
    def translate(self, src, trg):
        """Given source and target tokenized tensors, returns the detokenized
        translation from the model, as well as the detokenized target.

        Args:
            src: tuple of (source token tensor, source lengths)
            trg: tuple of (target token tensor, target lengths)

        Returns:
            A tuple (output, targets) of detokenized hypothesis and reference strings.
        """
        src, src_len = src
        trg, trg_len = trg
        device = next(self.model.parameters()).device

        batch_size = src.shape[1]

        bos = [self.insert_target_start] * (batch_size * self.beam_size)
        bos = torch.tensor(bos, dtype=torch.int64, device=device).view(1, -1)

        if self.beam_size == 1:
            generator = self.generator.greedy_search
        else:
            generator = self.generator.beam_search

        with torch.no_grad():
            context = self.model.encode(src, src_len)
            context = [context, src_len, None]
            preds, lengths, counter = generator(batch_size, bos, context)

        preds = preds.cpu()
        targets = self.get_detokenized_target(trg, batch_size)

        output = []
        with MosesDetokenizer(self.trg_lang) as detokenizer:
            for pred in preds:
                pred = pred.tolist()
                detok = self.tokenizer.detokenize(pred)
                detok = detokenizer(detok.split())
                output.append(detok)

        return output, targets
Example #22
    def stopwords_e_pontuacao(self, instancia):
        ## tokenize on whitespace
        instancia = instancia.split()
        ### remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        instancia = [w.translate(table) for w in instancia]
        ### keep only alphabetic tokens
        instancia = [word for word in instancia if word.isalpha()]
        ## filter out stopwords
        stopwords = nltk.corpus.stopwords.words('portuguese') + [
            'aqui', 'a', 'rs', 'é', '/', 'fdp', '%', 'pfvr', 'cadê', 'né', 'q',
            'pq', '#', '@', 'mt', 'youtube', 'hj', 'dnv', 'mto', 'vc', 'eh',
            'r$', 'rt', 'via', 'vía'
        ]
        stopwords.remove("não")
        instancia = [w for w in instancia if w not in stopwords]
        ## detokenize (needed to produce a plain string for building a TextBlob object)
        with MosesDetokenizer('pt') as detokenize:
            instancia = detokenize(instancia)
        return instancia
Example #23
def detokenize(wordsOrSentences, joinSentences=True, logger=None, verbose=True):
    global multiReplacerSingleton
    wordsOrSentences = copy.deepcopy(wordsOrSentences)
    words = wordsOrSentences
    if multiReplacerSingleton is None:
        repls = \
        {
            # " ,": ",",
            # " .": ".",
            # " ?": "?",
            # " !": "!",
            # " )": ")",
            # "( ": "(",
            # " :": ":",
            # " '": "'",
            " n't": "n't",
        }
        multiReplacerSingleton = MultiReplacer(repls)
    with MosesDetokenizer('en') as detokenizeSingleton:
        def __detokenizeWords(words):
            text = detokenizeSingleton(words)
            text = multiReplacerSingleton.replace(text)
            return text
        if words is None or len(words) == 0:
            return ""
        if isinstance(words[0], list):
            sentences = words
            for i in range(len(sentences)):
                words = sentences[i]
                text = __detokenizeWords(words)
                sentences[i] = text
            if joinSentences:
                return "\n".join(sentences)
            else:
                return sentences
        elif isinstance(words[0], str):
            return __detokenizeWords(words)
        else:
            logError("words[0] must be either a list (so words are sentences) or a str", logger, verbose=verbose)
            return None
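Hypothetical calls to the detokenize() helper above, just to show the two accepted input shapes (a single list of word tokens, or a list of tokenized sentences); it assumes mosestokenizer and the project's MultiReplacer/logError utilities are importable:

print(detokenize(["I", "do", "n't", "know", "."]))            # one tokenized sentence
print(detokenize([["First", "sentence", "."],
                  ["Second", "one", "?"]]))                   # sentences joined with newlines by default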
Example #24
def get_embs(candidates, normalize=False):
    """Returns the sequence embedding for each candidate."""

    bc = BertClient()
    detokenize = MosesDetokenizer('en')

    detoked_cands = []
    for i, cand in enumerate(candidates):
        detoked_cands.append(detokenize(cand))
        if len(detokenize(cand)) == 0:
            print(i)
            print(cand)
            print(detokenize(cand))

    embs = bc.encode(detoked_cands)
    if normalize:
        embs = [e / np.linalg.norm(e) for e in embs]

    # This line is necessary depending on how the BERT server is setup.
    # embs = [np.mean(emb, 0) for emb in embs]

    return embs
Example #25
def random_inflect(source: str,
                   inflection_counts: Dict[str, int] = None) -> str:
    have_inflections = {'NOUN', 'VERB', 'ADJ'}
    tokenized = MosesTokenizer(lang='en').tokenize(
        source)  # Tokenize the sentence
    upper = False
    if tokenized[0][0].isupper():
        upper = True
        tokenized[0] = tokenized[0].lower()

    pos_tagged = nltk.pos_tag(tokenized,
                              tagset='universal')  # POS tag words in sentence

    for i, word in enumerate(tokenized):
        lemmas = lemminflect.getAllLemmas(word)
        # Only operate on content words (nouns/verbs/adjectives)
        if lemmas and pos_tagged[i][1] in have_inflections and pos_tagged[i][
                1] in lemmas:
            lemma = lemmas[pos_tagged[i][1]][0]
            inflections = (i, [(tag, infl)
                               for tag, tup in lemminflect.getAllInflections(
                                   lemma, upos=pos_tagged[i][1]).items()
                               for infl in tup])
            if inflections[1]:
                # Use inflection distribution for weighted random sampling if specified
                # Otherwise unweighted
                if inflection_counts:
                    counts = [
                        inflection_counts[tag] for tag, infl in inflections[1]
                    ]
                    inflection = random.choices(inflections[1],
                                                weights=counts)[0][1]
                else:
                    inflection = random.choices(inflections[1])[0][1]
                tokenized[i] = inflection
    if upper:
        tokenized[0] = tokenized[0].title()
    return MosesDetokenizer(lang='en').detokenize(tokenized)
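A hypothetical call to random_inflect() above (it assumes lemminflect and NLTK's POS tagger data are available); since inflections are sampled at random, the output varies between runs:

import random

random.seed(0)
print(random_inflect("The dogs barked loudly at the mailman."))  # a randomly re-inflected variant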
Example #26
import copy
import logging
from typing import Sequence, Union

import torch
from allennlp.common.checks import ConfigurationError
from allennlp.common.params import Params
from allennlp.modules.seq2seq_encoders.seq2seq_encoder import Seq2SeqEncoder
from allennlp.nn.util import device_mapping, masked_softmax
from mosestokenizer import MosesDetokenizer
from torch.autograd import Variable
from torch.nn import Dropout, Linear, Parameter, init

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

SOS_TOK, EOS_TOK = "<SOS>", "<EOS>"

# Note: using the full 'detokenize()' method is not recommended, since it does
# a poor job of adding correct whitespace. Use unescape_xml() only.
_MOSES_DETOKENIZER = MosesDetokenizer()


def copy_iter(elems):
    """Simple iterator yielding copies of elements."""
    for elem in elems:
        yield copy.deepcopy(elem)


def wrap_singleton_string(item: Union[Sequence, str]):
    """ Wrap a single string as a list. """
    if isinstance(item, str):
        # Can't check if iterable, because a string is an iterable of
        # characters, which is not what we want.
        return [item]
    return item
Example #27
def main(opt):

    bc = BertClient()
    detokenize = MosesDetokenizer('en')

    all_results = {}
    for json_file in glob.glob(os.path.join(opt.dir, '*.json')):
        with open(json_file, 'r') as f:
            try:
                experiment = json.load(f)
                print('Processing ' + json_file)
            except:
                print('Error processing ' + json_file)
                print('Skipping it.')
                continue  # without this, `experiment` would be undefined below

            exp_name = os.path.basename(json_file).replace('.json', '')

            eval_results = []

            for example in experiment['results']:
                candidates = example['pred']

                ex_results = {}
                ex_results['dist_from_mean_emb'] = eval_emb_stats(
                    candidates, bc, detokenize)
                ex_results['num_distinct_1grams'] = eval_distinct_k(
                    candidates, 1)
                ex_results['num_distinct_2grams'] = eval_distinct_k(
                    candidates, 2)
                ex_results['entropy_2grams'] = eval_entropy(candidates, 2)
                ex_results['entropy_4grams'] = eval_entropy(candidates, 4)
                min_edit, mean_edit, max_edit = eval_edit_distance(candidates)
                ex_results['min_edit_distance'] = min_edit
                ex_results['mean_edit_distance'] = mean_edit
                ex_results['max_edit_distance'] = max_edit
                eval_results.append(ex_results)

        all_results[exp_name] = {
            'ex_results': eval_results,
            'perplexity': experiment['ppl'],
            'score': experiment['score']
        }

    per_experiment_keys = ['perplexity', 'score']
    per_example_keys = list(all_results[exp_name]['ex_results'][0].keys())

    outfile = os.path.join(opt.dir, 'results.csv')
    with open(outfile, 'w') as csv_file:
        fieldnames = ['exp'] + per_experiment_keys + per_example_keys
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        for exp_name, results in all_results.items():
            csv_line = {'exp': exp_name}

            for key in per_experiment_keys:
                csv_line[key] = results[key]
            for key in per_example_keys:
                csv_line[key] = np.mean([
                    r[key] for r in results['ex_results']
                    if not np.isnan(r[key])
                ])

            writer.writerow(csv_line)
    print('Evaluation results written to %s' % outfile)
Example #28
    def __init__(self, system_id, apply_disc=False):
        super().__init__(system_id, apply_disc=apply_disc)

        self.keyword_model = os.environ.get(
            "CWC_KEYWORD_MODEL_" + system_id.upper(), self.model_folder +
            "/ROC_title_keyword_e500_h1000_edr0.4_hdr0.1_511_lr10.pt")
        self.keyword_vocab = os.environ.get(
            "CWC_KEYWORD_VOCAB_" + system_id.upper(), self.model_folder +
            "/ROC_title_keyword_e500_h1000_edr0.4_hdr0.1_511_lr10.pkl")
        self.story_model = os.environ.get(
            "CWC_STORY_MODEL_" + system_id.upper(), self.model_folder +
            "/ROC_title_key_story_e1000_h1500_edr0.2_hdr0.1_511_lr10.pt")
        self.story_vocab = os.environ.get(
            "CWC_STORY_VOCAB_" + system_id.upper(), self.model_folder +
            "/ROC_title_key_story_e1000_h1500_edr0.2_hdr0.1_511_lr10.pkl")
        self.scorers_config = os.environ.get(
            "CWC_SCORERS_CONFIG_" + system_id.upper(),
            self.model_folder + "/scorer_weights_abl.tsv")
        self.gold_titles = os.environ.get(
            "CWC_GOLD_TITLES_" + system_id.upper(), self.data_folder +
            "/ROCStories_all_merge_tokenize.titlesepkey.all")

        torch.manual_seed(self.torch_seed)

        # Load models and vocab dictionaries, init stopping symbols for generation
        self.kw_model = load_model(self.keyword_model, self.use_cuda)
        self.st_model = load_model(self.story_model, self.use_cuda)
        self.kw_dict = load_pickle(self.keyword_vocab)
        self.st_dict = load_pickle(self.story_vocab)
        self.kw_vocab_size = len(self.kw_dict)
        self.st_vocab_size = len(self.st_dict)
        self.st_eos_id = self.st_dict.word2idx[self.story_end]
        self.st_unk_id = self.st_dict.word2idx[self.story_unk]
        #self.kw_eos_id = self.kw_dict.word2idx[self.story_end] this is clearly wrong but seems to not be used ever
        self.kw_eot_id = self.kw_dict.word2idx[self.title_end]
        self.kw_end_id = self.kw_dict.word2idx[self.kw_end]
        self.kw_sep_id = self.kw_dict.word2idx[self.kw_sep]
        self.st_sep_id = self.st_dict.word2idx[self.story_sep]
        # self.special_chars = [self.kw_end, self.story_end, self.kw_sep, self.story_sep, self.title_end]
        self.title2storyline = read_gold_storylines(self.gold_titles,
                                                    self.title_end)
        self.special_chars = SPECIAL_CHARACTERS
        self.nlp = init_nlp_model()
        self.detokenizer = MosesDetokenizer('en')

        if self.apply_disc:
            print("%s: Using BeamRerankDecoder" % (self.system_id))
            scorers, coefs = read_scorers(self.scorers_config, self.use_cuda)
            self.decoder = BeamRerankDecoder(
                self.st_model,
                scorers,
                coefs,
                beam_size=self.beam_size,
                sep=self.st_sep_id,
                temperature=None,
                terms=[self.st_eos_id],
                forbidden=[self.st_unk_id, self.st_eos_id],
                use_cuda=self.use_cuda)
        else:
            print("%s: Using BeamSearchDecoder" % (self.system_id))
            self.decoder = BeamSearchDecoder(self.st_model,
                                             self.beam_size,
                                             self.st_eos_id,
                                             verbosity=False,
                                             dictionary=self.st_dict,
                                             sep=self.st_sep_id)
Example #29
    parser = argparse.ArgumentParser()

    parser.add_argument('split_directory', type=str)

    args = parser.parse_args()

    file_paths = glob.glob(os.path.join(args.split_directory, '*.test.csv'))
    assert len(file_paths) == 1

    test_filepath = file_paths[0]

    print(f'Read csv file from {test_filepath}')
    test_df = pd.read_csv(test_filepath, encoding='utf-8')

    en_tokenizer = MosesTokenizer()
    en_detokenizer = MosesDetokenizer()

    test_df['en'] = test_df['en'].apply(
        lambda x: en_detokenizer(en_tokenizer(x)))
    test_df['th'] = test_df['th'].apply(
        lambda x: ' '.join(th_word_space_tokenize(x))).apply(th_detokenize)

    test_df[['en']].to_csv(os.path.join(args.split_directory, 'test.detok.en'),
                           encoding='utf-8',
                           sep="\t",
                           index=False,
                           header=False,
                           escapechar="",
                           quotechar="",
                           quoting=csv.QUOTE_NONE)
    test_df[['th']].to_csv(os.path.join(args.split_directory, 'test.detok.th'),
Example #30
    def __init__(self, dependency, nlp):
        self.detokenizer = MosesDetokenizer('en')
        self.dependency = dependency
        self.nlp = nlp