Example #1
0
def wordUpdate():
    """Re-translate a sentence using user-supplied attention overrides,
    token corrections, and UNK replacements; respond with the beam tree.

    Expects a JSON body with keys: "sentence", "attentionOverrideMap",
    "correctionMap", "unk_map", "beam_size", "beam_length", "beam_coverage".
    """
    payload = request.get_json()
    source_text = payload["sentence"]
    attention_overrides = payload["attentionOverrideMap"]
    corrections = payload["correctionMap"]
    unk_replacements = payload["unk_map"]
    width = int(payload["beam_size"])
    length_penalty = float(payload["beam_length"])
    coverage_penalty = float(payload["beam_coverage"])

    # Only the beam hypotheses are needed here; the top translation and
    # attention matrix are discarded.
    _, _, hypotheses = seq2seq_model.translate(
        source_text,
        width,
        beam_length=length_penalty,
        beam_coverage=coverage_penalty,
        attention_override_map=attention_overrides,
        correction_map=corrections,
        unk_map=unk_replacements)

    return jsonify({"beam": translationsToTree(hypotheses)})
Example #2
0
def translate():
    """Translate a sentence (without applying BPE) and respond with the
    source, the joined translation, its attention matrix, and the beam tree.

    Expects a JSON body with keys: "sentence", "beam_size", "beam_length",
    "beam_coverage".
    """
    payload = request.get_json()
    source_text = payload["sentence"]
    width = int(payload["beam_size"])
    length_penalty = float(payload["beam_length"])
    coverage_penalty = float(payload["beam_coverage"])

    translation, attention, hypotheses = seq2seq_model.translate(
        source_text,
        width,
        beam_length=length_penalty,
        beam_coverage=coverage_penalty,
        apply_bpe=False)

    return jsonify({
        "sentence": source_text,
        "translation": " ".join(translation),
        "attention": attention,
        "beam": translationsToTree(hypotheses),
    })
Example #3
0
def retranslate(document_id):
    """Re-translate every non-corrected sentence of a document.

    Recomputes scores and attention for each sentence, records an HTML diff
    for sentences whose translation changed, persists the document, and
    responds with the number of changed sentences as JSON.
    """
    document = get_document(document_id)
    scorer = Scorer()
    extractor = DomainSpecificExtractor(
        source_file=document.filepath,
        train_source_file="myseq2seq/data/wmt14/train.tok.clean.bpe.32000.de",
        train_vocab_file="myseq2seq/train_vocab.pkl")
    keyphrases = extractor.extract_keyphrases()

    num_changes = 0
    for order_id, sentence in enumerate(document.sentences):
        # Never overwrite translations the user has manually corrected.
        if sentence.corrected:
            continue

        tokens, attention, hypotheses = seq2seq_model.translate(
            sentence.source)
        new_translation = " ".join(tokens)

        score = scorer.compute_scores(sentence.source, new_translation,
                                      attention, keyphrases)
        score["order_id"] = order_id

        if new_translation != sentence.translation:
            num_changes += 1
            # Diff old vs. new with BPE joiners ("@@ ") removed.
            # NOTE(review): the [:-4] slice presumably strips a trailing
            # end-of-sequence marker — confirm against the model's output.
            sentence.diff = html_diff(
                sentence.translation[:-4].replace("@@ ", ""),
                new_translation[:-4].replace("@@ ", ""))

        sentence.translation = new_translation
        sentence.beam = translationsToTree(hypotheses)
        sentence.score = score
        sentence.attention = attention

    save_document(document, document_id)
    return jsonify({"numChanges": num_changes})
Example #4
0
def documentUpload():
    """Handle a document upload: save the file, extract domain keyphrases,
    translate and score every sentence, and persist the document.

    Reads the uploaded file from ``request.files['file']`` and the display
    name from the ``document_name`` query argument. Redirects back to the
    upload URL when no file part / filename is present; otherwise responds
    with an empty JSON object.
    """
    if 'file' not in request.files:
        return redirect(request.url)
    file = request.files['file']
    # If the user does not select a file, the browser may submit an
    # empty part without a filename.
    if file.filename == '':
        return redirect(request.url)
    if file and allowed_file(file.filename):
        document_name = request.args.get("document_name")
        doc_id = uuid4()  # renamed from `id` to avoid shadowing the builtin
        filename = secure_filename(file.filename)
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(filepath)

        user = User.query.filter_by(username=get_jwt_identity()).first()
        dbDocument = DBDocument(id=doc_id, name=document_name, user=user)

        document = Document(str(doc_id), document_name, dict(), filepath)
        sentences = document.load_content(filename)

        # Rewrite the stored file with BPE markers ("@@ ") removed, one
        # sentence per line and no trailing newline.
        # BUG FIX: the original conditional expression spanned the whole
        # `sentence.replace(...) + "\n"` expression, so the LAST sentence
        # was written as "" and silently dropped from the file on disk.
        with open(filepath, "w") as f:
            for i, sentence in enumerate(sentences):
                f.write(sentence.replace("@@ ", "") +
                        ("\n" if i < len(sentences) - 1 else ""))

        extractor = DomainSpecificExtractor(
            source_file=filepath,
            train_source_file=
            "myseq2seq/data/wmt14/train.tok.clean.bpe.32000.de",
            train_vocab_file="myseq2seq/train_vocab.pkl")
        keyphrases = extractor.extract_keyphrases(n_results=30)

        scorer = Scorer()

        print("Translating {} sentences".format(len(sentences)))

        for i, source in enumerate(sentences):
            translation, attn, translations = seq2seq_model.translate(
                source, beam_size=3, beam_length=1, beam_coverage=1)

            print("Translated {} of {}".format(i + 1, len(sentences)))

            # Only the top-3 hypotheses are kept in the beam tree.
            beam = translationsToTree(translations[:3])

            score = scorer.compute_scores(source, " ".join(translation), attn,
                                          keyphrases)
            score["order_id"] = i
            sentence = Sentence(i, source, " ".join(translation), attn, beam,
                                score)

            document.sentences.append(sentence)

        print("Finished translation")

        keyphrases = [{
            "name": k,
            "occurrences": f,
            "active": False
        } for (k, f) in keyphrases]
        document.keyphrases = keyphrases
        db.session.add(dbDocument)
        db.session.commit()

        save_document(document, doc_id)

        return jsonify({})
    return jsonify({})