Example #1
    def correct_sentences(self, blast_filename, en_filename, pt_filename):
        blast_reader = BlastReader(blast_filename)
        en, pt = load_embeddings(en_filename, pt_filename)

        self.progress_bar.grid(row=4, column=0, columnspan=3, padx=(20, 0))
        self.progress_bar['maximum'] = len(blast_reader.sys_lines)
        self.progress_var.set(0)
        self.cancel_button.config(state=tk.ACTIVE)

        file_content = ''
        for i, sent in enumerate(blast_reader.sys_lines):
            self.progress_var.set(i)

            if self.stop:
                break

            errors = blast_reader.get_error_messages(i)
            errors = [e for e in errors if e[-1] in self.app.errors]
            if errors:
                for error in errors:
                    if self.stop:
                        break

                    words_to_ignore = list()
                    if error[0][0] >= 0:
                        # Collect the flagged words first, then delete from the
                        # highest index down so earlier deletions do not shift
                        # the positions still to be removed.
                        for index in error[1]:
                            words_to_ignore.append(sent[index])
                        for index in sorted(error[1], reverse=True):
                            del sent[index]
                        suggestions = closest_words(
                            blast_reader.src_lines[i][error[0][0]], en, pt,
                            words_to_ignore)
                        sent.insert(error[1][0], suggestions[0][0])
            file_content += ' '.join(sent)
            file_content += '\n'
        self.progress_bar.stop()
        self.progress_bar.grid_forget()
        self.done_button.config(state=tk.ACTIVE)
        self.blast_path_button.config(state=tk.ACTIVE)
        self.en_path_button.config(state=tk.ACTIVE)
        self.pt_path_button.config(state=tk.ACTIVE)
        self.can_close = True

        if not self.stop:
            save_filename = os.path.splitext(
                blast_filename)[0] + '-corrected.txt'
            with open(save_filename, 'w') as _file:
                _file.write(file_content)

            msgb.showinfo(_('Saved'), _('File saved as: ') + save_filename)
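A note on the index handling above: the positions listed in error[1] refer to the original sentence, so deleting them in ascending order would shift the indices still to be removed. A minimal standalone sketch of the delete-then-insert pattern, with a hypothetical sentence, indices, and suggestion:

# Standalone sketch (hypothetical data): remove flagged positions from a
# tokenised sentence without invalidating the remaining indices, then insert
# a replacement word at the first flagged position.
sent = ['o', 'gato', 'sentou', 'no', 'tapete']
flagged = [1, 2]                              # positions reported by the annotation
words_to_ignore = [sent[i] for i in flagged]

for i in sorted(flagged, reverse=True):       # delete from the end backwards
    del sent[i]

sent.insert(flagged[0], 'sentou-se')          # hypothetical suggestion
print(words_to_ignore)                        # ['gato', 'sentou']
print(sent)                                   # ['o', 'sentou-se', 'no', 'tapete']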
Example #2
    def get_statistics(self):
        '''Read BLAST file and get statistics for each error type'''
        try:
            blast_reader = BlastReader(self.blast_file_path.name)
        except FileNotFoundError:
            tk.messagebox.showerror(_('File not found'),
                                    _('BLAST file not found.'))
        except AttributeError:
            pass
        else:
            total_errors = len(
                blast_reader.get_filtered_errors(self.error_types.keys()))

            for _type in self.error_types:
                error_occurences = len(
                    blast_reader.get_filtered_errors([_type]))
                error_percentage = (error_occurences / total_errors) * 100
                self.error_stats_values[_type] = (
                    error_occurences, '{0:.2f}'.format(error_percentage))
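get_statistics divides each per-type count by the total over all selected types, so a BLAST file containing none of those error types would raise ZeroDivisionError. A small standalone sketch of the same percentage step with a guard (hypothetical counts and error-type names):

# Hypothetical per-type counts; in the method above they come from
# blast_reader.get_filtered_errors([...]).
error_counts = {'lex-notTrWord': 12, 'lex-incTrWord': 7, 'reord': 0}
total_errors = sum(error_counts.values())

error_stats_values = {}
for error_type, occurrences in error_counts.items():
    # Guard against an empty result set before computing the share.
    percentage = (occurrences / total_errors) * 100 if total_errors else 0.0
    error_stats_values[error_type] = (occurrences, '{0:.2f}'.format(percentage))

print(error_stats_values)
# {'lex-notTrWord': (12, '63.16'), 'lex-incTrWord': (7, '36.84'), 'reord': (0, '0.00')}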
Example #3
    def load_blast(self):
        '''
        Loads the BLAST file.
        Filters the errors by the error type selected by the user.
        '''
        blast_path = self.blast_path_text.get('1.0', tk.END).strip()

        if not blast_path:
            tk.messagebox.showerror(_('Select files'),
                                    _('It is necessary to select all files.'))
        else:
            try:
                blast_reader = BlastReader(blast_path)
            except FileNotFoundError:
                tk.messagebox.showerror(_('File not found'),
                                        _('BLAST file not found.'))
            else:
                errors = blast_reader.get_filtered_errors(
                    [self.error_type.get()])

                self.filename = os.path.splitext(os.path.split(
                    blast_path)[1])[0] + '_APE_' + self.error_type.get()

                # Progress bar to track the APE process
                progress_var = tk.DoubleVar()
                self.progress_bar = ttk.Progressbar(self.blast_window,
                                                    variable=progress_var,
                                                    maximum=len(errors))
                self.cancel_ape_button.config(state=tk.NORMAL)
                self.cancel_ape_button.grid(row=5,
                                            column=0,
                                            columnspan=3,
                                            pady=10)
                self.progress_bar.grid(row=4, column=0, columnspan=3, pady=10)

                # Post Editing Thread
                self.running_threads.append(
                    PostEditor(self, blast_reader, progress_var))
                self.blast_window.after(100, self.ape_queue_callback)
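load_blast hands the work to a PostEditor thread and schedules self.ape_queue_callback with after(100, ...). That callback is not shown in this snippet; the sketch below is only an assumption of the usual Tkinter polling pattern it would follow (the queue attribute and the message handler are hypothetical names, and the function is meant to live on the same window class):

import queue

def ape_queue_callback(self):
    """Drain messages posted by the PostEditor thread (sketch, assumed shape).

    Tkinter widgets should only be touched from the main thread, so the
    worker pushes progress/result messages onto a queue and the main loop
    polls it every 100 ms.
    """
    try:
        while True:
            message = self.ape_queue.get_nowait()   # hypothetical queue attribute
            self.handle_ape_message(message)        # hypothetical handler
    except queue.Empty:
        pass

    # Re-schedule as long as a worker thread is still running.
    if any(thread.is_alive() for thread in self.running_threads):
        self.blast_window.after(100, self.ape_queue_callback)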
Example #4
def calcula_medidas():
    blast_reader_test = BlastReader(BLAST_PATH_TEST)
    blast_reader_ref = BlastReader(BLAST_PATH_REF)

    verdadeiro_positivo = 0
    falso_positivo = 0
    for (sent_idx_test, error) in blast_reader_test.error_lines:
        if SENT_LIMIT is None or sent_idx_test < SENT_LIMIT:
            error_type = error[-1]
            sys_idxs = error[1]

            fp = 1
            for (sent_idx_ref,
                 error2) in blast_reader_ref.get_filtered_errors([error_type]):
                if sent_idx_ref == sent_idx_test:
                    if set(sys_idxs) & set(error2[1]):
                        verdadeiro_positivo += 1
                        fp = 0
                        break
            falso_positivo += fp

    falso_negativo = 0
    for (idx, error) in blast_reader_ref.error_lines:
        if SENT_LIMIT is None or idx < SENT_LIMIT:
            error_type = error[-1]
            sys_idxs = error[1]

            fn = 1
            for (idx2, error2) in blast_reader_test.get_filtered_errors(
                    [error_type]):
                if idx2 == idx:
                    if set(sys_idxs) & set(error2[1]):
                        fn = 0
                        break
            falso_negativo += fn

    precisao = verdadeiro_positivo / (verdadeiro_positivo + falso_positivo)
    cobertura = verdadeiro_positivo / (verdadeiro_positivo + falso_negativo)

    print('Precisao: {:.2f}%'.format(precisao * 100))
    print('Cobertura: {:.2f}%'.format(cobertura * 100))
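The matching rule above counts a test annotation as a true positive when the reference has an annotation of the same type, on the same sentence, that shares at least one system-token index. A tiny standalone check of that overlap test (hypothetical annotation tuples; only the system-token indices in field 1 and the error type in the last field matter here):

test_error = ([3], [5, 6], [4], 'lex-notTrWord')
ref_error = ([3], [6, 7], [4], 'lex-notTrWord')

same_type = test_error[-1] == ref_error[-1]
overlapping = bool(set(test_error[1]) & set(ref_error[1]))
print(same_type and overlapping)   # True -> counted as verdadeiro_positivo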
Example #5
def calcula_matriz_confusao():
    blast_reader_test = BlastReader(BLAST_PATH_TEST)
    blast_reader_ref = BlastReader(BLAST_PATH_REF)

    indices_matriz = {x: TIPO_ERROS.index(x) for x in TIPO_ERROS}
    indices_matriz['correto'] = len(indices_matriz)

    matriz = numpy.zeros((len(TIPO_ERROS) + 1, len(TIPO_ERROS) + 1))

    for (sent_idx_test, sent) in enumerate(blast_reader_test.sys_lines):
        if SENT_LIMIT is None or sent_idx_test < SENT_LIMIT:
            sent_class = blast_reader_test.get_error_messages(sent_idx_test)
            sent_class_ref = blast_reader_ref.get_error_messages(sent_idx_test)

            for (palavra_idx, palavra) in enumerate(sent):
                # Classification (system annotation)
                palavra_erros = [x for x in sent_class if palavra_idx in x[1]]
                if palavra_erros:
                    idx_linha = indices_matriz[palavra_erros[0][-1]]
                else:
                    idx_linha = indices_matriz['correto']

                # Reference
                palavra_ref = [
                    x for x in sent_class_ref
                    if palavra_idx in x[1] and x[-1] in TIPO_ERROS
                ]
                if palavra_ref:
                    idx_coluna = indices_matriz[palavra_ref[0][-1]]
                else:
                    idx_coluna = indices_matriz['correto']

                matriz[idx_linha, idx_coluna] += 1
    print(indices_matriz)
    matprint(matriz)
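matprint is called here but not defined in this snippet; below is a minimal column-aligned printer that would fit the call. It is an assumption, not the project's own helper:

import numpy

def matprint(mat, fmt='g'):
    """Print a 2-D numpy array with aligned columns (sketch, assumed signature)."""
    col_widths = [max(len('{:{}}'.format(value, fmt)) for value in column)
                  for column in mat.T]
    for row in mat:
        print('  '.join('{:>{width}{fmt}}'.format(value, width=width, fmt=fmt)
                        for value, width in zip(row, col_widths)))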
Example #6
def main():
    """Main function
    """
    blast_reader = BlastReader(BLAST_PATH)
    src_lines = list()
    sys_lines = list()
    target = list()

    # Files for GIZA
    src_file = open('/tmp/src.txt', 'w')
    sys_file = open('/tmp/sys.txt', 'w')

    # Correct sentences
    for i in blast_reader.get_correct_indices():
        src_lines.append(blast_reader.src_lines[i])
        sys_lines.append(blast_reader.sys_lines[i])
        target.append('correct')

        # Write files for GIZA
        src_file.write(' '.join(blast_reader.src_lines[i]))
        src_file.write('\n')
        sys_file.write(' '.join(blast_reader.sys_lines[i]))
        sys_file.write('\n')

    # Error lines
    errors = blast_reader.get_filtered_errors(ERRORS)
    # errors = blast_reader.error_lines
    for (line, error) in errors:
        src_lines.append(blast_reader.src_lines[line])
        sys_lines.append(blast_reader.sys_lines[line])
        target.append(error)

        src_file.write(' '.join(blast_reader.src_lines[line]))
        src_file.write('\n')
        sys_file.write(' '.join(blast_reader.sys_lines[line]))
        sys_file.write('\n')
    src_file.close()
    sys_file.close()

    # Tag sentences
    print('Tagging sentences', file=sys.stderr)
    tagged_lines = tag_sentences(src_lines, sys_lines)

    # Align sentences
    print('Aligning sentences', file=sys.stderr)
    application_path = str(os.path.abspath(os.path.curdir))
    proc = subprocess.Popen([application_path + '/src/aligner/align_sentences.sh',
                             '--srcpath', '/tmp/src.txt',
                             '--syspath', '/tmp/sys.txt'],
                            stdout=subprocess.PIPE)
    out = proc.communicate()
    num_sents = int(out[0])
    giza_reader = GIZAReader('/tmp/giza.output')
    alignments = giza_reader.aligned_lines[:num_sents]

    # Extract features
    print('Extracting features', file=sys.stderr)
    training_instances = list()
    ignored_instances = 0
    for (i, sent) in progressbar.progressbar(enumerate(tagged_lines)):
        features = extract_features(
            sent, alignments[i]['alignment'], TW_SZ, target[i])
        if features:
            training_instances.append(features)
        else:
            ignored_instances += 1
    print('Finalizado!', file=sys.stderr)
    print('Instancias ignoradas: {}'.format(ignored_instances), file=sys.stderr)

    print('Iniciando treinamento', file=sys.stderr)
    data = format_features(training_instances)
    test_correct_error(data)
    def train(self, blast_filename, model_type, error_types=None):
        blast_reader = BlastReader(blast_filename)
        src_lines = list()
        sys_lines = list()
        target = list()

        # Files for GIZA
        src_fd, src_filename = tempfile.mkstemp(text=True)
        sys_fd, sys_filename = tempfile.mkstemp(text=True)
        src_file = open(src_filename, 'w')
        sys_file = open(sys_filename, 'w')

        # Correct sentences
        for i in blast_reader.get_correct_indices():
            if self.stop:
                break
            src_lines.append(blast_reader.src_lines[i])
            sys_lines.append(blast_reader.sys_lines[i])
            target.append('correct')

            # Write files for GIZA
            src_file.write(' '.join(blast_reader.src_lines[i]))
            src_file.write('\n')
            sys_file.write(' '.join(blast_reader.sys_lines[i]))
            sys_file.write('\n')

        # Error sentences
        errors = blast_reader.get_filtered_errors(
            error_types) if error_types else blast_reader.error_lines
        for (line, error) in errors:
            if self.stop:
                break
            src_lines.append(blast_reader.src_lines[line])
            sys_lines.append(blast_reader.sys_lines[line])
            target.append(error)

            src_file.write(' '.join(blast_reader.src_lines[line]))
            src_file.write('\n')
            sys_file.write(' '.join(blast_reader.sys_lines[line]))
            sys_file.write('\n')
        src_file.close()
        sys_file.close()
        os.close(src_fd)
        os.close(sys_fd)

        # Tag sentences
        if not self.stop:
            tagged_lines = self.tag_sentences(src_lines, sys_lines)

        # Align sentences
        if not self.stop:
            alignments = self.align_sentences(src_filename, sys_filename)

        # Extract features (guarded: tagged_lines/alignments are undefined if
        # the run was stopped before tagging and alignment finished)
        training_instances = list()
        if not self.stop:
            for (i, sent) in enumerate(tagged_lines):
                if self.stop:
                    break
                features = self.extract_features(sent,
                                                 alignments[i]['alignment'],
                                                 self.tw_size, target[i])
                if features:
                    training_instances.append(features)

        if not self.stop:
            data = self.format_features(training_instances)

        if not self.stop:
            self.features = list(data)
            self.model_step1 = self.train_model(data.copy(),
                                                model_type,
                                                step1=True)
            self.model_step2 = self.train_model(data.copy(),
                                                model_type,
                                                step1=False)
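train_model is not part of this snippet, but the step1/step2 pair suggests the usual two-step setup: first a binary "correct vs. error" model, then an error-type classifier trained only on the error rows. A hedged sketch of that split, assuming data behaves like a pandas DataFrame whose last column is the target label; this is an assumption about train_model, not the project code:

import pandas as pd
from sklearn.tree import DecisionTreeClassifier

def train_two_steps(data: pd.DataFrame, model_class=DecisionTreeClassifier):
    """Sketch of a two-step training split (assumed, not the project's train_model)."""
    features = data.iloc[:, :-1]
    labels = data.iloc[:, -1]

    # Step 1: binary decision, does the instance carry an error at all?
    step1 = model_class().fit(features, (labels != 'correct').astype(int))

    # Step 2: error-type classification, trained only on the error instances.
    error_mask = labels != 'correct'
    step2 = model_class().fit(features[error_mask], labels[error_mask])
    return step1, step2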
def main():
    """Main function
    """
    blast_reader = BlastReader(BLAST_PATH)
    src_lines = list()
    sys_lines = list()
    target = list()

    # Files for GIZA
    src_file = open('/tmp/src.txt', 'w')
    sys_file = open('/tmp/sys.txt', 'w')

    # Correct sentences
    for i in blast_reader.get_correct_indices():
        src_lines.append(blast_reader.src_lines[i])
        sys_lines.append(blast_reader.sys_lines[i])
        target.append('correct')

        # Write files for GIZA
        src_file.write(' '.join(blast_reader.src_lines[i]))
        src_file.write('\n')
        sys_file.write(' '.join(blast_reader.sys_lines[i]))
        sys_file.write('\n')

    # Error lines
    errors = blast_reader.get_filtered_errors(ERRORS)
    # errors = blast_reader.error_lines
    for (line, error) in errors:
        src_lines.append(blast_reader.src_lines[line])
        sys_lines.append(blast_reader.sys_lines[line])
        target.append(error)

        src_file.write(' '.join(blast_reader.src_lines[line]))
        src_file.write('\n')
        sys_file.write(' '.join(blast_reader.sys_lines[line]))
        sys_file.write('\n')
    src_file.close()
    sys_file.close()

    # Tag sentences
    print('Tagging sentences')
    tagged_lines = tag_sentences(src_lines, sys_lines)

    # Align sentences
    print('Aligning sentences')
    application_path = str(os.path.abspath(os.path.curdir))
    proc = subprocess.Popen([
        application_path + '/src/aligner/align_sentences.sh', '--srcpath',
        '/tmp/src.txt', '--syspath', '/tmp/sys.txt'
    ],
                            stdout=subprocess.PIPE)
    out = proc.communicate()
    num_sents = int(out[0])
    giza_reader = GIZAReader('/tmp/giza.output')
    alignments = giza_reader.aligned_lines[:num_sents]

    # Extract features
    print('Extracting features')
    training_instances = list()
    ignored_instances = 0
    for (i, sent) in progressbar.progressbar(enumerate(tagged_lines)):
        features = extract_features(sent, alignments[i]['alignment'], TW_SZ,
                                    target[i])
        if features:
            training_instances.append(features)
        else:
            ignored_instances += 1
    print('Finalizado!')
    print('Instancias ignoradas: {}'.format(ignored_instances))

    print('Iniciando treinamento')
    data = format_features(training_instances)

    print('Arvore de Decisao - GINI')
    test_two_steps(data, DecisionTreeClassifier())
    print('------------------------')
    print('Arvore de Decisao - Entropy')
    test_two_steps(data, DecisionTreeClassifier(criterion='entropy'))
    print('------------------------')
    print('SVM')
    test_two_steps(data, LinearSVC())
    print('------------------------')
    print('SVM - Crammer-Singer')
    test_two_steps(data, LinearSVC(multi_class='crammer_singer'))
    print('------------------------')
    print('Perceptron')
    test_two_steps(data, Perceptron(n_jobs=-1))
    print('------------------------')
    print('Random Forest - GINI')
    test_two_steps(data, RandomForestClassifier(n_estimators=10))
    print('------------------------')
    print('Random Forest - Entropy')
    test_two_steps(
        data, RandomForestClassifier(n_estimators=10, criterion='entropy'))
    print('------------------------')
    print('Naive Bayes')
    test_two_steps(data, BernoulliNB())
    print('------------------------')
#!/usr/bin/env python3
from readers.read_blast import BlastReader

NMT_PATH = '/home/marciolima/Documentos/Lalic/post-editing/src/error_identification/error-ident-NMT.txt'
PBSMT_PATH = '/home/marciolima/Documentos/Lalic/post-editing/src/error_identification/error-ident-PBSMT.txt'

nmt_br = BlastReader(NMT_PATH)

with open('error-ident-blast.txt', 'a') as _file:
    # Just copy PBSMT file
    with open(PBSMT_PATH) as pbsmt_file:
        for line in pbsmt_file:
            _file.write(line)

    # Copy the first 300 sentences from the NMT file
    for i in range(300):
        _file.write(' '.join(nmt_br.src_lines[i]))
        _file.write('\n')
        _file.write(' '.join(nmt_br.ref_lines[i]))
        _file.write('\n')
        _file.write(' '.join(nmt_br.sys_lines[i]))
        _file.write('\n\n')

        error_message = ''
        for error in nmt_br.get_error_messages(i):
            error_indices = [
                ','.join(str(idx) for idx in indices) for indices in error[:-1]
            ]
            error_indices.append(error[-1])
            error_message += '#'.join(error_indices)
            error_message += ' '
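Each annotation written above joins the index groups with ',' and the fields with '#', ending with the error type. A small sketch of reading one of those annotations back (hypothetical annotation string):

annotation = '3#5,6#4#lex-notTrWord'

*index_fields, error_type = annotation.split('#')
index_groups = [[int(i) for i in field.split(',')] if field else []
                for field in index_fields]

print(index_groups, error_type)   # [[3], [5, 6], [4]] lex-notTrWord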